datasketch 1.6.4__tar.gz → 1.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {datasketch-1.6.4 → datasketch-1.6.5}/PKG-INFO +5 -5
  2. {datasketch-1.6.4 → datasketch-1.6.5}/README.rst +4 -4
  3. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lsh.py +55 -0
  4. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshforest.py +25 -0
  5. datasketch-1.6.5/datasketch/version.py +1 -0
  6. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/PKG-INFO +5 -5
  7. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lsh.py +111 -0
  8. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lshforest.py +12 -0
  9. datasketch-1.6.4/datasketch/version.py +0 -1
  10. {datasketch-1.6.4 → datasketch-1.6.5}/LICENSE +0 -0
  11. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/__init__.py +0 -0
  12. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/b_bit_minhash.py +0 -0
  13. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/__init__.py +0 -0
  14. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/__init__.py +0 -0
  15. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/lsh.py +0 -0
  16. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/experimental/aio/storage.py +0 -0
  17. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hashfunc.py +0 -0
  18. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hnsw.py +0 -0
  19. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hyperloglog.py +0 -0
  20. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/hyperloglog_const.py +0 -0
  21. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lean_minhash.py +0 -0
  22. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshensemble.py +0 -0
  23. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshensemble_partition.py +0 -0
  24. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/minhash.py +0 -0
  25. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/storage.py +0 -0
  26. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch/weighted_minhash.py +0 -0
  27. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/SOURCES.txt +0 -0
  28. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/dependency_links.txt +0 -0
  29. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/requires.txt +0 -0
  30. {datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/top_level.txt +0 -0
  31. {datasketch-1.6.4 → datasketch-1.6.5}/setup.cfg +0 -0
  32. {datasketch-1.6.4 → datasketch-1.6.5}/setup.py +0 -0
  33. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_hnsw.py +0 -0
  34. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_hyperloglog.py +0 -0
  35. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lean_minhash.py +0 -0
  36. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lsh_cassandra.py +0 -0
  37. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_lshensemble.py +0 -0
  38. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_minhash.py +0 -0
  39. {datasketch-1.6.4 → datasketch-1.6.5}/test/test_weighted_minhash.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datasketch
3
- Version: 1.6.4
3
+ Version: 1.6.5
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Home-page: https://ekzhu.github.io/datasketch
6
6
  Author: ekzhu
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
51
51
  datasketch: Big Data Looks Small
52
52
  ================================
53
53
 
54
- .. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
55
- :target: https://github.com/ekzhu/datasketch/actions
54
+ .. image:: https://static.pepy.tech/badge/datasketch/month
55
+ :target: https://pepy.tech/project/datasketch
56
56
 
57
- .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
58
- :target: https://doi.org/10.5281/zenodo.290602
57
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
58
+ :target: https://zenodo.org/doi/10.5281/zenodo.598238
59
59
 
60
60
  datasketch gives you probabilistic data structures that can process and
61
61
  search very large amount of data super fast, with little loss of
@@ -1,11 +1,11 @@
1
1
  datasketch: Big Data Looks Small
2
2
  ================================
3
3
 
4
- .. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
5
- :target: https://github.com/ekzhu/datasketch/actions
4
+ .. image:: https://static.pepy.tech/badge/datasketch/month
5
+ :target: https://pepy.tech/project/datasketch
6
6
 
7
- .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
8
- :target: https://doi.org/10.5281/zenodo.290602
7
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
8
+ :target: https://zenodo.org/doi/10.5281/zenodo.598238
9
9
 
10
10
  datasketch gives you probabilistic data structures that can process and
11
11
  search very large amount of data super fast, with little loss of
@@ -226,6 +226,29 @@ class MinHashLSH(object):
226
226
  """
227
227
  self._insert(key, minhash, check_duplication=check_duplication, buffer=False)
228
228
 
229
+ def merge(
230
+ self,
231
+ other: MinHashLSH,
232
+ check_overlap: bool = False
233
+ ):
234
+ """Merge the other MinHashLSH with this one, making this one the union
235
+ of both.
236
+
237
+ Note:
238
+ Only num_perm, the number of bands and the size of each band are checked when testing two MinHashLSH indexes for equivalence.
239
+ The other initialization parameters (threshold, weights, storage_config, prepickle and hash_func) are not checked.
240
+
241
+ Args:
242
+ other (MinHashLSH): The other MinHashLSH.
243
+ check_overlap (bool): If `True`, check for overlapping keys before merging and raise a ValueError if any are found.
244
+ (`default=False`)
245
+
246
+ Raises:
247
+ ValueError: If the two MinHashLSH have different initialization
248
+ parameters, or if `check_overlap` is `True` and there are overlapping keys.
249
+ """
250
+ self._merge(other, check_overlap=check_overlap, buffer=False)
251
+
229
252
  def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession:
230
253
  """
231
254
  Create a context manager for fast insertion into this index.
@@ -282,6 +305,38 @@ class MinHashLSH(object):
282
305
  for H, hashtable in zip(Hs, self.hashtables):
283
306
  hashtable.insert(H, key, buffer=buffer)
284
307
 
308
+ def __equivalent(self, other:MinHashLSH) -> bool:
309
+ """
310
+ Returns:
311
+ bool: True if the two objects are of the same type and have equal num_perm, number of bands and band size; False otherwise.
312
+ """
313
+ return (
314
+ type(self) is type(other) and
315
+ self.h == other.h and
316
+ self.b == other.b and
317
+ self.r == other.r
318
+ )
319
+
320
+ def _merge(
321
+ self,
322
+ other: MinHashLSH,
323
+ check_overlap: bool = False,
324
+ buffer: bool = False
325
+ ) -> MinHashLSH:
326
+ if self.__equivalent(other):
327
+ if check_overlap and set(self.keys).intersection(set(other.keys)):
328
+ raise ValueError("The keys are overlapping, duplicate key exists.")
329
+ for key in other.keys:
330
+ Hs = other.keys.get(key)
331
+ self.keys.insert(key, *Hs, buffer=buffer)
332
+ for H, hashtable in zip(Hs, self.hashtables):
333
+ hashtable.insert(H, key, buffer=buffer)
334
+ else:
335
+ if type(self) is not type(other):
336
+ raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.")
337
+ raise ValueError(
338
+ "Cannot merge MinHashLSH with different initialization parameters.")
339
+
285
340
  def query(self, minhash) -> List[Hashable]:
286
341
  """
287
342
  Giving the MinHash of the query set, retrieve
@@ -1,5 +1,6 @@
1
1
  from collections import defaultdict
2
2
  from typing import Hashable, List
3
+ import numpy as np
3
4
 
4
5
  from datasketch.minhash import MinHash
5
6
 
@@ -128,6 +129,30 @@ class MinHashLSHForest(object):
128
129
  r -= 1
129
130
  return list(results)
130
131
 
132
+ def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
133
+ """
134
+ Returns the hashvalues from the MinHash object that corresponds to the given key in the LSHForest,
135
+ if it exists (a KeyError is raised otherwise). This is useful when we want to reconstruct the original MinHash
136
+ object to manually check the Jaccard Similarity for the top-k results from a query.
137
+
138
+ Args:
139
+ key (Hashable): The key whose MinHash hashvalues we want to retrieve.
140
+
141
+ Returns:
142
+ np.ndarray: The uint64 hashvalues of the MinHash object corresponding to the given key.
143
+ """
144
+ byteslist = self.keys.get(key, None)
145
+ if byteslist is None:
146
+ raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
147
+ hashvalue_byte_size = len(byteslist[0])//8
148
+ hashvalues = np.empty(len(byteslist)*hashvalue_byte_size, dtype=np.uint64)
149
+ for index, item in enumerate(byteslist):
150
+ # unswap the bytes, as their representation is flipped during storage
151
+ hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
152
+ curr_index = index*hashvalue_byte_size
153
+ hashvalues[curr_index:curr_index+hashvalue_byte_size] = hv_segment
154
+ return hashvalues
155
+
131
156
  def _binary_search(self, n, func):
132
157
  """
133
158
  https://golang.org/src/sort/search.go?s=2247:2287#L49
@@ -0,0 +1 @@
1
+ __version__ = "1.6.5"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datasketch
3
- Version: 1.6.4
3
+ Version: 1.6.5
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Home-page: https://ekzhu.github.io/datasketch
6
6
  Author: ekzhu
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
51
51
  datasketch: Big Data Looks Small
52
52
  ================================
53
53
 
54
- .. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
55
- :target: https://github.com/ekzhu/datasketch/actions
54
+ .. image:: https://static.pepy.tech/badge/datasketch/month
55
+ :target: https://pepy.tech/project/datasketch
56
56
 
57
- .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
58
- :target: https://doi.org/10.5281/zenodo.290602
57
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
58
+ :target: https://zenodo.org/doi/10.5281/zenodo.598238
59
59
 
60
60
  datasketch gives you probabilistic data structures that can process and
61
61
  search very large amount of data super fast, with little loss of
@@ -240,6 +240,117 @@ class TestMinHashLSH(unittest.TestCase):
240
240
  for table in counts:
241
241
  self.assertEqual(sum(table.values()), 2)
242
242
 
243
+ def test_merge(self):
244
+ lsh1 = MinHashLSH(threshold=0.5, num_perm=16)
245
+ m1 = MinHash(16)
246
+ m1.update("a".encode("utf-8"))
247
+ m2 = MinHash(16)
248
+ m2.update("b".encode("utf-8"))
249
+ lsh1.insert("a",m1)
250
+ lsh1.insert("b",m2)
251
+
252
+ lsh2 = MinHashLSH(threshold=0.5, num_perm=16)
253
+ m3 = MinHash(16)
254
+ m3.update("c".encode("utf-8"))
255
+ m4 = MinHash(16)
256
+ m4.update("d".encode("utf-8"))
257
+ lsh2.insert("c",m1)
258
+ lsh2.insert("d",m2)
259
+
260
+ lsh1.merge(lsh2)
261
+ for t in lsh1.hashtables:
262
+ self.assertTrue(len(t) >= 1)
263
+ items = []
264
+ for H in t:
265
+ items.extend(t[H])
266
+ self.assertTrue("c" in items)
267
+ self.assertTrue("d" in items)
268
+ self.assertTrue("a" in lsh1)
269
+ self.assertTrue("b" in lsh1)
270
+ self.assertTrue("c" in lsh1)
271
+ self.assertTrue("d" in lsh1)
272
+ for i, H in enumerate(lsh1.keys["c"]):
273
+ self.assertTrue("c" in lsh1.hashtables[i][H])
274
+
275
+ self.assertTrue(lsh1.merge, lsh2)
276
+ self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
277
+
278
+ m5 = MinHash(16)
279
+ m5.update("e".encode("utf-8"))
280
+ lsh3 = MinHashLSH(threshold=0.5, num_perm=16)
281
+ lsh3.insert("a",m5)
282
+
283
+ self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
284
+
285
+ lsh1.merge(lsh3)
286
+
287
+ m6 = MinHash(16)
288
+ m6.update("e".encode("utf-8"))
289
+ lsh4 = MinHashLSH(threshold=0.5, num_perm=16)
290
+ lsh4.insert("a",m6)
291
+
292
+ lsh1.merge(lsh4, check_overlap=False)
293
+
294
+
295
+ def test_merge_redis(self):
296
+ with patch('redis.Redis', fake_redis) as mock_redis:
297
+ lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
298
+ 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
299
+ })
300
+ lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
301
+ 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
302
+ })
303
+
304
+ m1 = MinHash(16)
305
+ m1.update("a".encode("utf8"))
306
+ m2 = MinHash(16)
307
+ m2.update("b".encode("utf8"))
308
+ lsh1.insert("a", m1)
309
+ lsh1.insert("b", m2)
310
+
311
+ m3 = MinHash(16)
312
+ m3.update("c".encode("utf8"))
313
+ m4 = MinHash(16)
314
+ m4.update("d".encode("utf8"))
315
+ lsh2.insert("c", m3)
316
+ lsh2.insert("d", m4)
317
+
318
+ lsh1.merge(lsh2)
319
+ for t in lsh1.hashtables:
320
+ self.assertTrue(len(t) >= 1)
321
+ items = []
322
+ for H in t:
323
+ items.extend(t[H])
324
+ self.assertTrue(pickle.dumps("c") in items)
325
+ self.assertTrue(pickle.dumps("d") in items)
326
+ self.assertTrue("a" in lsh1)
327
+ self.assertTrue("b" in lsh1)
328
+ self.assertTrue("c" in lsh1)
329
+ self.assertTrue("d" in lsh1)
330
+ for i, H in enumerate(lsh1.keys[pickle.dumps("c")]):
331
+ self.assertTrue(pickle.dumps("c") in lsh1.hashtables[i][H])
332
+
333
+ self.assertTrue(lsh1.merge, lsh2)
334
+ self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
335
+
336
+ m5 = MinHash(16)
337
+ m5.update("e".encode("utf-8"))
338
+ lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
339
+ 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
340
+ })
341
+ lsh3.insert("a",m5)
342
+
343
+ self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
344
+
345
+ m6 = MinHash(16)
346
+ m6.update("e".encode("utf-8"))
347
+ lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
348
+ 'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
349
+ })
350
+ lsh4.insert("a",m6)
351
+
352
+ lsh1.merge(lsh4, check_overlap=False)
353
+
243
354
 
244
355
  class TestWeightedMinHashLSH(unittest.TestCase):
245
356
 
@@ -62,6 +62,18 @@ class TestMinHashLSHForest(unittest.TestCase):
62
62
  results = forest.query(data[key], 10)
63
63
  self.assertIn(key, results)
64
64
 
65
+ def test_get_minhash_hashvalues(self):
66
+ forest, data = self._setup()
67
+ for key in data:
68
+ minhash_ori = data[key]
69
+ hashvalues = forest.get_minhash_hashvalues(key)
70
+ minhash_retrieved = MinHash(hashvalues=hashvalues)
71
+ retrieved_hashvalues = minhash_retrieved.hashvalues
72
+ self.assertEqual(len(hashvalues), len(retrieved_hashvalues))
73
+ self.assertEqual(minhash_retrieved.jaccard(minhash_ori), 1.0)
74
+ for i in range(len(retrieved_hashvalues)):
75
+ self.assertEqual(hashvalues[i], retrieved_hashvalues[i])
76
+
65
77
  def test_pickle(self):
66
78
  forest, _ = self._setup()
67
79
  forest2 = pickle.loads(pickle.dumps(forest))
@@ -1 +0,0 @@
1
- __version__ = "1.6.4"
File without changes
File without changes
File without changes
File without changes