datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,87 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import numpy as np
19
+
20
+ from _datasketches import _density_sketch, KernelFunction
21
+ from .KernelFunction import GaussianKernel
22
+
23
+ class density_sketch:
24
+ """An instance of a Density Sketch for kernel density estimation. Requires a KernelFunction object."""
25
+
26
+ def __init__(self, k:int, dim:int, kernel:KernelFunction=GaussianKernel()):
27
+ self._kernel = kernel
28
+ self._gadget = _density_sketch(k, dim, self._kernel)
29
+
30
+ @classmethod
31
+ def deserialize(cls, data:bytes, kernel:KernelFunction=GaussianKernel()):
32
+ """Reads a bytes object and returns a density sketch, using the provided kerenl or defaulting to a Guassian kerenl"""
33
+ self = cls.__new__(cls)
34
+ self._kernel = kernel
35
+ self._gadget = _density_sketch.deserialize(data, kernel)
36
+ return self
37
+
38
+ def update(self, point:np.array):
39
+ """Updates the sketch with the given point"""
40
+ self._gadget.update(point)
41
+
42
+ def merge(self, other:'density_sketch'):
43
+ """Merges the provided sketch into this one"""
44
+ self._gadget.merge(other._gadget)
45
+
46
+ def is_empty(self):
47
+ """Returns True if the sketch is empty, otherwise False"""
48
+ return self._gadget.is_empty()
49
+
50
+ def get_k(self):
51
+ """Returns the configured parameter k"""
52
+ return self._gadget.get_k()
53
+
54
+ def get_dim(self):
55
+ """Returns the configured parameter dim"""
56
+ return self._gadget.get_dim()
57
+
58
+ def get_n(self):
59
+ """Returns the length of the input stream"""
60
+ return self._gadget.get_n()
61
+
62
+ def get_num_retained(self):
63
+ """Returns the number of retained items (samples) in the sketch"""
64
+ return self._gadget.get_num_retained()
65
+
66
+ def is_estimation_mode(self):
67
+ """Returns True if the sketch is in estimation mode, otherwise False"""
68
+ return self._gadget.is_estimation_mode()
69
+
70
+ def get_estimate(self, point:np.array):
71
+ """Returns an approximate density at the given point"""
72
+ return self._gadget.get_estimate(point)
73
+
74
+ def serialize(self):
75
+ """Serializes the sketch into a bytes object"""
76
+ return self._gadget.serialize()
77
+
78
+ def __str__(self, print_levels:bool=False, print_items:bool=False):
79
+ """Produces a string summary of the sketch"""
80
+ return self._gadget.to_string(print_levels, print_items)
81
+
82
+ def to_string(self, print_levels:bool=False, print_items:bool=False):
83
+ """Produces a string summary of the sketch"""
84
+ return self._gadget.to_string(print_levels, print_items)
85
+
86
+ def __iter__(self):
87
+ return self._gadget.__iter__()
@@ -0,0 +1,35 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import numpy as np
19
+
20
+ from _datasketches import KernelFunction
21
+
22
+ # This file provides an example Python Kernel Function implementation.
23
+ #
24
+ # Each implementation must extend the KernelFunction class
25
+ # and define the __call__ method
26
+
27
+ # Implements a basic Gaussian Kernel
28
+ class GaussianKernel(KernelFunction):
29
+ def __init__(self, bandwidth: float=1.0):
30
+ KernelFunction.__init__(self)
31
+ self._bw = bandwidth
32
+ self._scale = -0.5 * (bandwidth ** -2)
33
+
34
+ def __call__(self, a: np.array, b: np.array) -> float:
35
+ return np.exp(self._scale * np.linalg.norm(a - b)**2)
@@ -54,51 +54,57 @@ class PyStringsSerDe(PyObjectSerDe):
54
54
  str = data[offset+4:offset+4+num_chars].decode()
55
55
  return (str, 4+num_chars)
56
56
 
57
- # Implements an integer-encoding scheme where each integer is written
57
+ # Implements an integer encoding scheme where each integer is written
58
58
  # as a 32-bit (4 byte) little-endian value.
59
59
  class PyIntsSerDe(PyObjectSerDe):
60
60
  def get_size(self, item):
61
61
  return int(4)
62
62
 
63
63
  def to_bytes(self, item):
64
- return struct.pack('i', item)
64
+ return struct.pack('<i', item)
65
65
 
66
66
  def from_bytes(self, data: bytes, offset: int):
67
- val = struct.unpack_from('i', data, offset)[0]
67
+ val = struct.unpack_from('<i', data, offset)[0]
68
68
  return (val, 4)
69
69
 
70
70
 
71
+ # Implements an integer encoding scheme where each integer is written
72
+ # as a 64-bit (8 byte) little-endian value.
71
73
  class PyLongsSerDe(PyObjectSerDe):
72
74
  def get_size(self, item):
73
75
  return int(8)
74
76
 
75
77
  def to_bytes(self, item):
76
- return struct.pack('l', item)
78
+ return struct.pack('<l', item)
77
79
 
78
80
  def from_bytes(self, data: bytes, offset: int):
79
- val = struct.unpack_from('l', data, offset)[0]
81
+ val = struct.unpack_from('<l', data, offset)[0]
80
82
  return (val, 8)
81
83
 
82
84
 
85
+ # Implements a floating point encoding scheme where each value is written
86
+ # as a 32-bit floating point value.
83
87
  class PyFloatsSerDe(PyObjectSerDe):
84
88
  def get_size(self, item):
85
89
  return int(4)
86
90
 
87
91
  def to_bytes(self, item):
88
- return struct.pack('f', item)
92
+ return struct.pack('<f', item)
89
93
 
90
94
  def from_bytes(self, data: bytes, offset: int):
91
- val = struct.unpack_from('f', data, offset)[0]
95
+ val = struct.unpack_from('<f', data, offset)[0]
92
96
  return (val, 4)
93
97
 
94
98
 
99
+ # Implements a floating point encoding scheme where each value is written
100
+ # as a 64-bit floating point value.
95
101
  class PyDoublesSerDe(PyObjectSerDe):
96
102
  def get_size(self, item):
97
103
  return int(8)
98
104
 
99
105
  def to_bytes(self, item):
100
- return struct.pack('d', item)
106
+ return struct.pack('<d', item)
101
107
 
102
108
  def from_bytes(self, data: bytes, offset: int):
103
- val = struct.unpack_from('d', data, offset)[0]
109
+ val = struct.unpack_from('<d', data, offset)[0]
104
110
  return (val, 8)
@@ -0,0 +1,77 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import sys
19
+
20
+ from _datasketches import TuplePolicy
21
+
22
+ # This file provides an example Python Tuple Policy implementation.
23
+ #
24
+ # Each implementation must extend the TuplePolicy class and define
25
+ # three methods:
26
+ # * create_summary() returns a new Summary object
27
+ # * update_summary(summary, update) applies the relevant policy to update the
28
+ # provided summary with the data in update.
29
+ # * __call__ may be similar to update_summary but allows a different
30
+ # implementation for set operations (union and intersection)
31
+
32
+ # Implements an accumulator summary policy, where new values are
33
+ # added to the existing value.
34
+ class AccumulatorPolicy(TuplePolicy):
35
+ def __init__(self):
36
+ TuplePolicy.__init__(self)
37
+
38
+ def create_summary(self) -> int:
39
+ return int(0)
40
+
41
+ def update_summary(self, summary: int, update: int) -> int:
42
+ summary += update
43
+ return summary
44
+
45
+ def __call__(self, summary: int, update: int) -> int:
46
+ summary += update
47
+ return summary
48
+
49
+
50
+ # Implements a MAX rule, where the largest integer value is always kept
51
+ class MaxIntPolicy(TuplePolicy):
52
+ def __init__(self):
53
+ TuplePolicy.__init__(self)
54
+
55
+ def create_summary(self) -> int:
56
+ return int(-sys.maxsize-1)
57
+
58
+ def update_summary(self, summary: int, update: int) -> int:
59
+ return max(summary, update)
60
+
61
+ def __call__(self, summary: int, update: int) -> int:
62
+ return max(summary, update)
63
+
64
+
65
+ # Implements a MIN rule, where the smallest integer value is always kept
66
+ class MinIntPolicy(TuplePolicy):
67
+ def __init__(self):
68
+ TuplePolicy.__init__(self)
69
+
70
+ def create_summary(self) -> int:
71
+ return int(sys.maxsize)
72
+
73
+ def update_summary(self, summary: int, update: int) -> int:
74
+ return min(summary, update)
75
+
76
+ def __call__(self, summary: int, update: int) -> int:
77
+ return min(summary, update)
@@ -0,0 +1,205 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ from abc import ABC, abstractmethod
19
+
20
+ from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch
21
+ from _datasketches import _tuple_union, _tuple_intersection
22
+ from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity
23
+ from _datasketches import PyObjectSerDe, theta_sketch, TuplePolicy
24
+
25
+ class tuple_sketch(ABC):
26
+ """An abstract base class representing a Tuple Sketch."""
27
+ _gadget: _tuple_sketch
28
+
29
+ def __str__(self, print_items:bool=False):
30
+ return self._gadget.to_string(print_items)
31
+
32
+ def is_empty(self):
33
+ """Returns True if the sketch is empty, otherwise False."""
34
+ return self._gadget.is_empty()
35
+
36
+ def get_estimate(self):
37
+ """Returns an estimate of the distinct count of the input stream."""
38
+ return self._gadget.get_estimate()
39
+
40
+ def get_upper_bound(self, num_std_devs:int):
41
+ """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}."""
42
+ return self._gadget.get_upper_bound(num_std_devs)
43
+
44
+ def get_lower_bound(self, num_std_devs:int):
45
+ """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}."""
46
+ return self._gadget.get_lower_bound(num_std_devs)
47
+
48
+ def is_estimation_mode(self):
49
+ """Returns True if the sketch is in estimation mode, otherwise False."""
50
+ return self._gadget.is_estimation_mode()
51
+
52
+ def get_theta(self):
53
+ """Returns theta (the effective sampling rate) as a fraction from 0 to 1."""
54
+ return self._gadget.get_theta()
55
+
56
+ def get_theta64(self):
57
+ """Returns theta as a 64-bit integer value."""
58
+ return self._gadget.get_theta64()
59
+
60
+ def get_num_retained(self):
61
+ """Returns the number of items currently in the sketch."""
62
+ return self._gadget.get_num_retained()
63
+
64
+ def get_seed_hash(self):
65
+ """Returns a hash of the seed used in the sketch."""
66
+ return self._gadget.get_seed_hash()
67
+
68
+ def is_ordered(self):
69
+ """Returns True if the sketch entries are ordered, otherwise False."""
70
+ return self._gadget.is_ordered()
71
+
72
+ def __iter__(self):
73
+ return self._gadget.__iter__()
74
+
75
+
76
+ class compact_tuple_sketch(tuple_sketch):
77
+ """An instance of a Tuple Sketch that has been compacted and can no longer accept updates."""
78
+
79
+ def __init__(self, other:tuple_sketch, ordered:bool = True):
80
+ if other == None:
81
+ self._gadget = None
82
+ else:
83
+ self._gadget = _compact_tuple_sketch(other, ordered)
84
+
85
+ def serialize(self, serde:PyObjectSerDe):
86
+ """Serializes the sketch into a bytes object with the provided SerDe."""
87
+ return self._gadget.serialize(serde)
88
+
89
+ @classmethod
90
+ def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED):
91
+ """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value."""
92
+ self = cls.__new__(cls)
93
+ self._gadget = _compact_tuple_sketch(sketch, summary, seed)
94
+ return self
95
+
96
+ @classmethod
97
+ def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED):
98
+ """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch."""
99
+ self = cls.__new__(cls)
100
+ self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed)
101
+ return self
102
+
103
+
104
+ class update_tuple_sketch(tuple_sketch):
105
+ """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values."""
106
+
107
+ def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
108
+ self._policy = policy
109
+ self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed)
110
+
111
+ def update(self, datum, value):
112
+ """Updates the sketch with the provided item and summary value."""
113
+ self._gadget.update(datum, value)
114
+
115
+ def compact(self, ordered:bool = True) -> compact_tuple_sketch:
116
+ """Returns a compacted form of the sketch, optionally sorting it."""
117
+ return self._gadget.compact(ordered)
118
+
119
+ def reset(self):
120
+ """Resets the sketch to the initial empty state."""
121
+ self._gadget.reset()
122
+
123
+
124
+ class tuple_union:
125
+ """An object that can merge Tuple Sketches. Requires a Policy object to handle merging Summaries."""
126
+ _policy: TuplePolicy
127
+
128
+ def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
129
+ self._policy = policy
130
+ self._gadget = _tuple_union(self._policy, lg_k, p, seed)
131
+
132
+ def update(self, sketch:tuple_sketch):
133
+ """Updates the union with the given sketch."""
134
+ self._gadget.update(sketch._gadget)
135
+
136
+ def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
137
+ """Returns the sketch corresponding to the union result, optionally sorted."""
138
+ return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
139
+
140
+ def reset(self):
141
+ """Resets the union to the initial empty state."""
142
+ self._gadget.reset()
143
+
144
+
145
+ class tuple_intersection:
146
+ """An object that can intersect Tuple Sketches. Requires a Policy object to handle intersecting Summaries."""
147
+ _policy: TuplePolicy
148
+
149
+ def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED):
150
+ self._policy = policy
151
+ self._gadget = _tuple_intersection(self._policy, seed)
152
+
153
+ def update(self, sketch:tuple_sketch):
154
+ """Intersects the provided sketch with the current intersection state."""
155
+ self._gadget.update(sketch._gadget)
156
+
157
+ def has_result(self) -> bool:
158
+ """Returns True if the intersection has a valid result, otherwise False."""
159
+ return self._gadget.has_result()
160
+
161
+ def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
162
+ """Returns the sketch corresponding to the intersection result, optionally sorted."""
163
+ return compact_tuple_sketch(self._gadget.get_result(ordered), ordered)
164
+
165
+
166
+ class tuple_a_not_b:
167
+ """An object that can perform the A-not-B operation between two sketches."""
168
+ def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED):
169
+ self._gadget = _tuple_a_not_b(seed)
170
+
171
+ def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch:
172
+ """Returns a sketch with the result of applying the A-not-B operation on the given inputs."""
173
+ return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget))
174
+
175
+
176
+ class tuple_jaccard_similarity:
177
+ @staticmethod
178
+ def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
179
+ """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches."""
180
+ return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed)
181
+
182
+ @staticmethod
183
+ def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
184
+ """Returns True if sketch_a and sketch_b are equivalent, otherwise False."""
185
+ return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed)
186
+
187
+ @staticmethod
188
+ def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
189
+ """Tests similarity of an actual sketch against an expected sketch.
190
+
191
+ Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches.
192
+ If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of
193
+ 97.7% and returns True, otherwise False.
194
+ """
195
+ return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed)
196
+
197
+ @staticmethod
198
+ def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
199
+ """Tests dissimilarity of an actual sketch against an expected sketch.
200
+
201
+ Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches.
202
+ If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of
203
+ 97.7% and returns True, otherwise False.
204
+ """
205
+ return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed)
@@ -15,8 +15,24 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ """The Apache DataSketches Library for Python
19
+
20
+ Provided under the Apache License, Version 2.0
21
+ <http://www.apache.org/licenses/LICENSE-2.0>
22
+ """
23
+
18
24
  name = 'datasketches'
19
25
 
26
+ from _datasketches import *
27
+
20
28
  from .PySerDe import *
29
+ from .TuplePolicy import *
30
+ from .KernelFunction import *
21
31
 
22
- from _datasketches import *
32
+ # Wrappers around the pybind11 classes for cases where we
33
+ # need to define a python object that is persisted within
34
+ # the C++ object. Currently, the native python portion of
35
+ # a class derived from a C++ class may be garbage collected
36
+ # even though a pointer to the C++ portion remains valid.
37
+ from .TupleWrapper import *
38
+ from .DensityWrapper import *
@@ -0,0 +1,98 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ //#include <memory>
21
+ #include <pybind11/pybind11.h>
22
+ #include <pybind11/numpy.h>
23
+
24
+ #ifndef _KERNEL_FUNCTION_HPP_
25
+ #define _KERNEL_FUNCTION_HPP_
26
+
27
+ namespace py = pybind11;
28
+
29
+ namespace datasketches {
30
+
31
+ /**
32
+ * @brief kernel_function provides the underlying base class from
33
+ * which native Python kernels ultimately inherit. The actual
34
+ * kernels implement KernelFunction, as shown in KernelFunction.py
35
+ */
36
+ struct kernel_function {
37
+ virtual double operator()(py::array_t<double>& a, const py::array_t<double>& b) const = 0;
38
+ virtual ~kernel_function() = default;
39
+ };
40
+
41
+ /**
42
+ * @brief KernelFunction provides the "trampoline" class for pybind11
43
+ * that allows for a native Python implementation of kernel
44
+ * functions.
45
+ */
46
+ struct KernelFunction : public kernel_function {
47
+ using kernel_function::kernel_function;
48
+
49
+ /**
50
+ * @brief Evaluates K(a,b), the kernel function for the given points a and b
51
+ * @param a the first vector
52
+ * @param b the second vector
53
+ * @return The function value K(a,b)
54
+ */
55
+ double operator()(py::array_t<double>& a, const py::array_t<double>& b) const override {
56
+ PYBIND11_OVERRIDE_PURE_NAME(
57
+ double, // Return type
58
+ kernel_function, // Parent class
59
+ "__call__", // Name of function in python
60
+ operator(), // Name of function in C++
61
+ a, b // Arguments
62
+ );
63
+ }
64
+ };
65
+
66
+ /* The kernel_function_holder provides a concrete class that dispatches calls
67
+ * from the sketch to the kernel_function. This class is needed to provide a
68
+ * concrete object to produce a compiled library, but library users should
69
+ * never need to use this directly.
70
+ */
71
+ struct kernel_function_holder {
72
+ explicit kernel_function_holder(std::shared_ptr<kernel_function> kernel) : _kernel(kernel) {}
73
+ kernel_function_holder(const kernel_function_holder& other) : _kernel(other._kernel) {}
74
+ kernel_function_holder(kernel_function_holder&& other) : _kernel(std::move(other._kernel)) {}
75
+ kernel_function_holder& operator=(const kernel_function_holder& other) { _kernel = other._kernel; return *this; }
76
+ kernel_function_holder& operator=(kernel_function_holder&& other) { std::swap(_kernel, other._kernel); return *this; }
77
+
78
+ double operator()(const std::vector<double>& a, const py::array_t<double>& b) const {
79
+ py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
80
+ return _kernel->operator()(a_arr, b);
81
+ }
82
+
83
+ double operator()(const std::vector<double>& a, const std::vector<double>& b) const {
84
+ py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
85
+ py::array_t<double> b_arr(b.size(), b.data(), dummy_array_owner);
86
+ return _kernel->operator()(a_arr, b_arr);
87
+ }
88
+
89
+ private:
90
+ // a dummy object to "own" arrays when translating from std::vector to avoid a copy:
91
+ // https://github.com/pybind/pybind11/issues/323#issuecomment-575717041
92
+ py::str dummy_array_owner;
93
+ std::shared_ptr<kernel_function> _kernel;
94
+ };
95
+
96
+ }
97
+
98
+ #endif // _KERNEL_FUNCTION_HPP_
@@ -0,0 +1,37 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _PY_OBJECT_LT_HPP_
21
+ #define _PY_OBJECT_LT_HPP_
22
+
23
+ #include <pybind11/pybind11.h>
24
+
25
+ /*
26
+ This header defines a less than operator on generic python
27
+ objects. The implementation calls the object's built-in __lt__()
28
+ method. If that method is not defined, the call may fail.
29
+ */
30
+
31
+ struct py_object_lt {
32
+ bool operator()(const pybind11::object& a, const pybind11::object& b) const {
33
+ return a < b;
34
+ }
35
+ };
36
+
37
+ #endif // _PY_OBJECT_LT_HPP_
@@ -0,0 +1,48 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _PY_OBJECT_OSTREAM_HPP_
21
+ #define _PY_OBJECT_OSTREAM_HPP_
22
+
23
+ #include <pybind11/pybind11.h>
24
+
25
+ #include <string>
26
+ #include <ostream>
27
+
28
+ /*
29
+ This header defines an ostream output operator on a generic python
30
+ object. The implementation calls the object's built-in __str__()
31
+ method. If that method is not defined, the call may fail.
32
+
33
+ NOTE: This header must be included before the inclusion of
34
+ any sketch classes.
35
+ */
36
+
37
+ namespace py = pybind11;
38
+
39
+ namespace datasketches {
40
+
41
+ static std::ostream& operator<<(std::ostream& os, const py::object& obj) {
42
+ os << std::string(pybind11::str(obj));
43
+ return os;
44
+ }
45
+
46
+ }
47
+
48
+ #endif // _PY_OBJECT_OSTREAM_HPP_