datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,87 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import numpy as np
19
+
20
+ from _datasketches import _density_sketch, KernelFunction
21
+ from .KernelFunction import GaussianKernel
22
+
23
class density_sketch:
  """A Density Sketch used for kernel density estimation. A KernelFunction object is required."""

  def __init__(self, k:int, dim:int, kernel:KernelFunction=GaussianKernel()):
    # The kernel is stored on the Python wrapper in addition to being
    # handed to the underlying _density_sketch object.
    self._kernel = kernel
    self._gadget = _density_sketch(k, dim, self._kernel)

  @classmethod
  def deserialize(cls, data:bytes, kernel:KernelFunction=GaussianKernel()):
    """Reads a bytes object and returns a density sketch, using the provided kernel or defaulting to a Gaussian kernel"""
    # Bypass __init__ so that no fresh underlying sketch is created
    # before the deserialized one is attached.
    sketch = cls.__new__(cls)
    sketch._kernel = kernel
    sketch._gadget = _density_sketch.deserialize(data, kernel)
    return sketch

  def update(self, point:np.array):
    """Feeds a single point into the sketch"""
    self._gadget.update(point)

  def merge(self, other:'density_sketch'):
    """Merges the other sketch into this sketch"""
    self._gadget.merge(other._gadget)

  def is_empty(self):
    """True when the sketch is empty, False otherwise"""
    return self._gadget.is_empty()

  def get_k(self):
    """The configured parameter k"""
    return self._gadget.get_k()

  def get_dim(self):
    """The configured parameter dim"""
    return self._gadget.get_dim()

  def get_n(self):
    """The length of the input stream"""
    return self._gadget.get_n()

  def get_num_retained(self):
    """The number of items (samples) retained in the sketch"""
    return self._gadget.get_num_retained()

  def is_estimation_mode(self):
    """True when the sketch is in estimation mode, False otherwise"""
    return self._gadget.is_estimation_mode()

  def get_estimate(self, point:np.array):
    """An approximate density at the given point"""
    return self._gadget.get_estimate(point)

  def serialize(self):
    """Serializes the sketch to a bytes object"""
    return self._gadget.serialize()

  def __str__(self, print_levels:bool=False, print_items:bool=False):
    """A string summary of the sketch"""
    return self._gadget.to_string(print_levels, print_items)

  def to_string(self, print_levels:bool=False, print_items:bool=False):
    """A string summary of the sketch"""
    return self._gadget.to_string(print_levels, print_items)

  def __iter__(self):
    # Iteration is delegated entirely to the underlying sketch object.
    return self._gadget.__iter__()
@@ -0,0 +1,35 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import numpy as np
19
+
20
+ from _datasketches import KernelFunction
21
+
22
+ # This file provides an example Python Kernel Function implementation.
23
+ #
24
+ # Each implementation must extend the KernelFunction class
25
+ # and define the __call__ method
26
+
27
+ # Implements a basic Gaussian Kernel
28
class GaussianKernel(KernelFunction):
  """A basic Gaussian kernel: K(a, b) = exp(-|a - b|^2 / (2 * bandwidth^2))."""

  def __init__(self, bandwidth: float=1.0):
    # The base class initializer is invoked explicitly.
    KernelFunction.__init__(self)
    self._bw = bandwidth
    # Precompute the exponent's constant factor once per kernel.
    self._scale = -0.5 * (bandwidth ** -2)

  def __call__(self, a: np.array, b: np.array) -> float:
    """Evaluates the kernel for the pair of points a and b."""
    distance = np.linalg.norm(a - b)
    return np.exp(self._scale * distance**2)
@@ -54,51 +54,57 @@ class PyStringsSerDe(PyObjectSerDe):
54
54
  str = data[offset+4:offset+4+num_chars].decode()
55
55
  return (str, 4+num_chars)
56
56
 
57
- # Implements an integer-encoding scheme where each integer is written
57
+ # Implements an integer encoding scheme where each integer is written
58
58
  # as a 32-bit (4 byte) little-endian value.
59
59
  class PyIntsSerDe(PyObjectSerDe):
60
60
  def get_size(self, item):
61
61
  return int(4)
62
62
 
63
63
  def to_bytes(self, item):
64
- return struct.pack('i', item)
64
+ return struct.pack('<i', item)
65
65
 
66
66
  def from_bytes(self, data: bytes, offset: int):
67
- val = struct.unpack_from('i', data, offset)[0]
67
+ val = struct.unpack_from('<i', data, offset)[0]
68
68
  return (val, 4)
69
69
 
70
70
 
71
+ # Implements an integer encoding scheme where each integer is written
72
+ # as a 64-bit (8 byte) little-endian value.
71
73
  class PyLongsSerDe(PyObjectSerDe):
72
74
  def get_size(self, item):
73
75
  return int(8)
74
76
 
75
77
  def to_bytes(self, item):
76
- return struct.pack('l', item)
78
+ return struct.pack('<l', item)
77
79
 
78
80
  def from_bytes(self, data: bytes, offset: int):
79
- val = struct.unpack_from('l', data, offset)[0]
81
+ val = struct.unpack_from('<l', data, offset)[0]
80
82
  return (val, 8)
81
83
 
82
84
 
85
+ # Implements a floating point encoding scheme where each value is written
86
+ # as a 32-bit floating point value.
83
87
  class PyFloatsSerDe(PyObjectSerDe):
84
88
  def get_size(self, item):
85
89
  return int(4)
86
90
 
87
91
  def to_bytes(self, item):
88
- return struct.pack('f', item)
92
+ return struct.pack('<f', item)
89
93
 
90
94
  def from_bytes(self, data: bytes, offset: int):
91
- val = struct.unpack_from('f', data, offset)[0]
95
+ val = struct.unpack_from('<f', data, offset)[0]
92
96
  return (val, 4)
93
97
 
94
98
 
99
+ # Implements a floating point encoding scheme where each value is written
100
+ # as a 64-bit floating point value.
95
101
  class PyDoublesSerDe(PyObjectSerDe):
96
102
  def get_size(self, item):
97
103
  return int(8)
98
104
 
99
105
  def to_bytes(self, item):
100
- return struct.pack('d', item)
106
+ return struct.pack('<d', item)
101
107
 
102
108
  def from_bytes(self, data: bytes, offset: int):
103
- val = struct.unpack_from('d', data, offset)[0]
109
+ val = struct.unpack_from('<d', data, offset)[0]
104
110
  return (val, 8)
@@ -0,0 +1,77 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import sys
19
+
20
+ from _datasketches import TuplePolicy
21
+
22
+ # This file provides an example Python Tuple Policy implementation.
23
+ #
24
+ # Each implementation must extend the TuplePolicy class and define
25
+ # three methods:
26
+ # * create_summary() returns a new Summary object
27
+ # * update_summary(summary, update) applies the relevant policy to update the
28
+ # provided summary with the data in update.
29
+ # * __call__ may be similar to update_summary but allows a different
30
+ # implementation for set operations (union and intersection)
31
+
32
+ # Implements an accumulator summary policy, where new values are
33
+ # added to the existing value.
34
class AccumulatorPolicy(TuplePolicy):
  """Accumulator summary policy: every new value is added to the existing summary."""

  def __init__(self):
    # The base class initializer is invoked explicitly.
    TuplePolicy.__init__(self)

  def create_summary(self) -> int:
    """Returns the initial summary, zero."""
    return 0

  def update_summary(self, summary: int, update: int) -> int:
    """Adds update to summary and returns the result."""
    summary += update
    return summary

  def __call__(self, summary: int, update: int) -> int:
    """Combines two summaries by adding update to summary."""
    summary += update
    return summary
48
+
49
+
50
+ # Implements a MAX rule, where the largest integer value is always kept
51
class MaxIntPolicy(TuplePolicy):
  """MAX rule: the largest integer value seen is always kept."""

  def __init__(self):
    # The base class initializer is invoked explicitly.
    TuplePolicy.__init__(self)

  def create_summary(self) -> int:
    """Returns the initial summary, -sys.maxsize - 1."""
    return -sys.maxsize - 1

  def update_summary(self, summary: int, update: int) -> int:
    """Returns the larger of summary and update (summary on ties)."""
    return update if update > summary else summary

  def __call__(self, summary: int, update: int) -> int:
    """Combines two summaries by keeping the larger one (summary on ties)."""
    return update if update > summary else summary
63
+
64
+
65
+ # Implements a MIN rule, where the smallest integer value is always kept
66
class MinIntPolicy(TuplePolicy):
  """MIN rule: the smallest integer value seen is always kept."""

  def __init__(self):
    # The base class initializer is invoked explicitly.
    TuplePolicy.__init__(self)

  def create_summary(self) -> int:
    """Returns the initial summary, sys.maxsize."""
    return sys.maxsize

  def update_summary(self, summary: int, update: int) -> int:
    """Returns the smaller of summary and update (summary on ties)."""
    return update if update < summary else summary

  def __call__(self, summary: int, update: int) -> int:
    """Combines two summaries by keeping the smaller one (summary on ties)."""
    return update if update < summary else summary
@@ -0,0 +1,205 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ from abc import ABC, abstractmethod
19
+
20
+ from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch
21
+ from _datasketches import _tuple_union, _tuple_intersection
22
+ from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity
23
+ from _datasketches import PyObjectSerDe, theta_sketch, TuplePolicy
24
+
25
class tuple_sketch(ABC):
  """Abstract base class for Tuple Sketches; every query is forwarded to the wrapped _gadget object."""
  _gadget: _tuple_sketch

  def __str__(self, print_items:bool=False):
    """A string summary of the sketch."""
    return self._gadget.to_string(print_items)

  def is_empty(self):
    """True when the sketch is empty, False otherwise."""
    return self._gadget.is_empty()

  def get_estimate(self):
    """An estimate of the distinct count of the input stream."""
    return self._gadget.get_estimate()

  def get_upper_bound(self, num_std_devs:int):
    """An approximate upper bound on the estimate at standard deviations in {1, 2, 3}."""
    return self._gadget.get_upper_bound(num_std_devs)

  def get_lower_bound(self, num_std_devs:int):
    """An approximate lower bound on the estimate at standard deviations in {1, 2, 3}."""
    return self._gadget.get_lower_bound(num_std_devs)

  def is_estimation_mode(self):
    """True when the sketch is in estimation mode, False otherwise."""
    return self._gadget.is_estimation_mode()

  def get_theta(self):
    """Theta (the effective sampling rate) as a fraction from 0 to 1."""
    return self._gadget.get_theta()

  def get_theta64(self):
    """Theta as a 64-bit integer value."""
    return self._gadget.get_theta64()

  def get_num_retained(self):
    """The number of items currently in the sketch."""
    return self._gadget.get_num_retained()

  def get_seed_hash(self):
    """A hash of the seed used in the sketch."""
    return self._gadget.get_seed_hash()

  def is_ordered(self):
    """True when the sketch entries are sorted, False otherwise."""
    return self._gadget.is_ordered()

  def __iter__(self):
    # Iteration is delegated entirely to the underlying sketch object.
    return self._gadget.__iter__()
74
+
75
+
76
class compact_tuple_sketch(tuple_sketch):
  """An instance of a Tuple Sketch that has been compacted and can no longer accept updates."""

  def __init__(self, other:tuple_sketch, ordered:bool = True):
    """Builds a compact sketch from another sketch, optionally sorting it.

    other may be a tuple_sketch wrapper, a raw _datasketches sketch object,
    or None; None leaves the new wrapper without an underlying sketch.
    """
    # The underlying constructor is handed the wrapped object rather than the
    # Python wrapper, so a tuple_sketch is unwrapped first. Any other object
    # is passed through unchanged.
    source = other._gadget if isinstance(other, tuple_sketch) else other
    if source is None:
      self._gadget = None
    else:
      self._gadget = _compact_tuple_sketch(source, ordered)

  def serialize(self, serde:PyObjectSerDe):
    """Serializes the sketch into a bytes object with the provided SerDe."""
    return self._gadget.serialize(serde)

  @classmethod
  def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value."""
    # Bypass __init__, which expects an existing tuple sketch as its source.
    self = cls.__new__(cls)
    self._gadget = _compact_tuple_sketch(sketch, summary, seed)
    return self

  @classmethod
  def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch."""
    # Bypass __init__, which expects an existing tuple sketch as its source.
    self = cls.__new__(cls)
    self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed)
    return self
102
+
103
+
104
class update_tuple_sketch(tuple_sketch):
  """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values."""

  def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
    # The policy is stored on the Python wrapper in addition to being
    # handed to the underlying _update_tuple_sketch object.
    self._policy = policy
    self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed)

  def update(self, datum, value):
    """Updates the sketch with the provided item and summary value."""
    self._gadget.update(datum, value)

  def compact(self, ordered:bool = True) -> compact_tuple_sketch:
    """Returns a compacted form of the sketch, optionally sorting it."""
    # Wrap the result so callers get a compact_tuple_sketch, as the return
    # annotation promises and as the set operations' get_result() does. The
    # wrapper carries the _gadget attribute that tuple_union.update() and
    # the other set operations read.
    return compact_tuple_sketch(self._gadget.compact(ordered), ordered)

  def reset(self):
    """Resets the sketch to the initial empty state."""
    self._gadget.reset()
122
+
123
+
124
class tuple_union:
  """Merges Tuple Sketches. A Policy object is required to handle merging Summaries."""
  _policy: TuplePolicy

  def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED):
    # The policy is stored on the Python wrapper in addition to being
    # handed to the underlying _tuple_union object.
    self._policy = policy
    self._gadget = _tuple_union(self._policy, lg_k, p, seed)

  def update(self, sketch:tuple_sketch):
    """Adds the given sketch to the union."""
    self._gadget.update(sketch._gadget)

  def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
    """The sketch corresponding to the union result, optionally sorted."""
    raw_result = self._gadget.get_result(ordered)
    return compact_tuple_sketch(raw_result, ordered)

  def reset(self):
    """Returns the union to the initial empty state."""
    self._gadget.reset()
143
+
144
+
145
class tuple_intersection:
  """Intersects Tuple Sketches. A Policy object is required to handle intersecting Summaries."""
  _policy: TuplePolicy

  def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED):
    # The policy is stored on the Python wrapper in addition to being
    # handed to the underlying _tuple_intersection object.
    self._policy = policy
    self._gadget = _tuple_intersection(self._policy, seed)

  def update(self, sketch:tuple_sketch):
    """Intersects the given sketch with the current intersection state."""
    self._gadget.update(sketch._gadget)

  def has_result(self) -> bool:
    """True when the intersection has a valid result, False otherwise."""
    return self._gadget.has_result()

  def get_result(self, ordered:bool = True) -> compact_tuple_sketch:
    """The sketch corresponding to the intersection result, optionally sorted."""
    raw_result = self._gadget.get_result(ordered)
    return compact_tuple_sketch(raw_result, ordered)
164
+
165
+
166
class tuple_a_not_b:
  """An object that can perform the A-not-B operation between two sketches."""
  def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED):
    self._gadget = _tuple_a_not_b(seed)

  def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch:
    """Returns a sketch with the result of applying the A-not-B operation on the given inputs, optionally sorted."""
    # Forward `ordered` instead of silently dropping it, matching how the
    # union and intersection wrappers build their results.
    # NOTE(review): assumes the underlying compute() accepts `ordered` as a
    # third argument -- confirm against the _tuple_a_not_b binding.
    return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget, ordered), ordered)
174
+
175
+
176
class tuple_jaccard_similarity:
  """Static helpers that compare two Tuple Sketches by unwrapping them and calling _tuple_jaccard_similarity."""

  @staticmethod
  def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches."""
    sketch_a, sketch_b = a._gadget, b._gadget
    return _tuple_jaccard_similarity.jaccard(sketch_a, sketch_b, seed)

  @staticmethod
  def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Returns True if sketches a and b are equivalent, otherwise False."""
    sketch_a, sketch_b = a._gadget, b._gadget
    return _tuple_jaccard_similarity.exactly_equal(sketch_a, sketch_b, seed)

  @staticmethod
  def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Tests similarity of an actual sketch against an expected sketch.

    Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches.
    If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of
    97.7% and returns True, otherwise False.
    """
    actual_gadget, expected_gadget = actual._gadget, expected._gadget
    return _tuple_jaccard_similarity.similarity_test(actual_gadget, expected_gadget, threshold, seed)

  @staticmethod
  def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED):
    """Tests dissimilarity of an actual sketch against an expected sketch.

    Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches.
    If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of
    97.7% and returns True, otherwise False.
    """
    actual_gadget, expected_gadget = actual._gadget, expected._gadget
    return _tuple_jaccard_similarity.dissimilarity_test(actual_gadget, expected_gadget, threshold, seed)
@@ -15,8 +15,24 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
+ """The Apache DataSketches Library for Python
19
+
20
+ Provided under the Apache License, Version 2.0
21
+ <http://www.apache.org/licenses/LICENSE-2.0>
22
+ """
23
+
18
24
  name = 'datasketches'
19
25
 
26
+ from _datasketches import *
27
+
20
28
  from .PySerDe import *
29
+ from .TuplePolicy import *
30
+ from .KernelFunction import *
21
31
 
22
- from _datasketches import *
32
+ # Wrappers around the pybind11 classes for cases where we
33
+ # need to define a python object that is persisted within
34
+ # the C++ object. Currently, the native python portion of
35
+ # a class derived from a C++ class may be garbage collected
36
+ # even though a pointer to the C++ portion remains valid.
37
+ from .TupleWrapper import *
38
+ from .DensityWrapper import *
@@ -0,0 +1,98 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ //#include <memory>
21
+ #include <pybind11/pybind11.h>
22
+ #include <pybind11/numpy.h>
23
+
24
+ #ifndef _KERNEL_FUNCTION_HPP_
25
+ #define _KERNEL_FUNCTION_HPP_
26
+
27
+ namespace py = pybind11;
28
+
29
+ namespace datasketches {
30
+
31
+ /**
32
+ * @brief kernel_function provides the underlying base class from
33
+ * which native Python kernels ultimately inherit. The actual
34
+ * kernels implement KernelFunction, as shown in KernelFunction.py
35
+ */
36
+ struct kernel_function {
37
+ virtual double operator()(py::array_t<double>& a, const py::array_t<double>& b) const = 0;
38
+ virtual ~kernel_function() = default;
39
+ };
40
+
41
+ /**
42
+ * @brief KernelFunction provides the "trampoline" class for pybind11
43
+ * that allows for a native Python implementation of kernel
44
+ * functions.
45
+ */
46
struct KernelFunction : public kernel_function {
  using kernel_function::kernel_function;

  /**
   * @brief Evaluates K(a,b), the kernel function for the given points a and b.
   *        The call is forwarded through pybind11 to the Python object's
   *        __call__ method.
   * @param a the first vector
   * @param b the second vector
   * @return The function value K(a,b)
   */
  double operator()(py::array_t<double>& a, const py::array_t<double>& b) const override {
    PYBIND11_OVERRIDE_PURE_NAME(
      double, // Return type
      kernel_function, // Parent class
      "__call__", // Name of function in python
      operator(), // Name of function in C++
      a, b // Arguments
    );
  }
};
65
+
66
+ /* The kernel_function_holder provides a concrete class that dispatches calls
67
+ * from the sketch to the kernel_function. This class is needed to provide a
68
+ * concrete object to produce a compiled library, but library users should
69
+ * never need to use this directly.
70
+ */
71
+ struct kernel_function_holder {
72
+ explicit kernel_function_holder(std::shared_ptr<kernel_function> kernel) : _kernel(kernel) {}
73
+ kernel_function_holder(const kernel_function_holder& other) : _kernel(other._kernel) {}
74
+ kernel_function_holder(kernel_function_holder&& other) : _kernel(std::move(other._kernel)) {}
75
+ kernel_function_holder& operator=(const kernel_function_holder& other) { _kernel = other._kernel; return *this; }
76
+ kernel_function_holder& operator=(kernel_function_holder&& other) { std::swap(_kernel, other._kernel); return *this; }
77
+
78
+ double operator()(const std::vector<double>& a, const py::array_t<double>& b) const {
79
+ py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
80
+ return _kernel->operator()(a_arr, b);
81
+ }
82
+
83
+ double operator()(const std::vector<double>& a, const std::vector<double>& b) const {
84
+ py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
85
+ py::array_t<double> b_arr(b.size(), b.data(), dummy_array_owner);
86
+ return _kernel->operator()(a_arr, b_arr);
87
+ }
88
+
89
+ private:
90
+ // a dummy object to "own" arrays when translating from std::vector to avoid a copy:
91
+ // https://github.com/pybind/pybind11/issues/323#issuecomment-575717041
92
+ py::str dummy_array_owner;
93
+ std::shared_ptr<kernel_function> _kernel;
94
+ };
95
+
96
+ }
97
+
98
+ #endif // _KERNEL_FUNCTION_HPP_
@@ -0,0 +1,37 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _PY_OBJECT_LT_HPP_
21
+ #define _PY_OBJECT_LT_HPP_
22
+
23
+ #include <pybind11/pybind11.h>
24
+
25
+ /*
26
+ This header defines a less than operator on generic python
27
+ objects. The implementation calls the object's built-in __lt__()
28
+ method. If that method is not defined, the call may fail.
29
+ */
30
+
31
+ struct py_object_lt {
32
+ bool operator()(const pybind11::object& a, const pybind11::object& b) const {
33
+ return a < b;
34
+ }
35
+ };
36
+
37
+ #endif // _PY_OBJECT_LT_HPP_
@@ -0,0 +1,48 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _PY_OBJECT_OSTREAM_HPP_
21
+ #define _PY_OBJECT_OSTREAM_HPP_
22
+
23
+ #include <pybind11/pybind11.h>
24
+
25
+ #include <string>
26
+ #include <ostream>
27
+
28
+ /*
29
+ This header defines an ostream output operator on a generic python
30
+ object. The implementation calls the object's built-in __str__()
31
+ method. If that method is not defined, the call may fail.
32
+
33
+ NOTE: This header must be included before the inclusion of
34
+ any sketch classes.
35
+ */
36
+
37
+ namespace py = pybind11;
38
+
39
+ namespace datasketches {
40
+
41
+ static std::ostream& operator<<(std::ostream& os, const py::object& obj) {
42
+ os << std::string(pybind11::str(obj));
43
+ return os;
44
+ }
45
+
46
+ }
47
+
48
+ #endif // _PY_OBJECT_OSTREAM_HPP_