datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -79,7 +79,7 @@ class CMakeBuild(build_ext):
79
79
 
80
80
  setup(
81
81
  name='datasketches',
82
- version='3.0.0',
82
+ version='3.1.0',
83
83
  author='Apache DataSketches Developers',
84
84
  author_email='dev@datasketches.apache.org',
85
85
  description='A wrapper for the C++ Apache DataSketches library',
@@ -90,7 +90,7 @@ public:
90
90
  * @param f the inclusion probability used to produce the set with size <i>a</i>.
91
91
  * @return the approximate lower bound
92
92
  */
93
- static double estimate_of_a(uint64_t a, uint64_t f) {
93
+ static double estimate_of_a(uint64_t a, double f) {
94
94
  check_inputs(a, 1, f);
95
95
  return a / f;
96
96
  }
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_HPP_
22
+
23
+ #include <stdint.h>
24
+
25
+ namespace datasketches {
26
+
27
+ template<bool dummy>
28
+ class compact_theta_sketch_parser {
29
+ public:
30
+ struct compact_theta_sketch_data {
31
+ bool is_empty;
32
+ bool is_ordered;
33
+ uint16_t seed_hash;
34
+ uint32_t num_entries;
35
+ uint64_t theta;
36
+ const uint64_t* entries;
37
+ };
38
+
39
+ static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
40
+
41
+ private:
42
+ // offsets are in sizeof(type)
43
+ static const size_t COMPACT_SKETCH_PRE_LONGS_BYTE = 0;
44
+ static const size_t COMPACT_SKETCH_SERIAL_VERSION_BYTE = 1;
45
+ static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
46
+ static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
47
+ static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
48
+ static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
49
+ static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
50
+ static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
51
+ static const size_t COMPACT_SKETCH_THETA_U64 = 2;
52
+ static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
53
+
54
+ static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
55
+ static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
56
+
57
+ static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
58
+ static const uint8_t COMPACT_SKETCH_TYPE = 3;
59
+
60
+ static std::string hex_dump(const uint8_t* ptr, size_t size);
61
+ };
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "compact_theta_sketch_parser_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,70 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
22
+
23
+ #include <iostream>
24
+ #include <iomanip>
25
+
26
+ namespace datasketches {
27
+
28
+ template<bool dummy>
29
+ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
+ if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
+ checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
33
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
34
+ uint64_t theta = theta_constants::MAX_THETA;
35
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
36
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
37
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
38
+ return {true, true, seed_hash, 0, theta, nullptr};
39
+ }
40
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
41
+ if (has_theta) {
42
+ if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
43
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
44
+ }
45
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
46
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
47
+ }
48
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
49
+ const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
50
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
51
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
52
+ if (size < expected_size_bytes) {
53
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
54
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
55
+ }
56
+ const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
57
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
58
+ }
59
+
60
+ template<bool dummy>
61
+ std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
62
+ std::stringstream s;
63
+ s << std::hex << std::setfill('0') << std::uppercase;
64
+ for (size_t i = 0; i < size; ++i) s << std::setw(2) << (ptr[i] & 0xff);
65
+ return s.str();
66
+ }
67
+
68
+ } /* namespace datasketches */
69
+
70
+ #endif
@@ -33,14 +33,19 @@ public:
33
33
  using Sketch = theta_sketch_alloc<Allocator>;
34
34
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
35
 
36
- struct pass_through_policy {
37
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
36
+ struct nop_policy {
37
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
38
  unused(incoming_entry);
39
- return internal_entry;
39
+ unused(internal_entry);
40
40
  }
41
41
  };
42
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
42
+ using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
43
43
 
44
+ /*
45
+ * Constructor
46
+ * @param seed for the hash function that was used to create the sketch
47
+ * @param allocator to use for allocating and deallocating memory
48
+ */
44
49
  explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
45
50
 
46
51
  /**
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
- state_(seed, pass_through_policy(), allocator)
27
+ state_(seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -46,20 +46,21 @@ public:
46
46
  *
47
47
  * @param sketch_a given sketch A
48
48
  * @param sketch_b given sketch B
49
+ * @param seed for the hash function that was used to create the sketch
49
50
  * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
50
51
  * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
51
52
  */
52
53
  template<typename SketchA, typename SketchB>
53
- static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
54
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
54
55
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
55
56
  if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
56
57
  if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
57
58
 
58
- auto union_ab = compute_union(sketch_a, sketch_b);
59
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
59
60
  if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
60
61
 
61
62
  // intersection
62
- Intersection i;
63
+ Intersection i(seed);
63
64
  i.update(sketch_a);
64
65
  i.update(sketch_b);
65
66
  i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@ public:
76
77
  * Returns true if the two given sketches are equivalent.
77
78
  * @param sketch_a the given sketch A
78
79
  * @param sketch_b the given sketch B
80
+ * @param seed for the hash function that was used to create the sketch
79
81
  * @return true if the two given sketches are exactly equal
80
82
  */
81
83
  template<typename SketchA, typename SketchB>
82
- static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
84
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
83
85
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
84
86
  if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
85
87
  if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
86
88
 
87
- auto union_ab = compute_union(sketch_a, sketch_b);
89
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
88
90
  if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
89
91
  return false;
90
92
  }
@@ -99,12 +101,13 @@ public:
99
101
  * @param actual the sketch to be tested
100
102
  * @param expected the reference sketch that is considered to be correct
101
103
  * @param threshold a real value between zero and one
104
+ * @param seed for the hash function that was used to create the sketch
102
105
  * @return true if the similarity of the two sketches is greater than the given threshold
103
106
  * with at least 97.7% confidence
104
107
  */
105
108
  template<typename SketchA, typename SketchB>
106
- static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
107
- auto jc = jaccard(actual, expected);
109
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
110
+ auto jc = jaccard(actual, expected, seed);
108
111
  return jc[0] >= threshold;
109
112
  }
110
113
 
@@ -118,23 +121,24 @@ public:
118
121
  * @param actual the sketch to be tested
119
122
  * @param expected the reference sketch that is considered to be correct
120
123
  * @param threshold a real value between zero and one
124
+ * @param seed for the hash function that was used to create the sketch
121
125
  * @return true if the dissimilarity of the two sketches is greater than the given threshold
122
126
  * with at least 97.7% confidence
123
127
  */
124
128
  template<typename SketchA, typename SketchB>
125
- static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
126
- auto jc = jaccard(actual, expected);
129
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
130
+ auto jc = jaccard(actual, expected, seed);
127
131
  return jc[2] <= threshold;
128
132
  }
129
133
 
130
134
  private:
131
135
 
132
136
  template<typename SketchA, typename SketchB>
133
- static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
134
- const unsigned count_a = sketch_a.get_num_retained();
135
- const unsigned count_b = sketch_b.get_num_retained();
136
- const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
137
- auto u = typename Union::builder().set_lg_k(lg_k).build();
137
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
138
+ const auto count_a = sketch_a.get_num_retained();
139
+ const auto count_b = sketch_b.get_num_retained();
140
+ const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
141
+ auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
138
142
  u.update(sketch_a);
139
143
  u.update(sketch_b);
140
144
  return u.get_result(false);
@@ -311,7 +311,8 @@ public:
311
311
  // - as a result of a set operation
312
312
  // - by deserializing a previously serialized compact sketch
313
313
 
314
- compact_theta_sketch_alloc(const Base& other, bool ordered);
314
+ template<typename Other>
315
+ compact_theta_sketch_alloc(const Other& other, bool ordered);
315
316
  compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
317
  compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
317
318
  virtual ~compact_theta_sketch_alloc() = default;
@@ -387,10 +388,50 @@ public:
387
388
  update_theta_sketch_alloc build() const;
388
389
  };
389
390
 
391
+ // This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization.
392
+ // It does not take the ownership of the buffer.
393
+
394
+ template<typename Allocator = std::allocator<uint64_t>>
395
+ class wrapped_compact_theta_sketch_alloc {
396
+ public:
397
+ using const_iterator = const uint64_t*;
398
+
399
+ Allocator get_allocator() const;
400
+ bool is_empty() const;
401
+ bool is_ordered() const;
402
+ uint64_t get_theta64() const;
403
+ uint32_t get_num_retained() const;
404
+ uint16_t get_seed_hash() const;
405
+
406
+ const_iterator begin() const;
407
+ const_iterator end() const;
408
+
409
+ /**
410
+ * This method wraps a serialized compact sketch as an array of bytes.
411
+ * @param bytes pointer to the array of bytes
412
+ * @param size the size of the array
413
+ * @param seed the seed for the hash function that was used to create the sketch
414
+ * @return an instance of the sketch
415
+ */
416
+ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
417
+
418
+ private:
419
+ bool is_empty_;
420
+ bool is_ordered_;
421
+ uint16_t seed_hash_;
422
+ uint32_t num_entries_;
423
+ uint64_t theta_;
424
+ const uint64_t* entries_;
425
+
426
+ wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
427
+ uint64_t theta, const uint64_t* entries);
428
+ };
429
+
390
430
  // aliases with default allocator for convenience
391
431
  using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
432
  using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
433
  using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
434
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
394
435
 
395
436
  } /* namespace datasketches */
396
437
 
@@ -26,6 +26,7 @@
26
26
  #include "serde.hpp"
27
27
  #include "binomial_bounds.hpp"
28
28
  #include "theta_helpers.hpp"
29
+ #include "compact_theta_sketch_parser.hpp"
29
30
 
30
31
  namespace datasketches {
31
32
 
@@ -246,7 +247,8 @@ update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() cons
246
247
  // compact sketch
247
248
 
248
249
  template<typename A>
249
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
250
+ template<typename Other>
251
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
250
252
  is_empty_(other.is_empty()),
251
253
  is_ordered_(other.is_ordered() || ordered),
252
254
  seed_hash_(other.get_seed_hash()),
@@ -290,7 +292,7 @@ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
290
292
 
291
293
  template<typename A>
292
294
  uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
293
- return entries_.size();
295
+ return static_cast<uint32_t>(entries_.size());
294
296
  }
295
297
 
296
298
  template<typename A>
@@ -300,22 +302,22 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
300
302
 
301
303
  template<typename A>
302
304
  auto compact_theta_sketch_alloc<A>::begin() -> iterator {
303
- return iterator(entries_.data(), entries_.size(), 0);
305
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
304
306
  }
305
307
 
306
308
  template<typename A>
307
309
  auto compact_theta_sketch_alloc<A>::end() -> iterator {
308
- return iterator(nullptr, 0, entries_.size());
310
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
309
311
  }
310
312
 
311
313
  template<typename A>
312
314
  auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
313
- return const_iterator(entries_.data(), entries_.size(), 0);
315
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
314
316
  }
315
317
 
316
318
  template<typename A>
317
319
  auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
318
- return const_iterator(nullptr, 0, entries_.size());
320
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
319
321
  }
320
322
 
321
323
  template<typename A>
@@ -325,33 +327,33 @@ template<typename A>
325
327
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
326
328
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
327
329
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
328
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
330
+ write(os, preamble_longs);
329
331
  const uint8_t serial_version = SERIAL_VERSION;
330
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
332
+ write(os, serial_version);
331
333
  const uint8_t type = SKETCH_TYPE;
332
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
334
+ write(os, type);
333
335
  const uint16_t unused16 = 0;
334
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
336
+ write(os, unused16);
335
337
  const uint8_t flags_byte(
336
338
  (1 << flags::IS_COMPACT) |
337
339
  (1 << flags::IS_READ_ONLY) |
338
340
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
339
341
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
340
342
  );
341
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
343
+ write(os, flags_byte);
342
344
  const uint16_t seed_hash = get_seed_hash();
343
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
345
+ write(os, seed_hash);
344
346
  if (!this->is_empty()) {
345
347
  if (!is_single_item) {
346
- const uint32_t num_entries = entries_.size();
347
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
348
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
349
+ write(os, num_entries);
348
350
  const uint32_t unused32 = 0;
349
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
351
+ write(os, unused32);
350
352
  if (this->is_estimation_mode()) {
351
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
353
+ write(os, this->theta_);
352
354
  }
353
355
  }
354
- os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
356
+ write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
355
357
  }
356
358
  }
357
359
 
@@ -364,30 +366,28 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
364
366
  vector_bytes bytes(size, 0, entries_.get_allocator());
365
367
  uint8_t* ptr = bytes.data() + header_size_bytes;
366
368
 
367
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
369
+ ptr += copy_to_mem(preamble_longs, ptr);
368
370
  const uint8_t serial_version = SERIAL_VERSION;
369
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
371
+ ptr += copy_to_mem(serial_version, ptr);
370
372
  const uint8_t type = SKETCH_TYPE;
371
- ptr += copy_to_mem(&type, ptr, sizeof(type));
372
- const uint16_t unused16 = 0;
373
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
373
+ ptr += copy_to_mem(type, ptr);
374
+ ptr += sizeof(uint16_t); // unused
374
375
  const uint8_t flags_byte(
375
376
  (1 << flags::IS_COMPACT) |
376
377
  (1 << flags::IS_READ_ONLY) |
377
378
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
378
379
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
379
380
  );
380
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
381
+ ptr += copy_to_mem(flags_byte, ptr);
381
382
  const uint16_t seed_hash = get_seed_hash();
382
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
383
+ ptr += copy_to_mem(seed_hash, ptr);
383
384
  if (!this->is_empty()) {
384
385
  if (!is_single_item) {
385
- const uint32_t num_entries = entries_.size();
386
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
387
- const uint32_t unused32 = 0;
388
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
386
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
387
+ ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += sizeof(uint32_t);
389
389
  if (this->is_estimation_mode()) {
390
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
390
+ ptr += copy_to_mem(theta_, ptr);
391
391
  }
392
392
  }
393
393
  ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
@@ -397,18 +397,12 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
397
397
 
398
398
  template<typename A>
399
399
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
400
- uint8_t preamble_longs;
401
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
402
- uint8_t serial_version;
403
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
404
- uint8_t type;
405
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
406
- uint16_t unused16;
407
- is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
408
- uint8_t flags_byte;
409
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
410
- uint16_t seed_hash;
411
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
400
+ const auto preamble_longs = read<uint8_t>(is);
401
+ const auto serial_version = read<uint8_t>(is);
402
+ const auto type = read<uint8_t>(is);
403
+ read<uint16_t>(is); // unused
404
+ const auto flags_byte = read<uint8_t>(is);
405
+ const auto seed_hash = read<uint16_t>(is);
412
406
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
413
407
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
414
408
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -420,16 +414,15 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
420
414
  if (preamble_longs == 1) {
421
415
  num_entries = 1;
422
416
  } else {
423
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
424
- uint32_t unused32;
425
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
417
+ num_entries = read<uint32_t>(is);
418
+ read<uint32_t>(is); // unused
426
419
  if (preamble_longs > 2) {
427
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
420
+ theta = read<uint64_t>(is);
428
421
  }
429
422
  }
430
423
  }
431
424
  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
432
- if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
425
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
433
426
 
434
427
  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
435
428
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
@@ -442,17 +435,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
442
435
  const char* ptr = static_cast<const char*>(bytes);
443
436
  const char* base = ptr;
444
437
  uint8_t preamble_longs;
445
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
438
+ ptr += copy_from_mem(ptr, preamble_longs);
446
439
  uint8_t serial_version;
447
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
440
+ ptr += copy_from_mem(ptr, serial_version);
448
441
  uint8_t type;
449
- ptr += copy_from_mem(ptr, &type, sizeof(type));
450
- uint16_t unused16;
451
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
442
+ ptr += copy_from_mem(ptr, type);
443
+ ptr += sizeof(uint16_t); // unused
452
444
  uint8_t flags_byte;
453
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
445
+ ptr += copy_from_mem(ptr, flags_byte);
454
446
  uint16_t seed_hash;
455
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
447
+ ptr += copy_from_mem(ptr, seed_hash);
456
448
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
449
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
458
450
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -465,12 +457,11 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
465
457
  num_entries = 1;
466
458
  } else {
467
459
  ensure_minimum_memory(size, 8); // read the first prelong before this method
468
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
469
- uint32_t unused32;
470
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
460
+ ptr += copy_from_mem(ptr, num_entries);
461
+ ptr += sizeof(uint32_t); // unused
471
462
  if (preamble_longs > 2) {
472
463
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
473
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
464
+ ptr += copy_from_mem(ptr, theta);
474
465
  }
475
466
  }
476
467
  }
@@ -483,7 +474,65 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
483
474
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
484
475
  }
485
476
 
477
+ // wrapped compact sketch
478
+
479
+ template<typename A>
480
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
481
+ uint64_t theta, const uint64_t* entries):
482
+ is_empty_(is_empty),
483
+ is_ordered_(is_ordered),
484
+ seed_hash_(seed_hash),
485
+ num_entries_(num_entries),
486
+ theta_(theta),
487
+ entries_(entries)
488
+ {}
489
+
490
+ template<typename A>
491
+ const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
492
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
493
+ return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
494
+ }
495
+
496
+ template<typename A>
497
+ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
498
+ return A();
499
+ }
500
+
501
+ template<typename A>
502
+ bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
503
+ return is_empty_;
504
+ }
505
+
506
+ template<typename A>
507
+ bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
508
+ return is_ordered_;
509
+ }
510
+
511
+ template<typename A>
512
+ uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
513
+ return theta_;
514
+ }
515
+
516
+ template<typename A>
517
+ uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
518
+ return static_cast<uint32_t>(num_entries_);
519
+ }
520
+
521
+ template<typename A>
522
+ uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
523
+ return seed_hash_;
524
+ }
525
+
526
+ template<typename A>
527
+ auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
528
+ return entries_;
529
+ }
530
+
531
+ template<typename A>
532
+ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
+ return entries_ + num_entries_;
534
+ }
535
+
486
536
  } /* namespace datasketches */
487
537
 
488
538
  #endif
489
-