datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -79,7 +79,7 @@ class CMakeBuild(build_ext):
79
79
 
80
80
  setup(
81
81
  name='datasketches',
82
- version='3.0.0',
82
+ version='3.1.0',
83
83
  author='Apache DataSketches Developers',
84
84
  author_email='dev@datasketches.apache.org',
85
85
  description='A wrapper for the C++ Apache DataSketches library',
@@ -90,7 +90,7 @@ public:
90
90
  * @param f the inclusion probability used to produce the set with size <i>a</i>.
91
91
  * @return the estimate of A, computed as a / f
92
92
  */
93
- static double estimate_of_a(uint64_t a, uint64_t f) {
93
+ static double estimate_of_a(uint64_t a, double f) {
94
94
  check_inputs(a, 1, f);
95
95
  return a / f;
96
96
  }
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_HPP_
22
+
23
+ #include <stdint.h>
24
+
25
+ namespace datasketches {
26
+
27
+ template<bool dummy>
28
+ class compact_theta_sketch_parser {
29
+ public:
30
+ struct compact_theta_sketch_data {
31
+ bool is_empty;
32
+ bool is_ordered;
33
+ uint16_t seed_hash;
34
+ uint32_t num_entries;
35
+ uint64_t theta;
36
+ const uint64_t* entries;
37
+ };
38
+
39
+ static compact_theta_sketch_data parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error = false);
40
+
41
+ private:
42
+ // offsets are in units of sizeof(type), where the type is named by each constant's suffix (BYTE, U16, U32, U64)
43
+ static const size_t COMPACT_SKETCH_PRE_LONGS_BYTE = 0;
44
+ static const size_t COMPACT_SKETCH_SERIAL_VERSION_BYTE = 1;
45
+ static const size_t COMPACT_SKETCH_TYPE_BYTE = 2;
46
+ static const size_t COMPACT_SKETCH_FLAGS_BYTE = 5;
47
+ static const size_t COMPACT_SKETCH_SEED_HASH_U16 = 3;
48
+ static const size_t COMPACT_SKETCH_NUM_ENTRIES_U32 = 2;
49
+ static const size_t COMPACT_SKETCH_SINGLE_ENTRY_U64 = 1;
50
+ static const size_t COMPACT_SKETCH_ENTRIES_EXACT_U64 = 2;
51
+ static const size_t COMPACT_SKETCH_THETA_U64 = 2;
52
+ static const size_t COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 = 3;
53
+
54
+ static const uint8_t COMPACT_SKETCH_IS_EMPTY_FLAG = 2;
55
+ static const uint8_t COMPACT_SKETCH_IS_ORDERED_FLAG = 4;
56
+
57
+ static const uint8_t COMPACT_SKETCH_SERIAL_VERSION = 3;
58
+ static const uint8_t COMPACT_SKETCH_TYPE = 3;
59
+
60
+ static std::string hex_dump(const uint8_t* ptr, size_t size);
61
+ };
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "compact_theta_sketch_parser_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,70 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
21
+ #define COMPACT_THETA_SKETCH_PARSER_IMPL_HPP_
22
+
23
+ #include <iostream>
24
+ #include <iomanip>
25
+
26
+ namespace datasketches {
27
+
28
+ template<bool dummy>
29
+ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
+ if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
+ checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
33
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
34
+ uint64_t theta = theta_constants::MAX_THETA;
35
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
36
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
37
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
38
+ return {true, true, seed_hash, 0, theta, nullptr};
39
+ }
40
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
41
+ if (has_theta) {
42
+ if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
43
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
44
+ }
45
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
46
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
47
+ }
48
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
49
+ const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
50
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
51
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
52
+ if (size < expected_size_bytes) {
53
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
54
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
55
+ }
56
+ const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
57
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
58
+ }
59
+
60
+ template<bool dummy>
61
+ std::string compact_theta_sketch_parser<dummy>::hex_dump(const uint8_t* ptr, size_t size) {
62
+ std::stringstream s;
63
+ s << std::hex << std::setfill('0') << std::uppercase;
64
+ for (size_t i = 0; i < size; ++i) s << std::setw(2) << (ptr[i] & 0xff);
65
+ return s.str();
66
+ }
67
+
68
+ } /* namespace datasketches */
69
+
70
+ #endif
@@ -33,14 +33,19 @@ public:
33
33
  using Sketch = theta_sketch_alloc<Allocator>;
34
34
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
35
35
 
36
- struct pass_through_policy {
37
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
36
+ struct nop_policy {
37
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
38
  unused(incoming_entry);
39
- return internal_entry;
39
+ unused(internal_entry);
40
40
  }
41
41
  };
42
- using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
42
+ using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
43
43
 
44
+ /*
45
+ * Constructor
46
+ * @param seed for the hash function that was used to create the sketch
47
+ * @param allocator to use for allocating and deallocating memory
48
+ */
44
49
  explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
45
50
 
46
51
  /**
@@ -24,7 +24,7 @@ namespace datasketches {
24
24
 
25
25
  template<typename A>
26
26
  theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
27
- state_(seed, pass_through_policy(), allocator)
27
+ state_(seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -46,20 +46,21 @@ public:
46
46
  *
47
47
  * @param sketch_a given sketch A
48
48
  * @param sketch_b given sketch B
49
+ * @param seed for the hash function that was used to create the sketches
49
50
  * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
50
51
  * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
51
52
  */
52
53
  template<typename SketchA, typename SketchB>
53
- static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b) {
54
+ static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
54
55
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
55
56
  if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
56
57
  if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
57
58
 
58
- auto union_ab = compute_union(sketch_a, sketch_b);
59
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
59
60
  if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
60
61
 
61
62
  // intersection
62
- Intersection i;
63
+ Intersection i(seed);
63
64
  i.update(sketch_a);
64
65
  i.update(sketch_b);
65
66
  i.update(union_ab); // ensures that intersection is a subset of the union
@@ -76,15 +77,16 @@ public:
76
77
  * Returns true if the two given sketches are equivalent.
77
78
  * @param sketch_a the given sketch A
78
79
  * @param sketch_b the given sketch B
80
+ * @param seed for the hash function that was used to create the sketches
79
81
  * @return true if the two given sketches are exactly equal
80
82
  */
81
83
  template<typename SketchA, typename SketchB>
82
- static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b) {
84
+ static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
83
85
  if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
84
86
  if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
85
87
  if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
86
88
 
87
- auto union_ab = compute_union(sketch_a, sketch_b);
89
+ auto union_ab = compute_union(sketch_a, sketch_b, seed);
88
90
  if (identical_sets(sketch_a, sketch_b, union_ab)) return true;
89
91
  return false;
90
92
  }
@@ -99,12 +101,13 @@ public:
99
101
  * @param actual the sketch to be tested
100
102
  * @param expected the reference sketch that is considered to be correct
101
103
  * @param threshold a real value between zero and one
104
+ * @param seed for the hash function that was used to create the sketches
102
105
  * @return true if the similarity of the two sketches is greater than the given threshold
103
106
  * with at least 97.7% confidence
104
107
  */
105
108
  template<typename SketchA, typename SketchB>
106
- static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
107
- auto jc = jaccard(actual, expected);
109
+ static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
110
+ auto jc = jaccard(actual, expected, seed);
108
111
  return jc[0] >= threshold;
109
112
  }
110
113
 
@@ -118,23 +121,24 @@ public:
118
121
  * @param actual the sketch to be tested
119
122
  * @param expected the reference sketch that is considered to be correct
120
123
  * @param threshold a real value between zero and one
124
+ * @param seed for the hash function that was used to create the sketches
121
125
  * @return true if the dissimilarity of the two sketches is greater than the given threshold
122
126
  * with at least 97.7% confidence
123
127
  */
124
128
  template<typename SketchA, typename SketchB>
125
- static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold) {
126
- auto jc = jaccard(actual, expected);
129
+ static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
130
+ auto jc = jaccard(actual, expected, seed);
127
131
  return jc[2] <= threshold;
128
132
  }
129
133
 
130
134
  private:
131
135
 
132
136
  template<typename SketchA, typename SketchB>
133
- static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b) {
134
- const unsigned count_a = sketch_a.get_num_retained();
135
- const unsigned count_b = sketch_b.get_num_retained();
136
- const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
137
- auto u = typename Union::builder().set_lg_k(lg_k).build();
137
+ static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
138
+ const auto count_a = sketch_a.get_num_retained();
139
+ const auto count_b = sketch_b.get_num_retained();
140
+ const uint8_t lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
141
+ auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
138
142
  u.update(sketch_a);
139
143
  u.update(sketch_b);
140
144
  return u.get_result(false);
@@ -311,7 +311,8 @@ public:
311
311
  // - as a result of a set operation
312
312
  // - by deserializing a previously serialized compact sketch
313
313
 
314
- compact_theta_sketch_alloc(const Base& other, bool ordered);
314
+ template<typename Other>
315
+ compact_theta_sketch_alloc(const Other& other, bool ordered);
315
316
  compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
316
317
  compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
317
318
  virtual ~compact_theta_sketch_alloc() = default;
@@ -387,10 +388,50 @@ public:
387
388
  update_theta_sketch_alloc build() const;
388
389
  };
389
390
 
391
+ // This class wraps a buffer containing a serialized compact sketch so that it can be used in a set operation directly, avoiding some of the cost of deserialization.
392
+ // It does not take ownership of the buffer.
393
+
394
+ template<typename Allocator = std::allocator<uint64_t>>
395
+ class wrapped_compact_theta_sketch_alloc {
396
+ public:
397
+ using const_iterator = const uint64_t*;
398
+
399
+ Allocator get_allocator() const;
400
+ bool is_empty() const;
401
+ bool is_ordered() const;
402
+ uint64_t get_theta64() const;
403
+ uint32_t get_num_retained() const;
404
+ uint16_t get_seed_hash() const;
405
+
406
+ const_iterator begin() const;
407
+ const_iterator end() const;
408
+
409
+ /**
410
+ * This method wraps a serialized compact sketch provided as an array of bytes.
411
+ * @param bytes pointer to the array of bytes
412
+ * @param size the size of the array
413
+ * @param seed the seed for the hash function that was used to create the sketch
414
+ * @return an instance of the sketch
415
+ */
416
+ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
417
+
418
+ private:
419
+ bool is_empty_;
420
+ bool is_ordered_;
421
+ uint16_t seed_hash_;
422
+ uint32_t num_entries_;
423
+ uint64_t theta_;
424
+ const uint64_t* entries_;
425
+
426
+ wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
427
+ uint64_t theta, const uint64_t* entries);
428
+ };
429
+
390
430
  // aliases with default allocator for convenience
391
431
  using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
392
432
  using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
393
433
  using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
434
+ using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>;
394
435
 
395
436
  } /* namespace datasketches */
396
437
 
@@ -26,6 +26,7 @@
26
26
  #include "serde.hpp"
27
27
  #include "binomial_bounds.hpp"
28
28
  #include "theta_helpers.hpp"
29
+ #include "compact_theta_sketch_parser.hpp"
29
30
 
30
31
  namespace datasketches {
31
32
 
@@ -246,7 +247,8 @@ update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() cons
246
247
  // compact sketch
247
248
 
248
249
  template<typename A>
249
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
250
+ template<typename Other>
251
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
250
252
  is_empty_(other.is_empty()),
251
253
  is_ordered_(other.is_ordered() || ordered),
252
254
  seed_hash_(other.get_seed_hash()),
@@ -290,7 +292,7 @@ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
290
292
 
291
293
  template<typename A>
292
294
  uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
293
- return entries_.size();
295
+ return static_cast<uint32_t>(entries_.size());
294
296
  }
295
297
 
296
298
  template<typename A>
@@ -300,22 +302,22 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
300
302
 
301
303
  template<typename A>
302
304
  auto compact_theta_sketch_alloc<A>::begin() -> iterator {
303
- return iterator(entries_.data(), entries_.size(), 0);
305
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
304
306
  }
305
307
 
306
308
  template<typename A>
307
309
  auto compact_theta_sketch_alloc<A>::end() -> iterator {
308
- return iterator(nullptr, 0, entries_.size());
310
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
309
311
  }
310
312
 
311
313
  template<typename A>
312
314
  auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
313
- return const_iterator(entries_.data(), entries_.size(), 0);
315
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
314
316
  }
315
317
 
316
318
  template<typename A>
317
319
  auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
318
- return const_iterator(nullptr, 0, entries_.size());
320
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
319
321
  }
320
322
 
321
323
  template<typename A>
@@ -325,33 +327,33 @@ template<typename A>
325
327
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
326
328
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
327
329
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
328
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
330
+ write(os, preamble_longs);
329
331
  const uint8_t serial_version = SERIAL_VERSION;
330
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
332
+ write(os, serial_version);
331
333
  const uint8_t type = SKETCH_TYPE;
332
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
334
+ write(os, type);
333
335
  const uint16_t unused16 = 0;
334
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
336
+ write(os, unused16);
335
337
  const uint8_t flags_byte(
336
338
  (1 << flags::IS_COMPACT) |
337
339
  (1 << flags::IS_READ_ONLY) |
338
340
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
339
341
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
340
342
  );
341
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
343
+ write(os, flags_byte);
342
344
  const uint16_t seed_hash = get_seed_hash();
343
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
345
+ write(os, seed_hash);
344
346
  if (!this->is_empty()) {
345
347
  if (!is_single_item) {
346
- const uint32_t num_entries = entries_.size();
347
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
348
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
349
+ write(os, num_entries);
348
350
  const uint32_t unused32 = 0;
349
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
351
+ write(os, unused32);
350
352
  if (this->is_estimation_mode()) {
351
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
353
+ write(os, this->theta_);
352
354
  }
353
355
  }
354
- os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
356
+ write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
355
357
  }
356
358
  }
357
359
 
@@ -364,30 +366,28 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
364
366
  vector_bytes bytes(size, 0, entries_.get_allocator());
365
367
  uint8_t* ptr = bytes.data() + header_size_bytes;
366
368
 
367
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
369
+ ptr += copy_to_mem(preamble_longs, ptr);
368
370
  const uint8_t serial_version = SERIAL_VERSION;
369
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
371
+ ptr += copy_to_mem(serial_version, ptr);
370
372
  const uint8_t type = SKETCH_TYPE;
371
- ptr += copy_to_mem(&type, ptr, sizeof(type));
372
- const uint16_t unused16 = 0;
373
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
373
+ ptr += copy_to_mem(type, ptr);
374
+ ptr += sizeof(uint16_t); // unused
374
375
  const uint8_t flags_byte(
375
376
  (1 << flags::IS_COMPACT) |
376
377
  (1 << flags::IS_READ_ONLY) |
377
378
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
378
379
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
379
380
  );
380
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
381
+ ptr += copy_to_mem(flags_byte, ptr);
381
382
  const uint16_t seed_hash = get_seed_hash();
382
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
383
+ ptr += copy_to_mem(seed_hash, ptr);
383
384
  if (!this->is_empty()) {
384
385
  if (!is_single_item) {
385
- const uint32_t num_entries = entries_.size();
386
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
387
- const uint32_t unused32 = 0;
388
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
386
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
387
+ ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += sizeof(uint32_t);
389
389
  if (this->is_estimation_mode()) {
390
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
390
+ ptr += copy_to_mem(theta_, ptr);
391
391
  }
392
392
  }
393
393
  ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
@@ -397,18 +397,12 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
397
397
 
398
398
  template<typename A>
399
399
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
400
- uint8_t preamble_longs;
401
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
402
- uint8_t serial_version;
403
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
404
- uint8_t type;
405
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
406
- uint16_t unused16;
407
- is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
408
- uint8_t flags_byte;
409
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
410
- uint16_t seed_hash;
411
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
400
+ const auto preamble_longs = read<uint8_t>(is);
401
+ const auto serial_version = read<uint8_t>(is);
402
+ const auto type = read<uint8_t>(is);
403
+ read<uint16_t>(is); // unused
404
+ const auto flags_byte = read<uint8_t>(is);
405
+ const auto seed_hash = read<uint16_t>(is);
412
406
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
413
407
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
414
408
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -420,16 +414,15 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
420
414
  if (preamble_longs == 1) {
421
415
  num_entries = 1;
422
416
  } else {
423
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
424
- uint32_t unused32;
425
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
417
+ num_entries = read<uint32_t>(is);
418
+ read<uint32_t>(is); // unused
426
419
  if (preamble_longs > 2) {
427
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
420
+ theta = read<uint64_t>(is);
428
421
  }
429
422
  }
430
423
  }
431
424
  std::vector<uint64_t, A> entries(num_entries, 0, allocator);
432
- if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
425
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
433
426
 
434
427
  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
435
428
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
@@ -442,17 +435,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
442
435
  const char* ptr = static_cast<const char*>(bytes);
443
436
  const char* base = ptr;
444
437
  uint8_t preamble_longs;
445
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
438
+ ptr += copy_from_mem(ptr, preamble_longs);
446
439
  uint8_t serial_version;
447
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
440
+ ptr += copy_from_mem(ptr, serial_version);
448
441
  uint8_t type;
449
- ptr += copy_from_mem(ptr, &type, sizeof(type));
450
- uint16_t unused16;
451
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
442
+ ptr += copy_from_mem(ptr, type);
443
+ ptr += sizeof(uint16_t); // unused
452
444
  uint8_t flags_byte;
453
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
445
+ ptr += copy_from_mem(ptr, flags_byte);
454
446
  uint16_t seed_hash;
455
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
447
+ ptr += copy_from_mem(ptr, seed_hash);
456
448
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
449
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
458
450
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -465,12 +457,11 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
465
457
  num_entries = 1;
466
458
  } else {
467
459
  ensure_minimum_memory(size, 8); // read the first prelong before this method
468
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
469
- uint32_t unused32;
470
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
460
+ ptr += copy_from_mem(ptr, num_entries);
461
+ ptr += sizeof(uint32_t); // unused
471
462
  if (preamble_longs > 2) {
472
463
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
473
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
464
+ ptr += copy_from_mem(ptr, theta);
474
465
  }
475
466
  }
476
467
  }
@@ -483,7 +474,65 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
483
474
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
484
475
  }
485
476
 
477
+ // wrapped compact sketch
478
+
479
+ template<typename A>
480
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
481
+ uint64_t theta, const uint64_t* entries):
482
+ is_empty_(is_empty),
483
+ is_ordered_(is_ordered),
484
+ seed_hash_(seed_hash),
485
+ num_entries_(num_entries),
486
+ theta_(theta),
487
+ entries_(entries)
488
+ {}
489
+
490
+ template<typename A>
491
+ const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
492
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
493
+ return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
494
+ }
495
+
496
+ template<typename A>
497
+ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
498
+ return A();
499
+ }
500
+
501
+ template<typename A>
502
+ bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
503
+ return is_empty_;
504
+ }
505
+
506
+ template<typename A>
507
+ bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
508
+ return is_ordered_;
509
+ }
510
+
511
+ template<typename A>
512
+ uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
513
+ return theta_;
514
+ }
515
+
516
+ template<typename A>
517
+ uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
518
+ return static_cast<uint32_t>(num_entries_);
519
+ }
520
+
521
+ template<typename A>
522
+ uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
523
+ return seed_hash_;
524
+ }
525
+
526
+ template<typename A>
527
+ auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
528
+ return entries_;
529
+ }
530
+
531
+ template<typename A>
532
+ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
+ return entries_ + num_entries_;
534
+ }
535
+
486
536
  } /* namespace datasketches */
487
537
 
488
538
  #endif
489
-