datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <vector>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include "count_min.hpp"
27
+ #include "common_defs.hpp"
28
+ #include "test_allocator.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ using count_min_sketch_test_alloc = count_min_sketch<uint64_t, test_allocator<uint64_t>>;
33
+ using alloc = test_allocator<uint64_t>;
34
+
35
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]"){ // stream round trip of an untouched sketch built with test_allocator
36
+ test_allocator_total_bytes = 0; // zero the allocator tracking counters before the scoped work below
37
+ test_allocator_net_allocations = 0;
38
+ { // inner scope: c and d are destroyed before the counter checks at the bottom
39
+ uint8_t n_hashes = 1 ;
40
+ uint32_t n_buckets = 5 ;
41
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
42
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
43
+ c.serialize(s);
44
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
45
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
46
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
47
+ REQUIRE(c.get_seed() == d.get_seed()) ;
48
+ uint64_t zero = 0;
49
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
50
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
51
+
52
+ // Check that every entry of the deserialized sketch is 0
53
+ for(auto di: d){
54
+ REQUIRE(di == 0) ;
55
+ }
56
+ }
57
+ REQUIRE(test_allocator_total_bytes == 0); // presumably: nothing obtained through test_allocator is still outstanding (counter semantics live in test_allocator.hpp)
58
+ REQUIRE(test_allocator_net_allocations == 0);
59
+ }
60
+
61
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]"){ // stream round trip of an updated sketch built with test_allocator
62
+ test_allocator_total_bytes = 0; // zero the allocator tracking counters before the scoped work below
63
+ test_allocator_net_allocations = 0;
64
+ { // inner scope: c and d are destroyed before the counter checks at the bottom
65
+ uint8_t n_hashes = 3 ;
66
+ uint32_t n_buckets = 1024 ;
67
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
68
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
69
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ; // items 0..9 with weight 10*i*i (item 0 gets weight 0)
70
+ c.serialize(s);
71
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
72
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
73
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
74
+ REQUIRE(c.get_seed() == d.get_seed()) ;
75
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
76
+ for(uint64_t i=0 ; i < 10; ++i){ // estimates for every inserted item must agree
77
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
78
+ }
79
+
80
+ auto c_it = c.begin() ; // walk both sketches in lockstep and compare entry by entry
81
+ auto d_it = d.begin() ;
82
+ while(c_it != c.end()){ // only c's end is tested; the bucket/hash count checks above make d the same length
83
+ REQUIRE(*c_it == *d_it) ;
84
+ ++c_it ;
85
+ ++d_it ;
86
+ }
87
+ }
88
+ REQUIRE(test_allocator_total_bytes == 0); // presumably: nothing obtained through test_allocator is still outstanding (counter semantics live in test_allocator.hpp)
89
+ REQUIRE(test_allocator_net_allocations == 0);
90
+ }
91
+
92
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]"){ // byte-buffer round trip of an untouched sketch built with test_allocator
93
+ test_allocator_total_bytes = 0; // zero the allocator tracking counters before the scoped work below
94
+ test_allocator_net_allocations = 0;
95
+ { // inner scope: c, bytes and d are destroyed before the counter checks at the bottom
96
+ uint8_t n_hashes = 3 ;
97
+ uint32_t n_buckets = 32 ;
98
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
99
+ auto bytes = c.serialize() ;
100
+
101
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument); // a seed different from the one used to build c must be rejected
102
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0)) ;
103
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
104
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
105
+ REQUIRE(c.get_seed() == d.get_seed()) ;
106
+ uint64_t zero = 0;
107
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
108
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
109
+
110
+ // Check that every entry of the deserialized sketch is 0
111
+ for(auto di: d){
112
+ REQUIRE(di == 0) ;
113
+ }
114
+ }
115
+ REQUIRE(test_allocator_total_bytes == 0); // presumably: nothing obtained through test_allocator is still outstanding (counter semantics live in test_allocator.hpp)
116
+ REQUIRE(test_allocator_net_allocations == 0);
117
+ }
118
+
119
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]"){ // byte-buffer round trip of an updated sketch built with test_allocator
120
+ test_allocator_total_bytes = 0; // zero the allocator tracking counters before the scoped work below
121
+ test_allocator_net_allocations = 0;
122
+ { // inner scope: c, bytes and d are destroyed before the counter checks at the bottom
123
+ uint8_t n_hashes = 5 ;
124
+ uint32_t n_buckets = 64 ;
125
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
126
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ; // items 0..9 with weight 10*i*i (item 0 gets weight 0)
127
+
128
+ auto bytes = c.serialize() ;
129
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument); // a seed different from the one used to build c must be rejected
130
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0)) ;
131
+
132
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
133
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
134
+ REQUIRE(c.get_seed() == d.get_seed()) ;
135
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
136
+
137
+ // Check that all entries are equal
138
+ auto c_it = c.begin() ;
139
+ auto d_it = d.begin() ;
140
+ while(c_it != c.end()){ // only c's end is tested; the bucket/hash count checks above make d the same length
141
+ REQUIRE(*c_it == *d_it) ;
142
+ ++c_it ;
143
+ ++d_it ;
144
+ }
145
+
146
+ // Check that the estimates agree
147
+ for(uint64_t i=0 ; i < 10; ++i){
148
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
149
+ }
150
+ }
151
+ REQUIRE(test_allocator_total_bytes == 0); // presumably: nothing obtained through test_allocator is still outstanding (counter semantics live in test_allocator.hpp)
152
+ REQUIRE(test_allocator_net_allocations == 0);
153
+ }
154
+
155
+ } // namespace datasketches
@@ -0,0 +1,306 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <vector>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include "count_min.hpp"
27
+ #include "common_defs.hpp"
28
+
29
+ namespace datasketches{
30
+
31
+ TEST_CASE("CM init - throws") { // constructor arguments (num_hashes, num_buckets) that must be rejected with std::invalid_argument
32
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(5, 1), std::invalid_argument); // presumably rejected for having too few buckets - confirm in count_min_impl.hpp
33
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(4, 268435456), std::invalid_argument); // 268435456 == 2^28; presumably 4 * 2^28 entries exceeds a size cap - confirm in count_min_impl.hpp
34
+ }
35
+
36
+ TEST_CASE("CM init"){ // a freshly constructed sketch reports its configuration, is empty, and holds only zeros
37
+ uint8_t n_hashes = 3 ;
38
+ uint32_t n_buckets = 5 ;
39
+ uint64_t seed = 1234567 ;
40
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed) ;
41
+ REQUIRE(c.get_num_hashes() == n_hashes) ; // getters echo the constructor arguments
42
+ REQUIRE(c.get_num_buckets() == n_buckets) ;
43
+ REQUIRE(c.get_seed() == seed) ;
44
+ REQUIRE(c.is_empty()) ;
45
+
46
+ for(auto x: c){ // iterating the sketch yields its entries; all must start at zero
47
+ REQUIRE(x == 0) ;
48
+ }
49
+
50
+ // Check the default seed is appropriately set.
51
+ count_min_sketch<uint64_t> c1(n_hashes, n_buckets) ; // seed argument omitted on purpose
52
+ REQUIRE(c1.get_seed() == DEFAULT_SEED) ;
53
+ }
54
+
55
+ TEST_CASE("CM parameter suggestions", "[error parameters]") { // pins suggest_num_buckets / suggest_num_hashes against fixed expected values
56
+
57
+ // Bucket suggestions
58
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ; // NOTE(review): REQUIRE_THROWS does not compare exception messages, so the string is presumably never checked; REQUIRE_THROWS_WITH would - confirm against Catch2 docs
59
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14) ;
60
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28) ;
61
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55) ;
62
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272) ;
63
+
64
+ // Check that the sketch's get_relative_error acts inversely to suggest_num_buckets
65
+ uint8_t n_hashes = 3 ;
66
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2) ; // bucket counts are the suggestions asserted above
67
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1) ;
68
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05) ;
69
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01) ;
70
+
71
+ // Hash suggestions
72
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ; // same caveat: only "throws something" is asserted here
73
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ;
74
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2) ; // 1 STDDEV
75
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4) ; // 2 STDDEV
76
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6) ; // 3 STDDEV
77
+ }
78
+
79
+ TEST_CASE("CM one update: uint64_t"){ // repeated updates of one string key; estimate and total weight must track the inserted weight
80
+ uint8_t n_hashes = 3 ;
81
+ uint32_t n_buckets = 5 ;
82
+ uint64_t seed = 9223372036854775807 ; // 2^63 - 1 (an earlier value, 1234567, was left here as a remark)
83
+ uint64_t inserted_weight = 0 ; // running sum of every weight passed to update()
84
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed) ;
85
+ std::string x = "x" ;
86
+
87
+ REQUIRE(c.is_empty()) ;
88
+ REQUIRE(c.get_estimate("x") == 0) ; // No items in sketch so estimates should be zero
89
+ c.update(x) ; // update without a weight; the assertion below shows it counts as 1
90
+ REQUIRE(!c.is_empty()) ;
91
+ REQUIRE(c.get_estimate(x) == 1) ;
92
+ inserted_weight += 1 ;
93
+
94
+ uint64_t w = 9 ;
95
+ inserted_weight += w ;
96
+ c.update(x, w) ;
97
+ REQUIRE(c.get_estimate(x) == inserted_weight) ;
98
+
99
+ // Doubles are converted to uint64_t
100
+ double w1 = 10.0 ;
101
+ inserted_weight += w1 ; // implicit double -> uint64_t conversion on assignment
102
+ c.update(x, w1) ;
103
+ REQUIRE(c.get_estimate(x) == inserted_weight) ;
104
+ REQUIRE(c.get_total_weight() == inserted_weight) ;
105
+ REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x)) ; // estimate must lie within [lower_bound, upper_bound]
106
+ REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x)) ;
107
+ }
108
+
109
+ TEST_CASE("CM frequency cancellation"){ // signed weights: one positive and one negative update on a single-hash sketch
110
+ count_min_sketch<int64_t> c(1, 5) ; // 1 hash, 5 buckets, signed weight type
111
+ c.update("x") ;
112
+ c.update("y", -1) ;
113
+ REQUIRE(c.get_total_weight() == 2) ; // expected total is 2, not 0: presumably total weight accumulates absolute values - confirm in count_min_impl.hpp
114
+ REQUIRE(c.get_estimate("x") == 1) ;
115
+ REQUIRE(c.get_estimate("y") == -1) ;
116
+ }
117
+
118
+
119
+ TEST_CASE("CM frequency estimates"){ // estimates for items with known frequencies must lie between the reported lower and upper bounds
120
+ int number_of_items = 10 ;
121
+ std::vector<uint64_t> data(number_of_items) ;
122
+ std::vector<uint64_t> frequencies(number_of_items) ;
123
+
124
+ // Populate data vector: item i gets frequency 2^(number_of_items - i)
125
+ for(int i=0; i < number_of_items; i++){
126
+ data[i] = i;
127
+ frequencies[i] = 1 << (number_of_items - i) ;
128
+ }
129
+
130
+ double relative_error = 0.1 ;
131
+ double confidence = 0.99 ;
132
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ; // 32-bit, as in the other tests: a bucket count above 255 must not be truncated
133
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ; // 8-bit, as in the other tests
134
+
135
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
136
+ for(int i=0 ; i < number_of_items ; i++) {
137
+ uint64_t value = data[i] ;
138
+ uint64_t freq = frequencies[i] ;
139
+ c.update(value, freq) ;
140
+ }
141
+
142
+ for(const auto i: data){ // every inserted item: lower_bound <= estimate <= upper_bound
143
+ uint64_t est = c.get_estimate(i) ;
144
+ uint64_t upp = c.get_upper_bound(i) ;
145
+ uint64_t low = c.get_lower_bound(i) ;
146
+ REQUIRE(est <= upp) ;
147
+ REQUIRE(est >= low) ;
148
+ }
149
+ }
150
+
151
+ TEST_CASE("CM merge - reject", "[reject cases]"){ // merge must throw for self-merge and for sketches with a different configuration
152
+ double relative_error = 0.25 ;
153
+ double confidence = 0.9 ;
154
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ;
155
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ;
156
+ count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287) ; // merge target, built with an explicit non-default seed
157
+
158
+
159
+ // Generate sketches that we cannot merge into ie they disagree on at least one of the config entries
160
+ count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets) ; // incorrect number of hashes
161
+ count_min_sketch<uint64_t> s2(n_hashes, n_buckets+1) ;// incorrect number of buckets
162
+ count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1) ;// incorrect seed
163
+ std::vector<count_min_sketch<uint64_t>> sketches = {s1, s2, s3};
164
+
165
+ // Fail cases
166
+ REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." ) ; // NOTE(review): REQUIRE_THROWS does not compare exception messages, so the string is presumably never checked - confirm against Catch2 docs
167
+ for(count_min_sketch<uint64_t> sk : sketches){ // sk is a by-value copy of each incompatible sketch
168
+ REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." ) ;
169
+ }
170
+ }
171
+
172
+ TEST_CASE("CM merge - pass", "[acceptable cases]"){ // merging two sketches with identical configuration
173
+ double relative_error = 0.25 ;
174
+ double confidence = 0.9 ;
175
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ;
176
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ;
177
+ count_min_sketch<uint64_t> s(n_hashes, n_buckets) ;
178
+ uint8_t s_hashes = s.get_num_hashes() ; // t copies s's configuration so the two are mergeable
179
+ uint32_t s_buckets = s.get_num_buckets() ;
180
+ count_min_sketch<uint64_t> t(s_hashes, s_buckets) ;
181
+
182
+ // Merge in an all-zeros sketch t. Should not change the total weight.
183
+ s.merge(t) ;
184
+ REQUIRE(s.get_total_weight() == 0 ) ;
185
+
186
+ std::vector<uint64_t> data = {2,3,5,7};
187
+ for(auto d: data){ // both sketches see every item once
188
+ s.update(d) ;
189
+ t.update(d) ;
190
+ }
191
+ s.merge(t);
192
+
193
+ REQUIRE(s.get_total_weight() == 2*t.get_total_weight()); // s now carries its own updates plus t's
194
+
195
+ // Estimator checks.
196
+ for (auto x : data) {
197
+ REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x)) ;
198
+ REQUIRE(s.get_estimate(x) <= 2); // True frequency x == 2 for all x.
199
+ }
200
+ }
201
+
202
+ TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]"){ // stream round trip of an untouched sketch, then dump it to a file
203
+ uint8_t n_hashes = 1 ;
204
+ uint32_t n_buckets = 5 ;
205
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
206
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
207
+ c.serialize(s);
208
+ count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED) ;
209
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
210
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
211
+ REQUIRE(c.get_seed() == d.get_seed()) ;
212
+ uint64_t zero = 0;
213
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
214
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
215
+
216
+ // Check that every entry of the deserialized sketch is 0
217
+ for(auto di: d){
218
+ REQUIRE(di == 0) ;
219
+ }
220
+ std::ofstream os("count_min-empty.bin", std::ios::binary); // binary mode: text mode would translate newline bytes on some platforms and corrupt the image
221
+ c.serialize(os);
222
+ }
223
+
224
+ TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]"){ // stream round trip of an updated sketch, then dump it to a file
225
+ uint8_t n_hashes = 3 ;
226
+ uint32_t n_buckets = 1024 ;
227
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
228
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
229
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ; // items 0..9 with weight 10*i*i (item 0 gets weight 0)
230
+ c.serialize(s);
231
+ count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED) ;
232
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
233
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
234
+ REQUIRE(c.get_seed() == d.get_seed()) ;
235
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
236
+ for(uint64_t i=0 ; i < 10; ++i){ // estimates for every inserted item must agree
237
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
238
+ }
239
+
240
+ auto c_it = c.begin() ; // walk both sketches in lockstep and compare entry by entry
241
+ auto d_it = d.begin() ;
242
+ while(c_it != c.end()){
243
+ REQUIRE(*c_it == *d_it) ;
244
+ ++c_it ;
245
+ ++d_it ;
246
+ }
247
+
248
+ std::ofstream os("count_min-non-empty.bin", std::ios::binary); // binary mode: text mode would translate newline bytes on some platforms and corrupt the image
249
+ c.serialize(os);
250
+ }
251
+
252
+ TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]"){ // byte-buffer round trip of an untouched sketch
253
+ uint8_t n_hashes = 3 ;
254
+ uint32_t n_buckets = 32 ;
255
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
256
+ auto bytes = c.serialize() ;
257
+
258
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument); // a seed different from the one used to build c must be rejected
259
+ auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED) ;
260
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
261
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
262
+ REQUIRE(c.get_seed() == d.get_seed()) ;
263
+ uint64_t zero = 0;
264
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
265
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
266
+
267
+ // Check that every entry of the deserialized sketch is 0
268
+ for(auto di: d){
269
+ REQUIRE(di == 0) ;
270
+ }
271
+ }
272
+
273
+
274
+ TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]"){ // byte-buffer round trip of an updated sketch
275
+ uint8_t n_hashes = 5 ;
276
+ uint32_t n_buckets = 64 ;
277
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
278
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ; // items 0..9 with weight 10*i*i (item 0 gets weight 0)
279
+
280
+ auto bytes = c.serialize() ;
281
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument); // a seed different from the one used to build c must be rejected
282
+ auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED) ;
283
+
284
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ; // configuration must survive the round trip
285
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
286
+ REQUIRE(c.get_seed() == d.get_seed()) ;
287
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
288
+
289
+ // Check that all entries are equal
290
+ auto c_it = c.begin() ;
291
+ auto d_it = d.begin() ;
292
+ while(c_it != c.end()){ // only c's end is tested; the bucket/hash count checks above make d the same length
293
+ REQUIRE(*c_it == *d_it) ;
294
+ ++c_it ;
295
+ ++d_it ;
296
+ }
297
+
298
+ // Check that the estimates agree
299
+ for(uint64_t i=0 ; i < 10; ++i){
300
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
301
+ }
302
+
303
+ }
304
+
305
+ } /* namespace datasketches */
306
+
@@ -30,7 +30,7 @@
30
30
  namespace datasketches {
31
31
 
32
32
  // ln 2.0
33
- static const double ICON_ERROT_CONSTANT = 0.693147180559945286;
33
+ static const double ICON_ERROR_CONSTANT = 0.693147180559945286;
34
34
 
35
35
  // 1, 2, 3, // kappa
36
36
  static const int16_t ICON_LOW_SIDE_DATA [33] = { // Empirically measured at N = 1000 * K.
@@ -102,7 +102,7 @@ double get_icon_confidence_lb(const cpc_sketch_alloc<A>& sketch, int kappa) {
102
102
  const long k = 1 << lg_k;
103
103
  if (lg_k < 4) throw std::logic_error("lgk < 4");
104
104
  if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
105
- double x = ICON_ERROT_CONSTANT;
105
+ double x = ICON_ERROR_CONSTANT;
106
106
  if (lg_k <= 14) x = ((double) ICON_HIGH_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
107
107
  const double rel = x / sqrt(k);
108
108
  const double eps = kappa * rel;
@@ -120,7 +120,7 @@ double get_icon_confidence_ub(const cpc_sketch_alloc<A>& sketch, int kappa) {
120
120
  const long k = 1 << lg_k;
121
121
  if (lg_k < 4) throw std::logic_error("lgk < 4");
122
122
  if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
123
- double x = ICON_ERROT_CONSTANT;
123
+ double x = ICON_ERROR_CONSTANT;
124
124
  if (lg_k <= 14) x = ((double) ICON_LOW_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
125
125
  const double rel = x / sqrt(k);
126
126
  const double eps = kappa * rel;
@@ -359,7 +359,7 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
359
359
 
360
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
361
361
  double byte_sums[8]; // allocating on the stack
362
- std::fill(byte_sums, &byte_sums[8], 0);
362
+ std::fill(byte_sums, byte_sums + 8, 0);
363
363
 
364
364
  for (size_t i = 0; i < k; i++) {
365
365
  uint64_t word = bit_matrix[i];
@@ -89,7 +89,13 @@ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, ui
89
89
 
90
90
  // This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.
91
91
 
92
- #define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
92
+ #define DATASKETCHES_CSA(h, l, a, b, c) \
93
+ { \
94
+ uint64_t u = a ^ b; \
95
+ uint64_t v = c; \
96
+ h = (a & b) | (u & v); \
97
+ l = u ^ v; \
98
+ }
93
99
 
94
100
  static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
95
101
  if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
@@ -98,15 +104,15 @@ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t leng
98
104
  fours = twos = ones = 0;
99
105
 
100
106
  for (uint32_t i = 0; i <= length - 8; i += 8) {
101
- CSA(twos_a, ones, ones, a[i+0], a[i+1]);
102
- CSA(twos_b, ones, ones, a[i+2], a[i+3]);
103
- CSA(fours_a, twos, twos, twos_a, twos_b);
107
+ DATASKETCHES_CSA(twos_a, ones, ones, a[i+0], a[i+1]);
108
+ DATASKETCHES_CSA(twos_b, ones, ones, a[i+2], a[i+3]);
109
+ DATASKETCHES_CSA(fours_a, twos, twos, twos_a, twos_b);
104
110
 
105
- CSA(twos_a, ones, ones, a[i+4], a[i+5]);
106
- CSA(twos_b, ones, ones, a[i+6], a[i+7]);
107
- CSA(fours_b, twos, twos, twos_a, twos_b);
111
+ DATASKETCHES_CSA(twos_a, ones, ones, a[i+4], a[i+5]);
112
+ DATASKETCHES_CSA(twos_b, ones, ones, a[i+6], a[i+7]);
113
+ DATASKETCHES_CSA(fours_b, twos, twos, twos_a, twos_b);
108
114
 
109
- CSA(eights, fours, fours, fours_a, fours_b);
115
+ DATASKETCHES_CSA(eights, fours, fours, fours_a, fours_b);
110
116
 
111
117
  total += warren_bit_count(eights);
112
118
  }
@@ -119,6 +125,8 @@ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t leng
119
125
  return total;
120
126
  }
121
127
 
128
+ #undef DATASKETCHES_CSA
129
+
122
130
  // Here are some timings made with quickTestMerge.c
123
131
  // for the "5 5" case:
124
132
 
@@ -0,0 +1,42 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(density INTERFACE) # INTERFACE library: no sources are compiled, it only carries usage requirements
19
+
20
+ add_library(${PROJECT_NAME}::DENSITY ALIAS density) # namespaced alias for consumers to link against
21
+
22
+ if (BUILD_TESTS) # the test subdirectory is only configured when BUILD_TESTS is set
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(density # ./include in the build tree, <prefix>/include once installed
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(density INTERFACE common) # consumers of density also get the common target's requirements
33
+ target_compile_features(density INTERFACE cxx_std_11) # consumers must compile as C++11 or newer
34
+
35
+ install(TARGETS density # adds the target to the ${PROJECT_NAME} export set
36
+ EXPORT ${PROJECT_NAME}
37
+ )
38
+
39
+ install(FILES # NOTE(review): headers land in <includedir>/DataSketches while INSTALL_INTERFACE above is <prefix>/include - presumably installed consumers include them with a DataSketches/ prefix; confirm
40
+ include/density_sketch.hpp
41
+ include/density_sketch_impl.hpp
42
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")