datasketches 0.3.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (114) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  7. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  8. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  9. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  10. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  11. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  13. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  14. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  15. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  16. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  74. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  75. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  76. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  77. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  78. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  79. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  80. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  81. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  82. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  83. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  84. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  86. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  87. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  88. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  89. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  90. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  99. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  101. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  102. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  105. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  107. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  108. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  109. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  110. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  111. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  112. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  113. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  114. metadata +31 -3
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <vector>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include "count_min.hpp"
27
+ #include "common_defs.hpp"
28
+ #include "test_allocator.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ using count_min_sketch_test_alloc = count_min_sketch<uint64_t, test_allocator<uint64_t>>;
33
+ using alloc = test_allocator<uint64_t>;
34
+
35
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]"){
36
+ test_allocator_total_bytes = 0;
37
+ test_allocator_net_allocations = 0;
38
+ {
39
+ uint8_t n_hashes = 1 ;
40
+ uint32_t n_buckets = 5 ;
41
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
42
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
43
+ c.serialize(s);
44
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
45
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
46
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
47
+ REQUIRE(c.get_seed() == d.get_seed()) ;
48
+ uint64_t zero = 0;
49
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
50
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
51
+
52
+ // Check that all entries are equal and 0
53
+ for(auto di: d){
54
+ REQUIRE(di == 0) ;
55
+ }
56
+ }
57
+ REQUIRE(test_allocator_total_bytes == 0);
58
+ REQUIRE(test_allocator_net_allocations == 0);
59
+ }
60
+
61
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]"){
62
+ test_allocator_total_bytes = 0;
63
+ test_allocator_net_allocations = 0;
64
+ {
65
+ uint8_t n_hashes = 3 ;
66
+ uint32_t n_buckets = 1024 ;
67
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
68
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
69
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ;
70
+ c.serialize(s);
71
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
72
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
73
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
74
+ REQUIRE(c.get_seed() == d.get_seed()) ;
75
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
76
+ for(uint64_t i=0 ; i < 10; ++i){
77
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
78
+ }
79
+
80
+ auto c_it = c.begin() ;
81
+ auto d_it = d.begin() ;
82
+ while(c_it != c.end()){
83
+ REQUIRE(*c_it == *d_it) ;
84
+ ++c_it ;
85
+ ++d_it ;
86
+ }
87
+ }
88
+ REQUIRE(test_allocator_total_bytes == 0);
89
+ REQUIRE(test_allocator_net_allocations == 0);
90
+ }
91
+
92
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]"){
93
+ test_allocator_total_bytes = 0;
94
+ test_allocator_net_allocations = 0;
95
+ {
96
+ uint8_t n_hashes = 3 ;
97
+ uint32_t n_buckets = 32 ;
98
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
99
+ auto bytes = c.serialize() ;
100
+
101
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
102
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0)) ;
103
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
104
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
105
+ REQUIRE(c.get_seed() == d.get_seed()) ;
106
+ uint64_t zero = 0;
107
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
108
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
109
+
110
+ // Check that all entries are equal and 0
111
+ for(auto di: d){
112
+ REQUIRE(di == 0) ;
113
+ }
114
+ }
115
+ REQUIRE(test_allocator_total_bytes == 0);
116
+ REQUIRE(test_allocator_net_allocations == 0);
117
+ }
118
+
119
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]"){
120
+ test_allocator_total_bytes = 0;
121
+ test_allocator_net_allocations = 0;
122
+ {
123
+ uint8_t n_hashes = 5 ;
124
+ uint32_t n_buckets = 64 ;
125
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0)) ;
126
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ;
127
+
128
+ auto bytes = c.serialize() ;
129
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
130
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0)) ;
131
+
132
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
133
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
134
+ REQUIRE(c.get_seed() == d.get_seed()) ;
135
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
136
+
137
+ // Check that all entries are equal
138
+ auto c_it = c.begin() ;
139
+ auto d_it = d.begin() ;
140
+ while(c_it != c.end()){
141
+ REQUIRE(*c_it == *d_it) ;
142
+ ++c_it ;
143
+ ++d_it ;
144
+ }
145
+
146
+ // Check that the estimates agree
147
+ for(uint64_t i=0 ; i < 10; ++i){
148
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
149
+ }
150
+ }
151
+ REQUIRE(test_allocator_total_bytes == 0);
152
+ REQUIRE(test_allocator_net_allocations == 0);
153
+ }
154
+
155
+ } // namespace datasketches
@@ -0,0 +1,306 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <vector>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include "count_min.hpp"
27
+ #include "common_defs.hpp"
28
+
29
+ namespace datasketches{
30
+
31
+ TEST_CASE("CM init - throws") {
32
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(5, 1), std::invalid_argument);
33
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(4, 268435456), std::invalid_argument);
34
+ }
35
+
36
+ TEST_CASE("CM init"){
37
+ uint8_t n_hashes = 3 ;
38
+ uint32_t n_buckets = 5 ;
39
+ uint64_t seed = 1234567 ;
40
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed) ;
41
+ REQUIRE(c.get_num_hashes() == n_hashes) ;
42
+ REQUIRE(c.get_num_buckets() == n_buckets) ;
43
+ REQUIRE(c.get_seed() == seed) ;
44
+ REQUIRE(c.is_empty()) ;
45
+
46
+ for(auto x: c){
47
+ REQUIRE(x == 0) ;
48
+ }
49
+
50
+ // Check the default seed is appropriately set.
51
+ count_min_sketch<uint64_t> c1(n_hashes, n_buckets) ;
52
+ REQUIRE(c1.get_seed() == DEFAULT_SEED) ;
53
+ }
54
+
55
+ TEST_CASE("CM parameter suggestions", "[error parameters]") {
56
+
57
+ // Bucket suggestions
58
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ;
59
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14) ;
60
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28) ;
61
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55) ;
62
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272) ;
63
+
64
+ // Check that the sketch's get_relative_error acts inversely to suggest_num_buckets
65
+ uint8_t n_hashes = 3 ;
66
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2) ;
67
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1) ;
68
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05) ;
69
+ REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01) ;
70
+
71
+ // Hash suggestions
72
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ;
73
+ REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." ) ;
74
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2) ; // 1 STDDEV
75
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4) ; // 2 STDDEV
76
+ REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6) ; // 3 STDDEV
77
+ }
78
+
79
+ TEST_CASE("CM one update: uint64_t"){
80
+ uint8_t n_hashes = 3 ;
81
+ uint32_t n_buckets = 5 ;
82
+ uint64_t seed = 9223372036854775807 ; //1234567 ;
83
+ uint64_t inserted_weight = 0 ;
84
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed) ;
85
+ std::string x = "x" ;
86
+
87
+ REQUIRE(c.is_empty()) ;
88
+ REQUIRE(c.get_estimate("x") == 0) ; // No items in sketch so estimates should be zero
89
+ c.update(x) ;
90
+ REQUIRE(!c.is_empty()) ;
91
+ REQUIRE(c.get_estimate(x) == 1) ;
92
+ inserted_weight += 1 ;
93
+
94
+ uint64_t w = 9 ;
95
+ inserted_weight += w ;
96
+ c.update(x, w) ;
97
+ REQUIRE(c.get_estimate(x) == inserted_weight) ;
98
+
99
+ // Doubles are converted to uint64_t
100
+ double w1 = 10.0 ;
101
+ inserted_weight += w1 ;
102
+ c.update(x, w1) ;
103
+ REQUIRE(c.get_estimate(x) == inserted_weight) ;
104
+ REQUIRE(c.get_total_weight() == inserted_weight) ;
105
+ REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x)) ;
106
+ REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x)) ;
107
+ }
108
+
109
+ TEST_CASE("CM frequency cancellation"){
110
+ count_min_sketch<int64_t> c(1, 5) ;
111
+ c.update("x") ;
112
+ c.update("y", -1) ;
113
+ REQUIRE(c.get_total_weight() == 2) ;
114
+ REQUIRE(c.get_estimate("x") == 1) ;
115
+ REQUIRE(c.get_estimate("y") == -1) ;
116
+ }
117
+
118
+
119
+ TEST_CASE("CM frequency estimates"){ // every estimate must lie within its [lower_bound, upper_bound] interval
120
+ int number_of_items = 10 ;
121
+ std::vector<uint64_t> data(number_of_items) ;
122
+ std::vector<uint64_t> frequencies(number_of_items) ;
123
+
124
+ // Populate data vector
125
+ for(int i=0; i < number_of_items; i++){
126
+ data[i] = i;
127
+ frequencies[i] = 1 << (number_of_items - i) ; // halving weights: 2^10 for item 0 down to 2^1 for item 9
128
+ }
129
+
130
+ double relative_error = 0.1 ;
131
+ double confidence = 0.99 ;
132
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ; // 32-bit like the other tests; uint8_t would truncate suggestions above 255
133
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ; // 8-bit like the other tests
134
+
135
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
136
+ for(int i=0 ; i < number_of_items ; i++) {
137
+ uint64_t value = data[i] ;
138
+ uint64_t freq = frequencies[i] ;
139
+ c.update(value, freq) ;
140
+ }
141
+
142
+ for(const auto i: data){
143
+ uint64_t est = c.get_estimate(i) ;
144
+ uint64_t upp = c.get_upper_bound(i) ;
145
+ uint64_t low = c.get_lower_bound(i) ;
146
+ REQUIRE(est <= upp) ;
147
+ REQUIRE(est >= low) ;
148
+ }
149
+ }
150
+
151
+ TEST_CASE("CM merge - reject", "[reject cases]"){
152
+ double relative_error = 0.25 ;
153
+ double confidence = 0.9 ;
154
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ;
155
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ;
156
+ count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287) ;
157
+
158
+
159
+ // Generate sketches that we cannot merge into ie they disagree on at least one of the config entries
160
+ count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets) ; // incorrect number of hashes
161
+ count_min_sketch<uint64_t> s2(n_hashes, n_buckets+1) ;// incorrect number of buckets
162
+ count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1) ;// incorrect seed
163
+ std::vector<count_min_sketch<uint64_t>> sketches = {s1, s2, s3};
164
+
165
+ // Fail cases
166
+ REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." ) ;
167
+ for(count_min_sketch<uint64_t> sk : sketches){
168
+ REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." ) ;
169
+ }
170
+ }
171
+
172
+ TEST_CASE("CM merge - pass", "[acceptable cases]"){
173
+ double relative_error = 0.25 ;
174
+ double confidence = 0.9 ;
175
+ uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error) ;
176
+ uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence) ;
177
+ count_min_sketch<uint64_t> s(n_hashes, n_buckets) ;
178
+ uint8_t s_hashes = s.get_num_hashes() ;
179
+ uint32_t s_buckets = s.get_num_buckets() ;
180
+ count_min_sketch<uint64_t> t(s_hashes, s_buckets) ;
181
+
182
+ // Merge in an all-zeros sketch t. Should not change the total weight.
183
+ s.merge(t) ;
184
+ REQUIRE(s.get_total_weight() == 0 ) ;
185
+
186
+ std::vector<uint64_t> data = {2,3,5,7};
187
+ for(auto d: data){
188
+ s.update(d) ;
189
+ t.update(d) ;
190
+ }
191
+ s.merge(t);
192
+
193
+ REQUIRE(s.get_total_weight() == 2*t.get_total_weight());
194
+
195
+ // Estimator checks.
196
+ for (auto x : data) {
197
+ REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x)) ;
198
+ REQUIRE(s.get_estimate(x) <= 2); // True frequency x == 2 for all x.
199
+ }
200
+ }
201
+
202
+ TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]"){
203
+ uint8_t n_hashes = 1 ;
204
+ uint32_t n_buckets = 5 ;
205
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
206
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
207
+ c.serialize(s);
208
+ count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED) ;
209
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
210
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
211
+ REQUIRE(c.get_seed() == d.get_seed()) ;
212
+ uint64_t zero = 0;
213
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
214
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
215
+
216
+ // Check that all entries are equal and 0
217
+ for(auto di: d){
218
+ REQUIRE(di == 0) ;
219
+ }
220
+ std::ofstream os("count_min-empty.bin");
221
+ c.serialize(os);
222
+ }
223
+
224
+ TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]"){
225
+ uint8_t n_hashes = 3 ;
226
+ uint32_t n_buckets = 1024 ;
227
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
228
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
229
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ;
230
+ c.serialize(s);
231
+ count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED) ;
232
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
233
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
234
+ REQUIRE(c.get_seed() == d.get_seed()) ;
235
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
236
+ for(uint64_t i=0 ; i < 10; ++i){
237
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
238
+ }
239
+
240
+ auto c_it = c.begin() ;
241
+ auto d_it = d.begin() ;
242
+ while(c_it != c.end()){
243
+ REQUIRE(*c_it == *d_it) ;
244
+ ++c_it ;
245
+ ++d_it ;
246
+ }
247
+
248
+ std::ofstream os("count_min-non-empty.bin");
249
+ c.serialize(os);
250
+ }
251
+
252
+ TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]"){
253
+ uint8_t n_hashes = 3 ;
254
+ uint32_t n_buckets = 32 ;
255
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
256
+ auto bytes = c.serialize() ;
257
+
258
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
259
+ auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED) ;
260
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
261
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
262
+ REQUIRE(c.get_seed() == d.get_seed()) ;
263
+ uint64_t zero = 0;
264
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero)) ;
265
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
266
+
267
+ // Check that all entries are equal and 0
268
+ for(auto di: d){
269
+ REQUIRE(di == 0) ;
270
+ }
271
+ }
272
+
273
+
274
+ TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]"){
275
+ uint8_t n_hashes = 5 ;
276
+ uint32_t n_buckets = 64 ;
277
+ count_min_sketch<uint64_t> c(n_hashes, n_buckets) ;
278
+ for(uint64_t i=0 ; i < 10; ++i) c.update(i,10*i*i) ;
279
+
280
+ auto bytes = c.serialize() ;
281
+ REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
282
+ auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED) ;
283
+
284
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes()) ;
285
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets()) ;
286
+ REQUIRE(c.get_seed() == d.get_seed()) ;
287
+ REQUIRE(c.get_total_weight() == d.get_total_weight()) ;
288
+
289
+ // Check that all entries are equal
290
+ auto c_it = c.begin() ;
291
+ auto d_it = d.begin() ;
292
+ while(c_it != c.end()){
293
+ REQUIRE(*c_it == *d_it) ;
294
+ ++c_it ;
295
+ ++d_it ;
296
+ }
297
+
298
+ // Check that the estimates agree
299
+ for(uint64_t i=0 ; i < 10; ++i){
300
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i)) ;
301
+ }
302
+
303
+ }
304
+
305
+ } /* namespace datasketches */
306
+
@@ -449,7 +449,7 @@ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint32_t c) {
449
449
  if (lg_k < 4) throw std::logic_error("lgK < 4");
450
450
  const size_t tmp = c >> (lg_k - 4);
451
451
  const uint8_t phase = tmp & 15;
452
- if (phase < 0 || phase >= 16) throw std::out_of_range("wrong phase");
452
+ if (phase >= 16) throw std::out_of_range("wrong phase");
453
453
  return phase;
454
454
  }
455
455
  }
@@ -30,7 +30,7 @@
30
30
  namespace datasketches {
31
31
 
32
32
  // ln 2.0
33
- static const double ICON_ERROT_CONSTANT = 0.693147180559945286;
33
+ static const double ICON_ERROR_CONSTANT = 0.693147180559945286;
34
34
 
35
35
  // 1, 2, 3, // kappa
36
36
  static const int16_t ICON_LOW_SIDE_DATA [33] = { // Empirically measured at N = 1000 * K.
@@ -102,7 +102,7 @@ double get_icon_confidence_lb(const cpc_sketch_alloc<A>& sketch, int kappa) {
102
102
  const long k = 1 << lg_k;
103
103
  if (lg_k < 4) throw std::logic_error("lgk < 4");
104
104
  if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
105
- double x = ICON_ERROT_CONSTANT;
105
+ double x = ICON_ERROR_CONSTANT;
106
106
  if (lg_k <= 14) x = ((double) ICON_HIGH_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
107
107
  const double rel = x / sqrt(k);
108
108
  const double eps = kappa * rel;
@@ -120,7 +120,7 @@ double get_icon_confidence_ub(const cpc_sketch_alloc<A>& sketch, int kappa) {
120
120
  const long k = 1 << lg_k;
121
121
  if (lg_k < 4) throw std::logic_error("lgk < 4");
122
122
  if (kappa < 1 || kappa > 3) throw std::invalid_argument("kappa must be between 1 and 3");
123
- double x = ICON_ERROT_CONSTANT;
123
+ double x = ICON_ERROR_CONSTANT;
124
124
  if (lg_k <= 14) x = ((double) ICON_LOW_SIDE_DATA[3 * (lg_k - 4) + (kappa - 1)]) / 10000.0;
125
125
  const double rel = x / sqrt(k);
126
126
  const double eps = kappa * rel;
@@ -359,7 +359,7 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
359
359
 
360
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
361
361
  double byte_sums[8]; // allocating on the stack
362
- std::fill(byte_sums, &byte_sums[8], 0);
362
+ std::fill(byte_sums, byte_sums + 8, 0);
363
363
 
364
364
  for (size_t i = 0; i < k; i++) {
365
365
  uint64_t word = bit_matrix[i];
@@ -89,7 +89,13 @@ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, ui
89
89
 
90
90
  // This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.
91
91
 
92
- #define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
92
+ #define DATASKETCHES_CSA(h, l, a, b, c) \
93
+ { \
94
+ uint64_t u = a ^ b; \
95
+ uint64_t v = c; \
96
+ h = (a & b) | (u & v); \
97
+ l = u ^ v; \
98
+ }
93
99
 
94
100
  static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
95
101
  if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
@@ -98,15 +104,15 @@ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t leng
98
104
  fours = twos = ones = 0;
99
105
 
100
106
  for (uint32_t i = 0; i <= length - 8; i += 8) {
101
- CSA(twos_a, ones, ones, a[i+0], a[i+1]);
102
- CSA(twos_b, ones, ones, a[i+2], a[i+3]);
103
- CSA(fours_a, twos, twos, twos_a, twos_b);
107
+ DATASKETCHES_CSA(twos_a, ones, ones, a[i+0], a[i+1]);
108
+ DATASKETCHES_CSA(twos_b, ones, ones, a[i+2], a[i+3]);
109
+ DATASKETCHES_CSA(fours_a, twos, twos, twos_a, twos_b);
104
110
 
105
- CSA(twos_a, ones, ones, a[i+4], a[i+5]);
106
- CSA(twos_b, ones, ones, a[i+6], a[i+7]);
107
- CSA(fours_b, twos, twos, twos_a, twos_b);
111
+ DATASKETCHES_CSA(twos_a, ones, ones, a[i+4], a[i+5]);
112
+ DATASKETCHES_CSA(twos_b, ones, ones, a[i+6], a[i+7]);
113
+ DATASKETCHES_CSA(fours_b, twos, twos, twos_a, twos_b);
108
114
 
109
- CSA(eights, fours, fours, fours_a, fours_b);
115
+ DATASKETCHES_CSA(eights, fours, fours, fours_a, fours_b);
110
116
 
111
117
  total += warren_bit_count(eights);
112
118
  }
@@ -119,6 +125,8 @@ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t leng
119
125
  return total;
120
126
  }
121
127
 
128
+ #undef DATASKETCHES_CSA
129
+
122
130
  // Here are some timings made with quickTestMerge.c
123
131
  // for the "5 5" case:
124
132
 
@@ -0,0 +1,42 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(density INTERFACE)
19
+
20
+ add_library(${PROJECT_NAME}::DENSITY ALIAS density)
21
+
22
+ if (BUILD_TESTS)
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(density
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(density INTERFACE common)
33
+ target_compile_features(density INTERFACE cxx_std_11)
34
+
35
+ install(TARGETS density
36
+ EXPORT ${PROJECT_NAME}
37
+ )
38
+
39
+ install(FILES
40
+ include/density_sketch.hpp
41
+ include/density_sketch_impl.hpp
42
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")