ddsketch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +4 -0
- data/.simplecov +1 -0
- data/.yardopts +4 -0
- data/CHANGELOG.md +8 -0
- data/CONTRIBUTING.md +6 -0
- data/Gemfile +41 -0
- data/LICENSE +201 -0
- data/LICENSE-3rdparty.csv +2 -0
- data/NOTICE +4 -0
- data/README.md +122 -0
- data/Rakefile +77 -0
- data/ddsketch-ruby.gemspec +29 -0
- data/lib/ddsketch/base_sketch.rb +196 -0
- data/lib/ddsketch/errors.rb +10 -0
- data/lib/ddsketch/log_collapsing_highest_dense_sketch.rb +21 -0
- data/lib/ddsketch/log_collapsing_lowest_dense_sketch.rb +21 -0
- data/lib/ddsketch/mapping/cubically_interpolated_key_mapping.rb +70 -0
- data/lib/ddsketch/mapping/key_mapping.rb +102 -0
- data/lib/ddsketch/mapping/linear_interpolated_key_mapping.rb +52 -0
- data/lib/ddsketch/mapping/logarithmic_key_mapping.rb +26 -0
- data/lib/ddsketch/proto/ddsketch.proto +66 -0
- data/lib/ddsketch/proto/ddsketch_pb.rb +36 -0
- data/lib/ddsketch/proto.rb +46 -0
- data/lib/ddsketch/sketch.rb +18 -0
- data/lib/ddsketch/store/collapsing_highest_dense_store.rb +143 -0
- data/lib/ddsketch/store/collapsing_lowest_dense_store.rb +145 -0
- data/lib/ddsketch/store/dense_store.rb +210 -0
- data/lib/ddsketch/version.rb +14 -0
- data/lib/ddsketch.rb +25 -0
- metadata +78 -0
@@ -0,0 +1,196 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
# A quantile sketch with relative-error guarantees. This sketch computes
# quantile values with an approximation error that is relative to the actual
# quantile value. It works on both negative and non-negative input values.
#
# For instance, using DDSketch with a relative accuracy guarantee set to 1%, if
# the expected quantile value is 100, the computed quantile value is guaranteed to
# be between 99 and 101. If the expected quantile value is 1000, the computed
# quantile value is guaranteed to be between 990 and 1010.
#
# DDSketch works by mapping floating-point input values to bins and counting the
# number of values for each bin. The underlying structure that keeps track of bin
# counts is the store.
#
# The memory size of the sketch depends on the range that is covered by the input
# values: the larger that range, the more bins are needed to keep track of the
# input values. As a rough estimate, if working on durations with a relative
# accuracy of 2%, about 2kB (275 bins) are needed to cover values between 1
# millisecond and 1 minute, and about 6kB (802 bins) to cover values between 1
# nanosecond and 1 day.
#
# The size of the sketch can be given a fail-safe upper bound by using collapsing
# stores. As shown in
# <a href="http://www.vldb.org/pvldb/vol12/p2195-masson.pdf">the DDSketch paper</a>
# the likelihood of a store collapsing when using the default bound is vanishingly
# small for most data.
#
# @abstract Subclass and override to implement a custom Sketch class.
class BaseSketch
  # @return [Float] the default relative accuracy for key mapping instantiation
  DEFAULT_REL_ACC = 0.01

  # @return [Integer] the default bin limit for collapsing dense store instantiation
  DEFAULT_BIN_LIMIT = 2048

  # @return [Mapping::KeyMapping] Mapping between values and integer indices that imposes relative accuracy guarantees.
  attr_reader :mapping

  # @return [Store::DenseStore] store maps integers to counters
  attr_reader :store

  # @return [Store::DenseStore] store maps negative integers to counters
  attr_reader :negative_store

  # @return [Float] the count of zeros in the sketch
  attr_reader :zero_count

  # @return [Float] the maximum value in the sketch
  attr_reader :max

  # @return [Float] the minimum value in the sketch
  attr_reader :min

  # @return [Float] the sum of values in the sketch
  attr_reader :sum

  # @return [Float] the count of values in the sketch
  attr_reader :count

  # @param [Mapping::KeyMapping] mapping
  #   mapping between values and integer indices that imposes relative accuracy guarantees.
  # @param [Store::DenseStore] store
  #   store maps integers to counters
  # @param [Store::DenseStore] negative_store
  #   store maps negative integers to counters
  # @param [Float] zero_count
  #   the count of zeros in the sketch
  def initialize(mapping:, store:, negative_store:, zero_count: 0.0)
    @mapping = mapping
    @store = store
    @negative_store = negative_store
    @zero_count = zero_count

    @relative_accuracy = mapping.relative_accuracy
    # The total starts from whatever the supplied stores already hold.
    @count = @negative_store.count + @zero_count + @store.count
    # Summary stats always start empty; they are only updated by #add and #merge.
    @min = Float::INFINITY
    @max = -Float::INFINITY
    @sum = 0.0
  end

  # Average of the sketch, computed as sum / count. There is no guard for a
  # sketch whose count is zero.
  #
  # @return [Float]
  def avg
    sum / count
  end

  # Add a value to the sketch.
  #
  # @param [Float] val The value to be added.
  # @param [Float] weight Must be positive.
  #
  # @raise [ArgumentError] if weight is zero or negative
  #
  # @return [nil]
  def add(val, weight = 1.0)
    raise ArgumentError, "weight must be positive" if weight <= 0.0

    if val > @mapping.min_possible
      @store.add(@mapping.key(val), weight)
    elsif val < -@mapping.min_possible
      # Negative values are stored under the key of their absolute value.
      @negative_store.add(@mapping.key(-val), weight)
    else
      # Anything within [-min_possible, min_possible] is counted as zero.
      @zero_count += weight
    end

    # Keep track of summary stats
    @count += weight
    @sum += val * weight
    @min = val if val < @min
    @max = val if val > @max

    nil
  end

  # Return the approximate value at the specified quantile.
  #
  # @param [Float] quantile Must be between 0 ~ 1
  #
  # @return [Float, nil] the approximate value, or nil when the quantile is
  #   outside 0 ~ 1 or the sketch holds no values
  def get_quantile_value(quantile)
    return nil if quantile < 0 || quantile > 1 || @count == 0

    rank = quantile * (@count - 1)
    negative_count = @negative_store.count

    if rank < negative_count
      # The negative store is keyed by absolute value (see #add), so the rank is
      # mirrored before the lookup and the resulting value is negated.
      reversed_rank = negative_count - rank - 1
      key = @negative_store.key_at_rank(reversed_rank, false)
      -@mapping.value(key)
    elsif rank < @zero_count + negative_count
      # Float zero keeps the return type consistent with the other branches.
      0.0
    else
      key = @store.key_at_rank(rank - @zero_count - negative_count)
      @mapping.value(key)
    end
  end

  # Merge the given sketch into the current one. After this operation, this sketch
  # encodes the values that were added to both this and the input sketch.
  #
  # @param [BaseSketch] sketch The sketch to be merged.
  #
  # @raise [InvalidSketchMergeError] if the two mappings have different gammas
  #
  # @return [nil]
  def merge(sketch)
    unless mergeable?(sketch)
      raise InvalidSketchMergeError, "Cannot merge two sketches with different relative accuracy"
    end

    return if sketch.count == 0

    # Nothing recorded here yet: take over the other sketch's state wholesale.
    if @count == 0
      copy(sketch)
      return
    end

    # Merge the stores
    @store.merge(sketch.store)
    @negative_store.merge(sketch.negative_store)
    @zero_count += sketch.zero_count

    # Merge summary stats
    @count += sketch.count
    @sum += sketch.sum
    @min = sketch.min if sketch.min < @min
    @max = sketch.max if sketch.max > @max

    nil
  end

  # @return [Float] the count of values in the sketch
  def num_values
    @count
  end

  private

  # Two sketches can be merged only if their gammas are equal.
  def mergeable?(other)
    @mapping.gamma == other.mapping.gamma
  end

  # Copy the input sketch into this one
  def copy(sketch)
    @store.copy(sketch.store)
    @negative_store.copy(sketch.negative_store)
    @zero_count = sketch.zero_count
    @min = sketch.min
    @max = sketch.max
    @count = sketch.count
    @sum = sketch.sum
  end
end
|
196
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
# A BaseSketch that trades ingestion speed for a bounded memory footprint by
# capping the number of bins. Once the cap is hit, the bins with the highest
# indices are collapsed together, so relative accuracy is no longer guaranteed
# on the highest quantiles. With the default bin limit, collapsing is unlikely
# to occur unless the data is distributed with tails heavier than any
# subexponential.
class LogCollapsingHighestDenseSketch < BaseSketch
  # @param relative_accuracy (see Sketch#initialize)
  # @param [Integer] bin_limit the maximum number of bins
  def initialize(relative_accuracy: DEFAULT_REL_ACC, bin_limit: DEFAULT_BIN_LIMIT)
    key_mapping = Mapping::LogarithmicKeyMapping.new(relative_accuracy: relative_accuracy)
    # Each sign gets its own store, both capped at the same bin limit.
    positive_bins = Store::CollapsingHighestDenseStore.new(bin_limit: bin_limit)
    negative_bins = Store::CollapsingHighestDenseStore.new(bin_limit: bin_limit)

    super(mapping: key_mapping, store: positive_bins, negative_store: negative_bins)
  end
end
|
21
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
# A BaseSketch that trades ingestion speed for a bounded memory footprint by
# capping the number of bins. Once the cap is hit, the bins with the lowest
# indices are collapsed together, so relative accuracy is no longer guaranteed
# on the lowest quantiles. With the default bin limit, collapsing is unlikely
# to occur unless the data is distributed with tails heavier than any
# subexponential.
class LogCollapsingLowestDenseSketch < BaseSketch
  # @param relative_accuracy (see Sketch#initialize)
  # @param [Integer] bin_limit the maximum number of bins
  def initialize(relative_accuracy: DEFAULT_REL_ACC, bin_limit: DEFAULT_BIN_LIMIT)
    key_mapping = Mapping::LogarithmicKeyMapping.new(relative_accuracy: relative_accuracy)
    # Each sign gets its own store, both capped at the same bin limit.
    positive_bins = Store::CollapsingLowestDenseStore.new(bin_limit: bin_limit)
    negative_bins = Store::CollapsingLowestDenseStore.new(bin_limit: bin_limit)

    super(mapping: key_mapping, store: positive_bins, negative_store: negative_bins)
  end
end
|
21
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
module Mapping
|
5
|
+
# A fast KeyMapping approximating the memory-optimal LogarithmicMapping: the
# floor of the base-2 logarithm is read from the binary representation of the
# floating-point value, and the logarithm in-between is interpolated with a
# cubic polynomial.
class CubicallyInterpolatedKeyMapping < KeyMapping
  # Coefficients of the interpolating cubic (A * s**3 + B * s**2 + C * s).
  A = 6.0 / 35.0
  B = -3.0 / 5.0
  C = 10.0 / 7.0

  #
  # Identifies this mapping as using cubic interpolation
  #
  # @return [Symbol]
  #
  def self.interpolation
    :cubic
  end

  # (see KeyMapping#initialize)
  def initialize(relative_accuracy:, offset: 0.0)
    super(relative_accuracy: relative_accuracy, offset: offset)

    # Rescale the multiplier computed by KeyMapping by the linear coefficient C.
    @multiplier = @multiplier / C
  end

  protected

  # Key-space logarithm: the cubic log2 approximation scaled by the multiplier.
  def log_gamma(value)
    approx_log2 = _cubic_log2_approx(value)
    approx_log2 * @multiplier
  end

  # Inverse of #log_gamma.
  def pow_gamma(value)
    scaled = value / @multiplier
    _cubic_exp2_approx(scaled)
  end

  # Approximates log2 using a cubic polynomial
  def _cubic_log2_approx(value)
    fraction, power = Math.frexp(value)
    s = 2 * fraction - 1
    # Horner evaluation of A*s^3 + B*s^2 + C*s, then the integer part is added.
    polynomial = (A * s + B) * s + C
    polynomial * s + (power - 1)
  end

  # Inverts the cubic of #_cubic_log2_approx for the fractional part of
  # +value+ and rebuilds the float from the resulting mantissa and exponent.
  def _cubic_exp2_approx(value)
    power = Integer(value.floor)
    delta_0 = B * B - 3 * A * C

    # Derived from Cardano's formula
    delta_1 = (2.0 * B * B * B) - (9.0 * A * B * C) - (27.0 * A * A * (value - power))
    discriminant = delta_1 * delta_1 - 4 * delta_0 * delta_0 * delta_0
    cardano = Math.cbrt((delta_1 - (discriminant**0.5)) / 2.0)

    root = -(B + cardano + delta_0 / cardano) / (3.0 * A)
    fraction = (root + 1.0) / 2

    # JRuby has inconsistent result with `Math.ldexp`
    # https://github.com/jruby/jruby/issues/7234
    Math.ldexp(fraction, power + 1)
  end
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
module Mapping
|
5
|
+
# A mapping between values and integer indices that imposes relative accuracy
# guarantees. Specifically, for any value `minIndexableValue() < value <
# maxIndexableValue` implementations of `KeyMapping` must be such that
# `value(key(v))` is close to `v` with a relative error that is less than
# `relative_accuracy`.
#
# In implementations of KeyMapping, there is generally a trade-off between the
# cost of computing the key and the number of keys that are required to cover a
# given range of values (memory optimality). The most memory-optimal mapping is
# the LogarithmicMapping, but it requires the costly evaluation of the logarithm
# when computing the index. Other mappings can approximate the logarithmic
# mapping, while being less computationally costly.
#
# @abstract Subclass and override {#log_gamma} and {#pow_gamma} to implement a
#   custom KeyMapping class.
class KeyMapping
  # @return [Float] the base for the exponential buckets. gamma = (1 + alpha) / (1 - alpha)
  attr_reader :gamma

  # @return [Float] the relative accuracy guaranteed, must be between 0 ~ 1
  attr_reader :relative_accuracy

  # @return [Float] the smallest value the sketch can distinguish from 0
  attr_reader :min_possible

  # @return [Float] the largest value the sketch can handle
  attr_reader :max_possible

  # @return [Float] value used to shift all bin keys
  attr_reader :offset

  #
  # Indicates interpolating algorithm
  #
  # @return [Symbol, nil]
  #
  def self.interpolation
    nil
  end

  # @param [Float] relative_accuracy the relative accuracy guaranteed, must be between 0 ~ 1
  # @param [Float] offset value used to shift all bin keys
  #
  # @raise [ArgumentError] if relative_accuracy is not strictly between 0 and 1
  def initialize(relative_accuracy:, offset: 0.0)
    if (relative_accuracy <= 0) || (relative_accuracy >= 1)
      raise ArgumentError, "Relative accuracy must be between 0 and 1."
    end

    @relative_accuracy = relative_accuracy
    @offset = offset

    # gamma - 1, i.e. (1 + alpha) / (1 - alpha) - 1 = 2 * alpha / (1 - alpha)
    gamma_mantissa = 2 * relative_accuracy / (1 - relative_accuracy)

    @gamma = 1 + gamma_mantissa
    # 1 / ln(gamma); subclasses rescale this to match their own logarithm.
    @multiplier = 1 / Math.log(gamma_mantissa + 1)
    @min_possible = Float::MIN * @gamma
    @max_possible = Float::MAX / @gamma
  end

  #
  # Returns the key specifying the bucket for value
  #
  # @param [Float] value
  #
  # @return [Integer]
  #
  def key(value)
    Integer(log_gamma(value).ceil + @offset)
  end

  #
  # Returns the value represented by the bucket specified by the key
  #
  # @param [Integer] key
  #
  # @return [Float]
  #
  def value(key)
    pow_gamma(key - @offset) * (2.0 / (1 + @gamma))
  end

  #
  # Indicates interpolating algorithm
  #
  # @return [Symbol, nil]
  #
  def interpolation
    self.class.interpolation
  end

  protected

  # Subclass hook used by {#key}.
  #
  # @raise [NotImplementedError] unless overridden by the subclass
  def log_gamma(value)
    raise NotImplementedError, "#{self.class} must implement #log_gamma"
  end

  # Subclass hook used by {#value}; expected to be the inverse of {#log_gamma}.
  #
  # @raise [NotImplementedError] unless overridden by the subclass
  def pow_gamma(value)
    raise NotImplementedError, "#{self.class} must implement #pow_gamma"
  end
end
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
module Mapping
|
5
|
+
# A fast KeyMapping approximating the memory-optimal LogarithmicMapping: the
# floor of the base-2 logarithm is read from the binary representation of the
# floating-point value, and the logarithm in-between is interpolated linearly.
class LinearlyInterpolatedKeyMapping < KeyMapping
  #
  # Identifies this mapping as using linear interpolation
  #
  # @return [Symbol]
  #
  def self.interpolation
    :linear
  end

  protected

  # Key-space logarithm: the linear log2 approximation scaled by the multiplier.
  def log_gamma(value)
    approx_log2 = _log2_approx(value)
    approx_log2 * @multiplier
  end

  # Inverse of #log_gamma.
  def pow_gamma(value)
    scaled = value / @multiplier
    _exp2_approx(scaled)
  end

  # Approximates log2 by s + f
  # where v = (s+1) * 2 ** f for s in [0, 1)
  #
  # frexp(v) returns m and e s.t.
  # v = m * 2 ** e ; (m in [0.5, 1) or 0.0)
  # so m and e are adjusted accordingly
  def _log2_approx(value)
    fraction, power = Math.frexp(value)
    (2 * fraction - 1) + (power - 1)
  end

  # Inverse of #_log2_approx: splits +value+ into an exponent and a mantissa
  # and rebuilds the float from them.
  def _exp2_approx(value)
    power = Integer(value.floor + 1)
    fraction = (value - power + 2) / 2.0

    # JRuby has inconsistent result with `Math.ldexp`
    # https://github.com/jruby/jruby/issues/7234
    Math.ldexp(fraction, power)
  end
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
module Mapping
|
5
|
+
# A memory-optimal KeyMapping: for a targeted relative accuracy it needs the
# fewest keys to cover a given range of values. Floating-point values are
# mapped to integers logarithmically.
class LogarithmicKeyMapping < KeyMapping
  # (see KeyMapping#initialize)
  def initialize(relative_accuracy:, offset: 0.0)
    super(relative_accuracy: relative_accuracy, offset: offset)

    # The logarithms below are taken in base 2, so fold ln(2) into the multiplier.
    @multiplier = @multiplier * Math.log(2)
  end

  protected

  # Base-2 logarithm of +value+, scaled by the multiplier.
  def log_gamma(value)
    base2_log = Math.log(value, 2)
    base2_log * @multiplier
  end

  # Inverse of #log_gamma.
  def pow_gamma(value)
    exponent = value / @multiplier
    2**exponent
  end
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
/* Unless explicitly stated otherwise all files in this repository are licensed under the Apache License 2.0.
|
2
|
+
* This product includes software developed at Datadog (https://www.datadoghq.com/).
|
3
|
+
* Copyright 2020 Datadog, Inc.
|
4
|
+
*/
|
5
|
+
|
6
|
+
syntax = "proto3";
|
7
|
+
|
8
|
+
option ruby_package = "DDSketch::Proto";
|
9
|
+
|
10
|
+
// A DDSketch is essentially a histogram that partitions the range of positive values into an infinite number of
|
11
|
+
// indexed bins whose size grows exponentially. It keeps track of the number of values (or possibly floating-point
|
12
|
+
// weights) added to each bin. Negative values are partitioned like positive values, symmetrically to zero.
|
13
|
+
// The value zero as well as its close neighborhood that would be mapped to extreme bin indexes is mapped to a specific
|
14
|
+
// counter.
|
15
|
+
message DDSketch {
  // The mapping between positive values and the bin indexes they belong to.
  IndexMapping mapping = 1;

  // The store for keeping track of positive values.
  Store positiveValues = 2;

  // The store for keeping track of negative values. A negative value v is mapped using its positive opposite -v.
  Store negativeValues = 3;

  // The count for the value zero and its close neighborhood (whose width depends on the mapping).
  double zeroCount = 4;
}
|
28
|
+
|
29
|
+
// How to map positive values to the bins they belong to.
|
30
|
+
message IndexMapping {
  // The gamma parameter of the mapping, such that the bin index that a value v belongs to is roughly equal to
  // log(v)/log(gamma).
  double gamma = 1;

  // An offset that can be used to shift all bin indexes.
  double indexOffset = 2;

  // To speed up the computation of the index a value belongs to, the computation of the log may be approximated using
  // the fact that the log to the base 2 of powers of 2 can be computed at a low cost from the binary representation of
  // the input value. Other values can be approximated by interpolating between successive powers of 2 (linearly,
  // quadratically or cubically).
  // NONE means that the log is to be computed exactly (no interpolation).
  Interpolation interpolation = 3;
  // The interpolation methods that can be named in the `interpolation` field.
  enum Interpolation {
    NONE = 0;
    LINEAR = 1;
    QUADRATIC = 2;
    CUBIC = 3;
  }
}
|
51
|
+
|
52
|
+
// A Store maps bin indexes to their respective counts.
|
53
|
+
// Counts can be encoded sparsely using binCounts, but also in a contiguous way using contiguousBinCounts and
|
54
|
+
// contiguousBinIndexOffset. Given that non-empty bins are in practice usually contiguous or close to one another, the
|
55
|
+
// latter contiguous encoding method is usually more efficient than the sparse one.
|
56
|
+
// Both encoding methods can be used conjointly. If a bin appears in both the sparse and the contiguous encodings, its
|
57
|
+
// count value is the sum of the counts in each encoding.
|
58
|
+
message Store {
  // The bin counts, encoded sparsely.
  map<sint32, double> binCounts = 1;

  // The bin counts, encoded contiguously. The values of contiguousBinCounts are the counts for the bins of indexes
  // o, o+1, o+2, etc., where o is contiguousBinIndexOffset.
  repeated double contiguousBinCounts = 2 [packed = true];
  // The index o of the first bin described by contiguousBinCounts.
  sint32 contiguousBinIndexOffset = 3;
}
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
2
|
+
# source: ddsketch.proto
|
3
|
+
|
4
|
+
require "google/protobuf"
|
5
|
+
|
6
|
+
# Registers the descriptors for the DDSketch, IndexMapping and Store messages
# and the IndexMapping.Interpolation enum in the generated descriptor pool.
# Field names, types and numbers follow ddsketch.proto.
Google::Protobuf::DescriptorPool.generated_pool.build do
  add_message "DDSketch" do
    optional :mapping, :message, 1, "IndexMapping"
    optional :positiveValues, :message, 2, "Store"
    optional :negativeValues, :message, 3, "Store"
    optional :zeroCount, :double, 4
  end
  add_message "IndexMapping" do
    optional :gamma, :double, 1
    optional :indexOffset, :double, 2
    optional :interpolation, :enum, 3, "IndexMapping.Interpolation"
  end
  add_enum "IndexMapping.Interpolation" do
    value :NONE, 0
    value :LINEAR, 1
    value :QUADRATIC, 2
    value :CUBIC, 3
  end
  add_message "Store" do
    map :binCounts, :sint32, :double, 1
    repeated :contiguousBinCounts, :double, 2
    optional :contiguousBinIndexOffset, :sint32, 3
  end
end
|
30
|
+
|
31
|
+
# Ruby constants for the message classes and the enum module, looked up by
# name in the generated descriptor pool.
module DDSketch::Proto
  DDSketch = Google::Protobuf::DescriptorPool.generated_pool.lookup("DDSketch").msgclass
  IndexMapping = Google::Protobuf::DescriptorPool.generated_pool.lookup("IndexMapping").msgclass
  IndexMapping::Interpolation = Google::Protobuf::DescriptorPool.generated_pool.lookup("IndexMapping.Interpolation").enummodule
  Store = Google::Protobuf::DescriptorPool.generated_pool.lookup("Store").msgclass
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ddsketch/proto/ddsketch_pb"
|
4
|
+
|
5
|
+
module DDSketch
|
6
|
+
# Namespace for protobuf object generated by `google-protobuf`
|
7
|
+
# @!visibility private
|
8
|
+
module Proto
|
9
|
+
INTERPOLATION_MAPPING = {
|
10
|
+
linear: IndexMapping::Interpolation::LINEAR,
|
11
|
+
cubic: IndexMapping::Interpolation::CUBIC
|
12
|
+
}.freeze
|
13
|
+
|
14
|
+
private_constant :INTERPOLATION_MAPPING
|
15
|
+
|
16
|
+
module_function
|
17
|
+
|
18
|
+
# Builds the protobuf DDSketch message for a sketch from its mapping, its two
# stores and its zero count.
#
# @param [BaseSketch] sketch
#
# @return [DDSketch::Proto::DDSketch]
def serialize_sketch(sketch)
  mapping_message = serialize_key_mapping(sketch.mapping)
  positive_message = serialize_store(sketch.store)
  negative_message = serialize_store(sketch.negative_store)

  DDSketch.new(
    mapping: mapping_message,
    positiveValues: positive_message,
    negativeValues: negative_message,
    zeroCount: sketch.zero_count
  )
end
|
26
|
+
|
27
|
+
# Builds the protobuf Store message for a store using the contiguous encoding:
# the store's bins plus the offset of the first bin.
#
# @param [Store::DenseStore] store
#
# @return [DDSketch::Proto::Store]
def serialize_store(store)
  bin_counts = store.bins
  first_index = store.offset

  Store.new(contiguousBinCounts: bin_counts, contiguousBinIndexOffset: first_index)
end
|
33
|
+
|
34
|
+
# Builds the protobuf IndexMapping message for a key mapping.
#
# The message's gamma field carries the mapping's gamma (the base of the
# exponential buckets), not its relative accuracy.
#
# @param [Mapping::KeyMapping] mapping
#
# @return [DDSketch::Proto::IndexMapping]
def serialize_key_mapping(mapping)
  IndexMapping.new(
    gamma: mapping.gamma,
    indexOffset: mapping.offset,
    interpolation: serialize_interpolation(mapping)
  )
end
|
41
|
+
|
42
|
+
# Translates the mapping's interpolation symbol into the protobuf enum value,
# falling back to NONE for anything not listed in INTERPOLATION_MAPPING.
#
# @param [Mapping::KeyMapping] mapping
def serialize_interpolation(mapping)
  fallback = IndexMapping::Interpolation::NONE
  INTERPOLATION_MAPPING.fetch(mapping.interpolation, fallback)
end
|
45
|
+
end
|
46
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DDSketch
|
4
|
+
# The default DDSketch implementation. It favors memory usage over ingestion
# speed and places no limit on the number of bins. The number of bins will not
# exceed a reasonable number unless the data is distributed with tails heavier
# than any subexponential.
class Sketch < BaseSketch
  # @param [Float] relative_accuracy The guaranteed relative accuracy for sketch
  def initialize(relative_accuracy: DEFAULT_REL_ACC)
    key_mapping = Mapping::LogarithmicKeyMapping.new(relative_accuracy: relative_accuracy)
    # Each sign gets its own dense store.
    positive_bins = Store::DenseStore.new
    negative_bins = Store::DenseStore.new

    super(mapping: key_mapping, store: positive_bins, negative_store: negative_bins)
  end
end
|
18
|
+
end
|