tdigest 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/.travis.yml +2 -0
- data/README.md +32 -0
- data/lib/tdigest/centroid.rb +5 -1
- data/lib/tdigest/tdigest.rb +108 -8
- data/lib/tdigest/version.rb +1 -1
- data/tdigest.gemspec +5 -3
- metadata +35 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6b00ddf3a1a0b0a5989002fb3cf8fd51b352c6a
|
4
|
+
data.tar.gz: 60d1e7f4b42e3e38300f2f9380f9a91077bbd02f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d7154db5857ee2b184ff16c1f2afb7ea110be15cb1d4253e5287b0bf6e656232fccdffecf1f0a3ba6f52626039f0498b030a56cc5e71b51f0a6c82c60f15c7f2
|
7
|
+
data.tar.gz: 7265929cc8f77b8973cd9d60240c38f3ae3ce29e9af6644e18041c7a56f72ec5980bccf1696cb232865d294853821ce31d8582e57bd26806f79785610b1aa3d9
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.2.3
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# Tdigest
|
2
2
|
|
3
3
|
[Gem Version](https://badge.fury.io/rb/tdigest)
|
4
|
+
[Build Status](https://travis-ci.org/castle/tdigest)
|
5
|
+
[Coverage Status](https://coveralls.io/github/castle/tdigest?branch=master)
|
4
6
|
|
5
7
|
Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
|
6
8
|
|
@@ -33,6 +35,36 @@ puts td.percentile(0.5)
|
|
33
35
|
puts td.p_rank(0.95)
|
34
36
|
```
|
35
37
|
|
38
|
+
#### Serialization
|
39
|
+
|
40
|
+
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about t-digest persistence in [Chapter 3 of the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
|
41
|
+
|
42
|
+
**Standard encoding**
|
43
|
+
|
44
|
+
This encoding uses 8-byte doubles for the means and 4-byte integers for the counts.
|
45
|
+
Size per centroid is a fixed 12 bytes.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
bytes = tdigest.as_bytes
|
49
|
+
```
|
50
|
+
|
51
|
+
**Compressed encoding**
|
52
|
+
|
53
|
+
This encoding uses delta encoding with 4-byte floats for the means and variable
|
54
|
+
length encoding for the counts. Size per centroid is between 5 and 12 bytes.
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
bytes = tdigest.as_small_bytes
|
58
|
+
```
|
59
|
+
|
60
|
+
**Deserializing**
|
61
|
+
|
62
|
+
Deserialization automatically detects the encoding format.
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
tdigest = TDigest::TDigest.from_bytes(bytes)
|
66
|
+
```
|
67
|
+
|
36
68
|
## Development
|
37
69
|
|
38
70
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/tdigest/centroid.rb
CHANGED
data/lib/tdigest/tdigest.rb
CHANGED
@@ -3,6 +3,9 @@ require 'tdigest/centroid'
|
|
3
3
|
|
4
4
|
module TDigest
|
5
5
|
class TDigest
|
6
|
+
VERBOSE_ENCODING = 1
|
7
|
+
SMALL_ENCODING = 2
|
8
|
+
|
6
9
|
attr_accessor :centroids
|
7
10
|
def initialize(delta = 0.01, k = 25, cx = 1.1)
|
8
11
|
@delta = delta
|
@@ -13,6 +16,45 @@ module TDigest
|
|
13
16
|
reset!
|
14
17
|
end
|
15
18
|
|
19
|
+
def as_bytes
|
20
|
+
# compression as defined by Java implementation
|
21
|
+
output = [VERBOSE_ENCODING, compression, size]
|
22
|
+
output += @centroids.map { |_, c| c.mean }
|
23
|
+
output += @centroids.map { |_, c| c.n }
|
24
|
+
output.pack("LdLd#{size}L#{size}")
|
25
|
+
end
|
26
|
+
|
27
|
+
def as_small_bytes
|
28
|
+
output = [SMALL_ENCODING, compression, size]
|
29
|
+
x = 0
|
30
|
+
# delta encoding allows saving 4-bytes floats
|
31
|
+
mean_arr = @centroids.map do |_, c|
|
32
|
+
val = c.mean - x
|
33
|
+
x = c.mean
|
34
|
+
val
|
35
|
+
end
|
36
|
+
output += mean_arr
|
37
|
+
# Variable length encoding of numbers
|
38
|
+
c_arr = @centroids.each_with_object([]) do |(_, c), arr|
|
39
|
+
k = 0
|
40
|
+
n = c.n
|
41
|
+
while n < 0 || n > 0x7f
|
42
|
+
b = 0x80 | (0x7f & n)
|
43
|
+
arr << b
|
44
|
+
n = n >> 7
|
45
|
+
k += 1
|
46
|
+
fail 'Unreasonable large number' if k > 6
|
47
|
+
end
|
48
|
+
arr << n
|
49
|
+
end
|
50
|
+
output += c_arr
|
51
|
+
output.pack("LdLf#{mean_arr.size}C#{c_arr.size}")
|
52
|
+
end
|
53
|
+
|
54
|
+
def as_json(_ = nil)
|
55
|
+
@centroids.map { |_, c| c.as_json }
|
56
|
+
end
|
57
|
+
|
16
58
|
def bound_mean(x)
|
17
59
|
upper = @centroids.upper_bound(x)
|
18
60
|
lower = @centroids.lower_bound(x)
|
@@ -50,6 +92,10 @@ module TDigest
|
|
50
92
|
nil
|
51
93
|
end
|
52
94
|
|
95
|
+
def compression
|
96
|
+
1 / @delta
|
97
|
+
end
|
98
|
+
|
53
99
|
def find_nearest(x)
|
54
100
|
return nil if size == 0
|
55
101
|
|
@@ -149,33 +195,87 @@ module TDigest
|
|
149
195
|
@centroids.map { |_, c| c }
|
150
196
|
end
|
151
197
|
|
198
|
+
def self.from_bytes(bytes)
|
199
|
+
format, compression, size = bytes.unpack('LdL')
|
200
|
+
tdigest = new(1 / compression)
|
201
|
+
|
202
|
+
start_idx = 16 # after header
|
203
|
+
case format
|
204
|
+
when VERBOSE_ENCODING
|
205
|
+
array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
|
206
|
+
means, counts = array.each_slice(size).to_a if array.size > 0
|
207
|
+
when SMALL_ENCODING
|
208
|
+
means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
|
209
|
+
# Decode delta encoding of means
|
210
|
+
x = 0
|
211
|
+
means.map! do |m|
|
212
|
+
m += x
|
213
|
+
x = m
|
214
|
+
m
|
215
|
+
end
|
216
|
+
counts_bytes = bytes[(start_idx + 4 * size)..-1].unpack('C*')
|
217
|
+
counts = []
|
218
|
+
# Decode variable length integer bytes
|
219
|
+
size.times do
|
220
|
+
v = counts_bytes.shift
|
221
|
+
z = 0x7f & v
|
222
|
+
shift = 7
|
223
|
+
while (v & 0x80) != 0
|
224
|
+
fail 'Shift too large in decode' if shift > 28
|
225
|
+
v = counts_bytes.shift || 0
|
226
|
+
z += (v & 0x7f) << shift
|
227
|
+
shift += 7
|
228
|
+
end
|
229
|
+
counts << z
|
230
|
+
end
|
231
|
+
# This shouldn't happen
|
232
|
+
fail 'Mismatch' unless counts.size == means.size
|
233
|
+
else
|
234
|
+
fail 'Unknown compression format'
|
235
|
+
end
|
236
|
+
if means && counts
|
237
|
+
means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
|
238
|
+
end
|
239
|
+
tdigest
|
240
|
+
end
|
152
241
|
|
153
|
-
|
242
|
+
def self.from_json(array)
|
243
|
+
tdigest = new
|
244
|
+
# Handle both string and symbol keys
|
245
|
+
array.each { |a| tdigest.push(a['m'] || a[:m], a['n'] || a[:n]) }
|
246
|
+
tdigest
|
247
|
+
end
|
154
248
|
|
249
|
+
private
|
155
250
|
|
156
251
|
def _add_weight(nearest, x, n)
|
157
252
|
unless x == nearest.mean
|
158
253
|
nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
|
159
254
|
end
|
160
255
|
|
256
|
+
_cumulate(false, true) if nearest.mean_cumn.nil?
|
257
|
+
|
161
258
|
nearest.cumn += n
|
162
|
-
nearest.mean_cumn += n / 2
|
259
|
+
nearest.mean_cumn += n / 2.0
|
163
260
|
nearest.n += n
|
164
261
|
@n += n
|
165
262
|
|
166
263
|
nil
|
167
264
|
end
|
168
265
|
|
169
|
-
def _cumulate(exact = false)
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
266
|
+
def _cumulate(exact = false, force = false)
|
267
|
+
unless force
|
268
|
+
factor = if @last_cumulate == 0
|
269
|
+
Float::INFINITY
|
270
|
+
else
|
271
|
+
(@n.to_f / @last_cumulate)
|
272
|
+
end
|
273
|
+
return if @n == @last_cumulate || (!exact && @cx && @cx > (factor))
|
174
274
|
end
|
175
275
|
|
176
276
|
cumn = 0
|
177
277
|
@centroids.each do |_, c|
|
178
|
-
c.mean_cumn = cumn + c.n / 2
|
278
|
+
c.mean_cumn = cumn + c.n / 2.0
|
179
279
|
cumn = c.cumn = cumn + c.n
|
180
280
|
end
|
181
281
|
@n = @last_cumulate = cumn
|
data/lib/tdigest/version.rb
CHANGED
data/tdigest.gemspec
CHANGED
@@ -21,7 +21,9 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_runtime_dependency 'rbtree', '~> 0.4.2'
|
23
23
|
|
24
|
-
spec.add_development_dependency
|
25
|
-
spec.add_development_dependency
|
26
|
-
spec.add_development_dependency
|
24
|
+
spec.add_development_dependency 'bundler', '~> 1.10'
|
25
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
+
spec.add_development_dependency 'minitest', '~> 5.8.3'
|
27
|
+
spec.add_development_dependency 'coveralls', '~> 0.8.10'
|
28
|
+
spec.add_development_dependency 'simplecov', '~> 0.11.1'
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tdigest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sebastian Wallin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbtree
|
@@ -56,16 +56,44 @@ dependencies:
|
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 5.8.3
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 5.8.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: coveralls
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 0.8.10
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- - "
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.8.10
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.11.1
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
67
95
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
96
|
+
version: 0.11.1
|
69
97
|
description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
|
70
98
|
email:
|
71
99
|
- sebastian.wallin@gmail.com
|
@@ -74,6 +102,7 @@ extensions: []
|
|
74
102
|
extra_rdoc_files: []
|
75
103
|
files:
|
76
104
|
- ".gitignore"
|
105
|
+
- ".ruby-version"
|
77
106
|
- ".travis.yml"
|
78
107
|
- Gemfile
|
79
108
|
- LICENSE.txt
|