tdigest 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -0
- data/.travis.yml +2 -0
- data/README.md +32 -0
- data/lib/tdigest/centroid.rb +5 -1
- data/lib/tdigest/tdigest.rb +108 -8
- data/lib/tdigest/version.rb +1 -1
- data/tdigest.gemspec +5 -3
- metadata +35 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6b00ddf3a1a0b0a5989002fb3cf8fd51b352c6a
|
4
|
+
data.tar.gz: 60d1e7f4b42e3e38300f2f9380f9a91077bbd02f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d7154db5857ee2b184ff16c1f2afb7ea110be15cb1d4253e5287b0bf6e656232fccdffecf1f0a3ba6f52626039f0498b030a56cc5e71b51f0a6c82c60f15c7f2
|
7
|
+
data.tar.gz: 7265929cc8f77b8973cd9d60240c38f3ae3ce29e9af6644e18041c7a56f72ec5980bccf1696cb232865d294853821ce31d8582e57bd26806f79785610b1aa3d9
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.2.3
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# Tdigest
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/tdigest.svg)](https://badge.fury.io/rb/tdigest)
|
4
|
+
[![Build Status](https://travis-ci.org/castle/tdigest.svg?branch=master)](https://travis-ci.org/castle/tdigest)
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/castle/tdigest/badge.svg?branch=master&service=github)](https://coveralls.io/github/castle/tdigest?branch=master)
|
4
6
|
|
5
7
|
Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
|
6
8
|
|
@@ -33,6 +35,36 @@ puts td.percentile(0.5)
|
|
33
35
|
puts td.p_rank(0.95)
|
34
36
|
```
|
35
37
|
|
38
|
+
#### Serialization
|
39
|
+
|
40
|
+
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about t-digest persistence in [Chapter 3 of the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
|
41
|
+
|
42
|
+
**Standard encoding**
|
43
|
+
|
44
|
+
This encoding uses 8-byte doubles for the means and 4-byte integers for the counts.
|
45
|
+
Size per centroid is a fixed 12 bytes.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
bytes = tdigest.as_bytes
|
49
|
+
```
|
50
|
+
|
51
|
+
**Compressed encoding**
|
52
|
+
|
53
|
+
This encoding uses delta encoding with 4-byte floats for the means and variable
|
54
|
+
length encoding for the counts. Size per centroid is between 5 and 12 bytes.
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
bytes = tdigest.as_small_bytes
|
58
|
+
```
|
59
|
+
|
60
|
+
**Deserializing**
|
61
|
+
|
62
|
+
Deserialization automatically detects which encoding (standard or compressed) was used.
|
63
|
+
|
64
|
+
```ruby
|
65
|
+
tdigest = TDigest::TDigest.from_bytes(bytes)
|
66
|
+
```
|
67
|
+
|
36
68
|
## Development
|
37
69
|
|
38
70
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/tdigest/centroid.rb
CHANGED
data/lib/tdigest/tdigest.rb
CHANGED
@@ -3,6 +3,9 @@ require 'tdigest/centroid'
|
|
3
3
|
|
4
4
|
module TDigest
|
5
5
|
class TDigest
|
6
|
+
VERBOSE_ENCODING = 1
|
7
|
+
SMALL_ENCODING = 2
|
8
|
+
|
6
9
|
attr_accessor :centroids
|
7
10
|
def initialize(delta = 0.01, k = 25, cx = 1.1)
|
8
11
|
@delta = delta
|
@@ -13,6 +16,45 @@ module TDigest
|
|
13
16
|
reset!
|
14
17
|
end
|
15
18
|
|
19
|
+
def as_bytes
|
20
|
+
# compression as defined by Java implementation
|
21
|
+
output = [VERBOSE_ENCODING, compression, size]
|
22
|
+
output += @centroids.map { |_, c| c.mean }
|
23
|
+
output += @centroids.map { |_, c| c.n }
|
24
|
+
output.pack("LdLd#{size}L#{size}")
|
25
|
+
end
|
26
|
+
|
27
|
+
def as_small_bytes
|
28
|
+
output = [SMALL_ENCODING, compression, size]
|
29
|
+
x = 0
|
30
|
+
# delta encoding allows saving 4-bytes floats
|
31
|
+
mean_arr = @centroids.map do |_, c|
|
32
|
+
val = c.mean - x
|
33
|
+
x = c.mean
|
34
|
+
val
|
35
|
+
end
|
36
|
+
output += mean_arr
|
37
|
+
# Variable length encoding of numbers
|
38
|
+
c_arr = @centroids.each_with_object([]) do |(_, c), arr|
|
39
|
+
k = 0
|
40
|
+
n = c.n
|
41
|
+
while n < 0 || n > 0x7f
|
42
|
+
b = 0x80 | (0x7f & n)
|
43
|
+
arr << b
|
44
|
+
n = n >> 7
|
45
|
+
k += 1
|
46
|
+
fail 'Unreasonable large number' if k > 6
|
47
|
+
end
|
48
|
+
arr << n
|
49
|
+
end
|
50
|
+
output += c_arr
|
51
|
+
output.pack("LdLf#{mean_arr.size}C#{c_arr.size}")
|
52
|
+
end
|
53
|
+
|
54
|
+
def as_json(_ = nil)
|
55
|
+
@centroids.map { |_, c| c.as_json }
|
56
|
+
end
|
57
|
+
|
16
58
|
def bound_mean(x)
|
17
59
|
upper = @centroids.upper_bound(x)
|
18
60
|
lower = @centroids.lower_bound(x)
|
@@ -50,6 +92,10 @@ module TDigest
|
|
50
92
|
nil
|
51
93
|
end
|
52
94
|
|
95
|
+
def compression
|
96
|
+
1 / @delta
|
97
|
+
end
|
98
|
+
|
53
99
|
def find_nearest(x)
|
54
100
|
return nil if size == 0
|
55
101
|
|
@@ -149,33 +195,87 @@ module TDigest
|
|
149
195
|
@centroids.map { |_, c| c }
|
150
196
|
end
|
151
197
|
|
198
|
+
def self.from_bytes(bytes)
|
199
|
+
format, compression, size = bytes.unpack('LdL')
|
200
|
+
tdigest = new(1 / compression)
|
201
|
+
|
202
|
+
start_idx = 16 # after header
|
203
|
+
case format
|
204
|
+
when VERBOSE_ENCODING
|
205
|
+
array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
|
206
|
+
means, counts = array.each_slice(size).to_a if array.size > 0
|
207
|
+
when SMALL_ENCODING
|
208
|
+
means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
|
209
|
+
# Decode delta encoding of means
|
210
|
+
x = 0
|
211
|
+
means.map! do |m|
|
212
|
+
m += x
|
213
|
+
x = m
|
214
|
+
m
|
215
|
+
end
|
216
|
+
counts_bytes = bytes[(start_idx + 4 * size)..-1].unpack('C*')
|
217
|
+
counts = []
|
218
|
+
# Decode variable length integer bytes
|
219
|
+
size.times do
|
220
|
+
v = counts_bytes.shift
|
221
|
+
z = 0x7f & v
|
222
|
+
shift = 7
|
223
|
+
while (v & 0x80) != 0
|
224
|
+
fail 'Shift too large in decode' if shift > 28
|
225
|
+
v = counts_bytes.shift || 0
|
226
|
+
z += (v & 0x7f) << shift
|
227
|
+
shift += 7
|
228
|
+
end
|
229
|
+
counts << z
|
230
|
+
end
|
231
|
+
# This shouldn't happen
|
232
|
+
fail 'Mismatch' unless counts.size == means.size
|
233
|
+
else
|
234
|
+
fail 'Unknown compression format'
|
235
|
+
end
|
236
|
+
if means && counts
|
237
|
+
means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
|
238
|
+
end
|
239
|
+
tdigest
|
240
|
+
end
|
152
241
|
|
153
|
-
|
242
|
+
def self.from_json(array)
|
243
|
+
tdigest = new
|
244
|
+
# Handle both string and symbol keys
|
245
|
+
array.each { |a| tdigest.push(a['m'] || a[:m], a['n'] || a[:n]) }
|
246
|
+
tdigest
|
247
|
+
end
|
154
248
|
|
249
|
+
private
|
155
250
|
|
156
251
|
def _add_weight(nearest, x, n)
|
157
252
|
unless x == nearest.mean
|
158
253
|
nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
|
159
254
|
end
|
160
255
|
|
256
|
+
_cumulate(false, true) if nearest.mean_cumn.nil?
|
257
|
+
|
161
258
|
nearest.cumn += n
|
162
|
-
nearest.mean_cumn += n / 2
|
259
|
+
nearest.mean_cumn += n / 2.0
|
163
260
|
nearest.n += n
|
164
261
|
@n += n
|
165
262
|
|
166
263
|
nil
|
167
264
|
end
|
168
265
|
|
169
|
-
def _cumulate(exact = false)
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
266
|
+
def _cumulate(exact = false, force = false)
|
267
|
+
unless force
|
268
|
+
factor = if @last_cumulate == 0
|
269
|
+
Float::INFINITY
|
270
|
+
else
|
271
|
+
(@n.to_f / @last_cumulate)
|
272
|
+
end
|
273
|
+
return if @n == @last_cumulate || (!exact && @cx && @cx > (factor))
|
174
274
|
end
|
175
275
|
|
176
276
|
cumn = 0
|
177
277
|
@centroids.each do |_, c|
|
178
|
-
c.mean_cumn = cumn + c.n / 2
|
278
|
+
c.mean_cumn = cumn + c.n / 2.0
|
179
279
|
cumn = c.cumn = cumn + c.n
|
180
280
|
end
|
181
281
|
@n = @last_cumulate = cumn
|
data/lib/tdigest/version.rb
CHANGED
data/tdigest.gemspec
CHANGED
@@ -21,7 +21,9 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_runtime_dependency 'rbtree', '~> 0.4.2'
|
23
23
|
|
24
|
-
spec.add_development_dependency
|
25
|
-
spec.add_development_dependency
|
26
|
-
spec.add_development_dependency
|
24
|
+
spec.add_development_dependency 'bundler', '~> 1.10'
|
25
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
+
spec.add_development_dependency 'minitest', '~> 5.8.3'
|
27
|
+
spec.add_development_dependency 'coveralls', '~> 0.8.10'
|
28
|
+
spec.add_development_dependency 'simplecov', '~> 0.11.1'
|
27
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tdigest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sebastian Wallin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbtree
|
@@ -56,16 +56,44 @@ dependencies:
|
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 5.8.3
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 5.8.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: coveralls
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 0.8.10
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- - "
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.8.10
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.11.1
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
67
95
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
96
|
+
version: 0.11.1
|
69
97
|
description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
|
70
98
|
email:
|
71
99
|
- sebastian.wallin@gmail.com
|
@@ -74,6 +102,7 @@ extensions: []
|
|
74
102
|
extra_rdoc_files: []
|
75
103
|
files:
|
76
104
|
- ".gitignore"
|
105
|
+
- ".ruby-version"
|
77
106
|
- ".travis.yml"
|
78
107
|
- Gemfile
|
79
108
|
- LICENSE.txt
|