tdigest 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fb2b554c557253ae6826b504943a315bf4f660a0
4
- data.tar.gz: 4f86fdef222ec8f3ddd7746ccfb7767cfbd1bd26
3
+ metadata.gz: d6b00ddf3a1a0b0a5989002fb3cf8fd51b352c6a
4
+ data.tar.gz: 60d1e7f4b42e3e38300f2f9380f9a91077bbd02f
5
5
  SHA512:
6
- metadata.gz: c5b5567069b6958c551721f8c6c6d707e9a08f6f20c23473b39e732c853a92e4ef1e20406dba46d67ddf2b496ce94224501f6605def6f36be0b65b7f586aaa01
7
- data.tar.gz: c00dfc8c3bb483793c5f5b2c8a4ea8ad9b324c4ce506a7930db362d31f42e2f1759215552d25140ab5667be969e62ee79fcc4e72cf7dcb0ca976d0fbae8b95d3
6
+ metadata.gz: d7154db5857ee2b184ff16c1f2afb7ea110be15cb1d4253e5287b0bf6e656232fccdffecf1f0a3ba6f52626039f0498b030a56cc5e71b51f0a6c82c60f15c7f2
7
+ data.tar.gz: 7265929cc8f77b8973cd9d60240c38f3ae3ce29e9af6644e18041c7a56f72ec5980bccf1696cb232865d294853821ce31d8582e57bd26806f79785610b1aa3d9
@@ -0,0 +1 @@
1
+ 2.2.3
@@ -1,4 +1,6 @@
1
1
  language: ruby
2
2
  rvm:
3
+ - 1.9.3
4
+ - 2.1.0
3
5
  - 2.2.3
4
6
  before_install: gem install bundler -v 1.10.6
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # Tdigest
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/tdigest.svg)](https://badge.fury.io/rb/tdigest)
4
+ [![Build Status](https://travis-ci.org/castle/tdigest.svg?branch=master)](https://travis-ci.org/castle/tdigest)
5
+ [![Coverage Status](https://coveralls.io/repos/castle/tdigest/badge.svg?branch=master&service=github)](https://coveralls.io/github/castle/tdigest?branch=master)
4
6
 
5
7
  Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
6
8
 
@@ -33,6 +35,36 @@ puts td.percentile(0.5)
33
35
  puts td.p_rank(0.95)
34
36
  ```
35
37
 
38
+ #### Serialization
39
+
40
+ This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest persistence in [Chapter 3 in the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
41
+
42
+ **Standard encoding**
43
+
44
+ This encoding uses 8-byte doubles for the means and 4-byte integers for the counts.
45
+ Size per centroid is a fixed 12 bytes.
46
+
47
+ ```ruby
48
+ bytes = tdigest.as_bytes
49
+ ```
50
+
51
+ **Compressed encoding**
52
+
53
+ This encoding uses delta encoding with 4-byte floats for the means and variable
54
+ length encoding for the counts. Size per centroid is between 5 and 12 bytes.
55
+
56
+ ```ruby
57
+ bytes = tdigest.as_small_bytes
58
+ ```
59
+
60
+ **Deserializing**
61
+
62
+ Deserialization will automatically detect the compression format.
63
+
64
+ ```ruby
65
+ tdigest = TDigest::TDigest.from_bytes(bytes)
66
+ ```
67
+
36
68
  ## Development
37
69
 
38
70
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -6,5 +6,9 @@ module TDigest
6
6
  send("#{p}=", value)
7
7
  end
8
8
  end
9
+
10
+ def as_json(_ = nil)
11
+ { m: mean, n: n }
12
+ end
9
13
  end
10
- end
14
+ end
@@ -3,6 +3,9 @@ require 'tdigest/centroid'
3
3
 
4
4
  module TDigest
5
5
  class TDigest
6
+ VERBOSE_ENCODING = 1
7
+ SMALL_ENCODING = 2
8
+
6
9
  attr_accessor :centroids
7
10
  def initialize(delta = 0.01, k = 25, cx = 1.1)
8
11
  @delta = delta
@@ -13,6 +16,45 @@ module TDigest
13
16
  reset!
14
17
  end
15
18
 
19
+ def as_bytes
20
+ # compression as defined by Java implementation
21
+ output = [VERBOSE_ENCODING, compression, size]
22
+ output += @centroids.map { |_, c| c.mean }
23
+ output += @centroids.map { |_, c| c.n }
24
+ output.pack("LdLd#{size}L#{size}")
25
+ end
26
+
27
+ def as_small_bytes
28
+ output = [SMALL_ENCODING, compression, size]
29
+ x = 0
30
+ # delta encoding allows saving 4-bytes floats
31
+ mean_arr = @centroids.map do |_, c|
32
+ val = c.mean - x
33
+ x = c.mean
34
+ val
35
+ end
36
+ output += mean_arr
37
+ # Variable length encoding of numbers
38
+ c_arr = @centroids.each_with_object([]) do |(_, c), arr|
39
+ k = 0
40
+ n = c.n
41
+ while n < 0 || n > 0x7f
42
+ b = 0x80 | (0x7f & n)
43
+ arr << b
44
+ n = n >> 7
45
+ k += 1
46
+ fail 'Unreasonable large number' if k > 6
47
+ end
48
+ arr << n
49
+ end
50
+ output += c_arr
51
+ output.pack("LdLf#{mean_arr.size}C#{c_arr.size}")
52
+ end
53
+
54
+ def as_json(_ = nil)
55
+ @centroids.map { |_, c| c.as_json }
56
+ end
57
+
16
58
  def bound_mean(x)
17
59
  upper = @centroids.upper_bound(x)
18
60
  lower = @centroids.lower_bound(x)
@@ -50,6 +92,10 @@ module TDigest
50
92
  nil
51
93
  end
52
94
 
95
+ def compression
96
+ 1 / @delta
97
+ end
98
+
53
99
  def find_nearest(x)
54
100
  return nil if size == 0
55
101
 
@@ -149,33 +195,87 @@ module TDigest
149
195
  @centroids.map { |_, c| c }
150
196
  end
151
197
 
198
+ def self.from_bytes(bytes)
199
+ format, compression, size = bytes.unpack('LdL')
200
+ tdigest = new(1 / compression)
201
+
202
+ start_idx = 16 # after header
203
+ case format
204
+ when VERBOSE_ENCODING
205
+ array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
206
+ means, counts = array.each_slice(size).to_a if array.size > 0
207
+ when SMALL_ENCODING
208
+ means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
209
+ # Decode delta encoding of means
210
+ x = 0
211
+ means.map! do |m|
212
+ m += x
213
+ x = m
214
+ m
215
+ end
216
+ counts_bytes = bytes[(start_idx + 4 * size)..-1].unpack('C*')
217
+ counts = []
218
+ # Decode variable length integer bytes
219
+ size.times do
220
+ v = counts_bytes.shift
221
+ z = 0x7f & v
222
+ shift = 7
223
+ while (v & 0x80) != 0
224
+ fail 'Shift too large in decode' if shift > 28
225
+ v = counts_bytes.shift || 0
226
+ z += (v & 0x7f) << shift
227
+ shift += 7
228
+ end
229
+ counts << z
230
+ end
231
+ # This shouldn't happen
232
+ fail 'Mismatch' unless counts.size == means.size
233
+ else
234
+ fail 'Unknown compression format'
235
+ end
236
+ if means && counts
237
+ means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
238
+ end
239
+ tdigest
240
+ end
152
241
 
153
- private
242
+ def self.from_json(array)
243
+ tdigest = new
244
+ # Handle both string and symbol keys
245
+ array.each { |a| tdigest.push(a['m'] || a[:m], a['n'] || a[:n]) }
246
+ tdigest
247
+ end
154
248
 
249
+ private
155
250
 
156
251
  def _add_weight(nearest, x, n)
157
252
  unless x == nearest.mean
158
253
  nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
159
254
  end
160
255
 
256
+ _cumulate(false, true) if nearest.mean_cumn.nil?
257
+
161
258
  nearest.cumn += n
162
- nearest.mean_cumn += n / 2
259
+ nearest.mean_cumn += n / 2.0
163
260
  nearest.n += n
164
261
  @n += n
165
262
 
166
263
  nil
167
264
  end
168
265
 
169
- def _cumulate(exact = false)
170
- factor = @last_cumulate == 0 ? Float::INFINITY : (@n / @last_cumulate)
171
- if @n == @last_cumulate ||
172
- !exact && @cx && @cx > (factor)
173
- return
266
+ def _cumulate(exact = false, force = false)
267
+ unless force
268
+ factor = if @last_cumulate == 0
269
+ Float::INFINITY
270
+ else
271
+ (@n.to_f / @last_cumulate)
272
+ end
273
+ return if @n == @last_cumulate || (!exact && @cx && @cx > (factor))
174
274
  end
175
275
 
176
276
  cumn = 0
177
277
  @centroids.each do |_, c|
178
- c.mean_cumn = cumn + c.n / 2
278
+ c.mean_cumn = cumn + c.n / 2.0
179
279
  cumn = c.cumn = cumn + c.n
180
280
  end
181
281
  @n = @last_cumulate = cumn
@@ -1,3 +1,3 @@
1
1
  module TDigest
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -21,7 +21,9 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_runtime_dependency 'rbtree', '~> 0.4.2'
23
23
 
24
- spec.add_development_dependency "bundler", "~> 1.10"
25
- spec.add_development_dependency "rake", "~> 10.0"
26
- spec.add_development_dependency "minitest"
24
+ spec.add_development_dependency 'bundler', '~> 1.10'
25
+ spec.add_development_dependency 'rake', '~> 10.0'
26
+ spec.add_development_dependency 'minitest', '~> 5.8.3'
27
+ spec.add_development_dependency 'coveralls', '~> 0.8.10'
28
+ spec.add_development_dependency 'simplecov', '~> 0.11.1'
27
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tdigest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sebastian Wallin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-12-07 00:00:00.000000000 Z
11
+ date: 2016-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbtree
@@ -56,16 +56,44 @@ dependencies:
56
56
  name: minitest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ">="
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 5.8.3
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 5.8.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: coveralls
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '0'
75
+ version: 0.8.10
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - ">="
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.8.10
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.11.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
67
95
  - !ruby/object:Gem::Version
68
- version: '0'
96
+ version: 0.11.1
69
97
  description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
70
98
  email:
71
99
  - sebastian.wallin@gmail.com
@@ -74,6 +102,7 @@ extensions: []
74
102
  extra_rdoc_files: []
75
103
  files:
76
104
  - ".gitignore"
105
+ - ".ruby-version"
77
106
  - ".travis.yml"
78
107
  - Gemfile
79
108
  - LICENSE.txt