tdigest 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fb2b554c557253ae6826b504943a315bf4f660a0
4
- data.tar.gz: 4f86fdef222ec8f3ddd7746ccfb7767cfbd1bd26
3
+ metadata.gz: d6b00ddf3a1a0b0a5989002fb3cf8fd51b352c6a
4
+ data.tar.gz: 60d1e7f4b42e3e38300f2f9380f9a91077bbd02f
5
5
  SHA512:
6
- metadata.gz: c5b5567069b6958c551721f8c6c6d707e9a08f6f20c23473b39e732c853a92e4ef1e20406dba46d67ddf2b496ce94224501f6605def6f36be0b65b7f586aaa01
7
- data.tar.gz: c00dfc8c3bb483793c5f5b2c8a4ea8ad9b324c4ce506a7930db362d31f42e2f1759215552d25140ab5667be969e62ee79fcc4e72cf7dcb0ca976d0fbae8b95d3
6
+ metadata.gz: d7154db5857ee2b184ff16c1f2afb7ea110be15cb1d4253e5287b0bf6e656232fccdffecf1f0a3ba6f52626039f0498b030a56cc5e71b51f0a6c82c60f15c7f2
7
+ data.tar.gz: 7265929cc8f77b8973cd9d60240c38f3ae3ce29e9af6644e18041c7a56f72ec5980bccf1696cb232865d294853821ce31d8582e57bd26806f79785610b1aa3d9
@@ -0,0 +1 @@
1
+ 2.2.3
@@ -1,4 +1,6 @@
1
1
  language: ruby
2
2
  rvm:
3
+ - 1.9.3
4
+ - 2.1.0
3
5
  - 2.2.3
4
6
  before_install: gem install bundler -v 1.10.6
data/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # Tdigest
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/tdigest.svg)](https://badge.fury.io/rb/tdigest)
4
+ [![Build Status](https://travis-ci.org/castle/tdigest.svg?branch=master)](https://travis-ci.org/castle/tdigest)
5
+ [![Coverage Status](https://coveralls.io/repos/castle/tdigest/badge.svg?branch=master&service=github)](https://coveralls.io/github/castle/tdigest?branch=master)
4
6
 
5
7
  Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
6
8
 
@@ -33,6 +35,36 @@ puts td.percentile(0.5)
33
35
  puts td.p_rank(0.95)
34
36
  ```
35
37
 
38
+ #### Serialization
39
+
40
+ This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest persistence in [Chapter 3 in the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
41
+
42
+ **Standard encoding**
43
+
44
+ This encoding uses 8-byte doubles for the means and 4-byte integers for the counts.
45
+ Size per centroid is a fixed 12 bytes.
46
+
47
+ ```ruby
48
+ bytes = tdigest.as_bytes
49
+ ```
50
+
51
+ **Compressed encoding**
52
+
53
+ This encoding uses delta encoding with 4-byte floats for the means and variable
54
+ length encoding for the counts. Size per centroid is between 5 and 12 bytes.
55
+
56
+ ```ruby
57
+ bytes = tdigest.as_small_bytes
58
+ ```
59
+
60
+ **Deserializing**
61
+
62
+ Deserialization will automatically detect the compression format.
63
+
64
+ ```ruby
65
+ tdigest = TDigest::TDigest.from_bytes(bytes)
66
+ ```
67
+
36
68
  ## Development
37
69
 
38
70
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -6,5 +6,9 @@ module TDigest
6
6
  send("#{p}=", value)
7
7
  end
8
8
  end
9
+
10
+ def as_json(_ = nil)
11
+ { m: mean, n: n }
12
+ end
9
13
  end
10
- end
14
+ end
@@ -3,6 +3,9 @@ require 'tdigest/centroid'
3
3
 
4
4
  module TDigest
5
5
  class TDigest
6
+ VERBOSE_ENCODING = 1
7
+ SMALL_ENCODING = 2
8
+
6
9
  attr_accessor :centroids
7
10
  def initialize(delta = 0.01, k = 25, cx = 1.1)
8
11
  @delta = delta
@@ -13,6 +16,45 @@ module TDigest
13
16
  reset!
14
17
  end
15
18
 
19
+ def as_bytes
20
+ # compression as defined by Java implementation
21
+ output = [VERBOSE_ENCODING, compression, size]
22
+ output += @centroids.map { |_, c| c.mean }
23
+ output += @centroids.map { |_, c| c.n }
24
+ output.pack("LdLd#{size}L#{size}")
25
+ end
26
+
27
+ def as_small_bytes
28
+ output = [SMALL_ENCODING, compression, size]
29
+ x = 0
30
+ # delta encoding allows saving 4-bytes floats
31
+ mean_arr = @centroids.map do |_, c|
32
+ val = c.mean - x
33
+ x = c.mean
34
+ val
35
+ end
36
+ output += mean_arr
37
+ # Variable length encoding of numbers
38
+ c_arr = @centroids.each_with_object([]) do |(_, c), arr|
39
+ k = 0
40
+ n = c.n
41
+ while n < 0 || n > 0x7f
42
+ b = 0x80 | (0x7f & n)
43
+ arr << b
44
+ n = n >> 7
45
+ k += 1
46
+ fail 'Unreasonable large number' if k > 6
47
+ end
48
+ arr << n
49
+ end
50
+ output += c_arr
51
+ output.pack("LdLf#{mean_arr.size}C#{c_arr.size}")
52
+ end
53
+
54
+ def as_json(_ = nil)
55
+ @centroids.map { |_, c| c.as_json }
56
+ end
57
+
16
58
  def bound_mean(x)
17
59
  upper = @centroids.upper_bound(x)
18
60
  lower = @centroids.lower_bound(x)
@@ -50,6 +92,10 @@ module TDigest
50
92
  nil
51
93
  end
52
94
 
95
+ def compression
96
+ 1 / @delta
97
+ end
98
+
53
99
  def find_nearest(x)
54
100
  return nil if size == 0
55
101
 
@@ -149,33 +195,87 @@ module TDigest
149
195
  @centroids.map { |_, c| c }
150
196
  end
151
197
 
198
+ def self.from_bytes(bytes)
199
+ format, compression, size = bytes.unpack('LdL')
200
+ tdigest = new(1 / compression)
201
+
202
+ start_idx = 16 # after header
203
+ case format
204
+ when VERBOSE_ENCODING
205
+ array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
206
+ means, counts = array.each_slice(size).to_a if array.size > 0
207
+ when SMALL_ENCODING
208
+ means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
209
+ # Decode delta encoding of means
210
+ x = 0
211
+ means.map! do |m|
212
+ m += x
213
+ x = m
214
+ m
215
+ end
216
+ counts_bytes = bytes[(start_idx + 4 * size)..-1].unpack('C*')
217
+ counts = []
218
+ # Decode variable length integer bytes
219
+ size.times do
220
+ v = counts_bytes.shift
221
+ z = 0x7f & v
222
+ shift = 7
223
+ while (v & 0x80) != 0
224
+ fail 'Shift too large in decode' if shift > 28
225
+ v = counts_bytes.shift || 0
226
+ z += (v & 0x7f) << shift
227
+ shift += 7
228
+ end
229
+ counts << z
230
+ end
231
+ # This shouldn't happen
232
+ fail 'Mismatch' unless counts.size == means.size
233
+ else
234
+ fail 'Unknown compression format'
235
+ end
236
+ if means && counts
237
+ means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
238
+ end
239
+ tdigest
240
+ end
152
241
 
153
- private
242
+ def self.from_json(array)
243
+ tdigest = new
244
+ # Handle both string and symbol keys
245
+ array.each { |a| tdigest.push(a['m'] || a[:m], a['n'] || a[:n]) }
246
+ tdigest
247
+ end
154
248
 
249
+ private
155
250
 
156
251
  def _add_weight(nearest, x, n)
157
252
  unless x == nearest.mean
158
253
  nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
159
254
  end
160
255
 
256
+ _cumulate(false, true) if nearest.mean_cumn.nil?
257
+
161
258
  nearest.cumn += n
162
- nearest.mean_cumn += n / 2
259
+ nearest.mean_cumn += n / 2.0
163
260
  nearest.n += n
164
261
  @n += n
165
262
 
166
263
  nil
167
264
  end
168
265
 
169
- def _cumulate(exact = false)
170
- factor = @last_cumulate == 0 ? Float::INFINITY : (@n / @last_cumulate)
171
- if @n == @last_cumulate ||
172
- !exact && @cx && @cx > (factor)
173
- return
266
+ def _cumulate(exact = false, force = false)
267
+ unless force
268
+ factor = if @last_cumulate == 0
269
+ Float::INFINITY
270
+ else
271
+ (@n.to_f / @last_cumulate)
272
+ end
273
+ return if @n == @last_cumulate || (!exact && @cx && @cx > (factor))
174
274
  end
175
275
 
176
276
  cumn = 0
177
277
  @centroids.each do |_, c|
178
- c.mean_cumn = cumn + c.n / 2
278
+ c.mean_cumn = cumn + c.n / 2.0
179
279
  cumn = c.cumn = cumn + c.n
180
280
  end
181
281
  @n = @last_cumulate = cumn
@@ -1,3 +1,3 @@
1
1
  module TDigest
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -21,7 +21,9 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_runtime_dependency 'rbtree', '~> 0.4.2'
23
23
 
24
- spec.add_development_dependency "bundler", "~> 1.10"
25
- spec.add_development_dependency "rake", "~> 10.0"
26
- spec.add_development_dependency "minitest"
24
+ spec.add_development_dependency 'bundler', '~> 1.10'
25
+ spec.add_development_dependency 'rake', '~> 10.0'
26
+ spec.add_development_dependency 'minitest', '~> 5.8.3'
27
+ spec.add_development_dependency 'coveralls', '~> 0.8.10'
28
+ spec.add_development_dependency 'simplecov', '~> 0.11.1'
27
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tdigest
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sebastian Wallin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-12-07 00:00:00.000000000 Z
11
+ date: 2016-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbtree
@@ -56,16 +56,44 @@ dependencies:
56
56
  name: minitest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ">="
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 5.8.3
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 5.8.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: coveralls
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
60
74
  - !ruby/object:Gem::Version
61
- version: '0'
75
+ version: 0.8.10
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - ">="
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.8.10
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.11.1
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
67
95
  - !ruby/object:Gem::Version
68
- version: '0'
96
+ version: 0.11.1
69
97
  description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
70
98
  email:
71
99
  - sebastian.wallin@gmail.com
@@ -74,6 +102,7 @@ extensions: []
74
102
  extra_rdoc_files: []
75
103
  files:
76
104
  - ".gitignore"
105
+ - ".ruby-version"
77
106
  - ".travis.yml"
78
107
  - Gemfile
79
108
  - LICENSE.txt