tdigest 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/specs.yml +22 -0
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/README.md +5 -9
- data/Rakefile +7 -5
- data/bin/console +4 -3
- data/lib/tdigest/centroid.rb +7 -4
- data/lib/tdigest/tdigest.rb +28 -31
- data/lib/tdigest/version.rb +3 -1
- data/lib/tdigest.rb +3 -3
- data/tdigest.gemspec +24 -16
- metadata +25 -26
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2b5f999c42d1051e26230facc685f22b8d7cc1c6c31af9e5680a202c7cb4f653
|
4
|
+
data.tar.gz: 5580752fe6f646fdc1774e4f5eb04c6170f2ce9dbdb325de3daae950fe4895eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d0eb2f8fd8a1645b035cb9245ca5659de2059e0fcc4899159fd8a600b8d23f13b919638d53787622956b11ced97fefd4898b6caa264f47429bfcc35a6a1d214
|
7
|
+
data.tar.gz: cbf7ce26a01bfcf8cff1496b31921227a97ebf21f74a7877a8c978475f69da5dc409763d308fb852488f05fa9cd3d4537ea309c05614d6c81c09db4488d81cc9
|
@@ -0,0 +1,22 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
ruby-version: [2.7, 3.1]
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v3
|
13
|
+
- name: Set up Ruby ${{ matrix.ruby-version }}
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby-version }}
|
17
|
+
- name: Install dependencies
|
18
|
+
run: |
|
19
|
+
gem update --system
|
20
|
+
bundle install
|
21
|
+
- name: Run tests
|
22
|
+
run: bundle exec rake test
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.1.3
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
#
|
1
|
+
# t-digest Ruby
|
2
2
|
|
3
|
+
[CI build status](https://github.com/castle/tdigest/actions/workflows/specs.yml)
|
3
4
|
[Gem Version](https://badge.fury.io/rb/tdigest)
|
4
|
-
[Build Status](https://travis-ci.org/castle/tdigest)
|
5
|
-
[Coverage Status](https://coveralls.io/github/castle/tdigest?branch=master)
|
6
5
|
|
7
6
|
Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
|
8
7
|
|
@@ -37,12 +36,11 @@ puts td.p_rank(0.95)
|
|
37
36
|
|
38
37
|
#### Serialization
|
39
38
|
|
40
|
-
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest
|
39
|
+
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest persistence in [Chapter 3 in the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
|
41
40
|
|
42
41
|
**Standard encoding**
|
43
42
|
|
44
|
-
This encoding uses 8-byte Double for the means and a 4-byte
|
45
|
-
Size per centroid is a fixed 12-bytes.
|
43
|
+
This encoding uses 8-byte Double for the means and a 4-byte integer for counts. Size per centroid is a fixed 12-bytes.
|
46
44
|
|
47
45
|
```ruby
|
48
46
|
bytes = tdigest.as_bytes
|
@@ -50,8 +48,7 @@ bytes = tdigest.as_bytes
|
|
50
48
|
|
51
49
|
**Compressed encoding**
|
52
50
|
|
53
|
-
This encoding uses delta encoding with 4-byte floats for the means and variable
|
54
|
-
length encoding for the counts. Size per centroid is between 5-12 bytes.
|
51
|
+
This encoding uses delta encoding with 4-byte floats for the means and variable length encoding for the counts. Size per centroid is between 5-12 bytes.
|
55
52
|
|
56
53
|
```ruby
|
57
54
|
bytes = tdigest.as_small_bytes
|
@@ -79,4 +76,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/castle
|
|
79
76
|
## License
|
80
77
|
|
81
78
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
82
|
-
|
data/Rakefile
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rake/testtask'
|
3
5
|
|
4
6
|
Rake::TestTask.new(:test) do |t|
|
5
|
-
t.libs <<
|
6
|
-
t.libs <<
|
7
|
+
t.libs << 'test'
|
8
|
+
t.libs << 'lib'
|
7
9
|
t.test_files = FileList['test/**/*_test.rb']
|
8
10
|
end
|
9
11
|
|
10
|
-
task :
|
12
|
+
task default: :test
|
data/bin/console
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
-
require
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'tdigest'
|
5
6
|
|
6
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
8
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +11,5 @@ require "tdigest"
|
|
10
11
|
# require "pry"
|
11
12
|
# Pry.start
|
12
13
|
|
13
|
-
require
|
14
|
+
require 'irb'
|
14
15
|
IRB.start
|
data/lib/tdigest/centroid.rb
CHANGED
@@ -1,10 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module TDigest
|
2
4
|
class Centroid
|
3
5
|
attr_accessor :mean, :n, :cumn, :mean_cumn
|
4
|
-
def initialize(
|
5
|
-
|
6
|
-
|
7
|
-
|
6
|
+
def initialize(mean, n, cumn, mean_cumn = nil)
|
7
|
+
@mean = mean
|
8
|
+
@n = n
|
9
|
+
@cumn = cumn
|
10
|
+
@mean_cumn = mean_cumn
|
8
11
|
end
|
9
12
|
|
10
13
|
def as_json(_ = nil)
|
data/lib/tdigest/tdigest.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'rbtree'
|
2
4
|
require 'tdigest/centroid'
|
3
5
|
|
@@ -13,16 +15,15 @@ module TDigest
|
|
13
15
|
@cx = cx
|
14
16
|
@centroids = RBTree.new
|
15
17
|
@nreset = 0
|
18
|
+
@n = 0
|
16
19
|
reset!
|
17
20
|
end
|
18
21
|
|
19
22
|
def +(other)
|
20
23
|
# Uses delta, k and cx from the caller
|
21
24
|
t = self.class.new(@delta, @k, @cx)
|
22
|
-
data =
|
23
|
-
|
24
|
-
t.push_centroid(data.delete_at(rand(data.length)))
|
25
|
-
end
|
25
|
+
data = centroids.values + other.centroids.values
|
26
|
+
t.push_centroid(data.delete_at(rand(data.length))) until data.empty?
|
26
27
|
t
|
27
28
|
end
|
28
29
|
|
@@ -55,7 +56,7 @@ module TDigest
|
|
55
56
|
arr << b
|
56
57
|
n = n >> 7
|
57
58
|
k += 1
|
58
|
-
|
59
|
+
raise 'Unreasonable large number' if k > 6
|
59
60
|
end
|
60
61
|
arr << n
|
61
62
|
end
|
@@ -76,7 +77,7 @@ module TDigest
|
|
76
77
|
def bound_mean_cumn(cumn)
|
77
78
|
last_c = nil
|
78
79
|
bounds = []
|
79
|
-
|
80
|
+
@centroids.each do |_k, v|
|
80
81
|
if v.mean_cumn == cumn
|
81
82
|
bounds << v
|
82
83
|
break
|
@@ -97,10 +98,8 @@ module TDigest
|
|
97
98
|
def compress!
|
98
99
|
points = to_a
|
99
100
|
reset!
|
100
|
-
|
101
|
-
|
102
|
-
end
|
103
|
-
_cumulate(true)
|
101
|
+
push_centroid(points.shuffle)
|
102
|
+
_cumulate(true, true)
|
104
103
|
nil
|
105
104
|
end
|
106
105
|
|
@@ -128,10 +127,8 @@ module TDigest
|
|
128
127
|
end
|
129
128
|
|
130
129
|
def merge!(other)
|
131
|
-
|
132
|
-
|
133
|
-
@centroids = t.centroids
|
134
|
-
compress!
|
130
|
+
push_centroid(other.centroids.values.shuffle)
|
131
|
+
self
|
135
132
|
end
|
136
133
|
|
137
134
|
def p_rank(x)
|
@@ -167,8 +164,9 @@ module TDigest
|
|
167
164
|
p = [p] unless is_array
|
168
165
|
p.map! do |item|
|
169
166
|
unless (0..1).include? item
|
170
|
-
|
167
|
+
raise ArgumentError, "p should be in [0,1], got #{item}"
|
171
168
|
end
|
169
|
+
|
172
170
|
if size == 0
|
173
171
|
nil
|
174
172
|
else
|
@@ -222,7 +220,7 @@ module TDigest
|
|
222
220
|
case format
|
223
221
|
when VERBOSE_ENCODING
|
224
222
|
array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
|
225
|
-
means, counts = array.each_slice(size).to_a
|
223
|
+
means, counts = array.each_slice(size).to_a unless array.empty?
|
226
224
|
when SMALL_ENCODING
|
227
225
|
means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
|
228
226
|
# Decode delta encoding of means
|
@@ -240,7 +238,8 @@ module TDigest
|
|
240
238
|
z = 0x7f & v
|
241
239
|
shift = 7
|
242
240
|
while (v & 0x80) != 0
|
243
|
-
|
241
|
+
raise 'Shift too large in decode' if shift > 28
|
242
|
+
|
244
243
|
v = counts_bytes.shift || 0
|
245
244
|
z += (v & 0x7f) << shift
|
246
245
|
shift += 7
|
@@ -248,9 +247,9 @@ module TDigest
|
|
248
247
|
counts << z
|
249
248
|
end
|
250
249
|
# This shouldn't happen
|
251
|
-
|
250
|
+
raise 'Mismatch' unless counts.size == means.size
|
252
251
|
else
|
253
|
-
|
252
|
+
raise 'Unknown compression format'
|
254
253
|
end
|
255
254
|
if means && counts
|
256
255
|
means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
|
@@ -277,7 +276,6 @@ module TDigest
|
|
277
276
|
nearest.cumn += n
|
278
277
|
nearest.mean_cumn += n / 2.0
|
279
278
|
nearest.n += n
|
280
|
-
@n += n
|
281
279
|
|
282
280
|
nil
|
283
281
|
end
|
@@ -285,11 +283,11 @@ module TDigest
|
|
285
283
|
def _cumulate(exact = false, force = false)
|
286
284
|
unless force
|
287
285
|
factor = if @last_cumulate == 0
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
return if @n == @last_cumulate || (!exact && @cx && @cx >
|
286
|
+
Float::INFINITY
|
287
|
+
else
|
288
|
+
(@n.to_f / @last_cumulate)
|
289
|
+
end
|
290
|
+
return if @n == @last_cumulate || (!exact && @cx && @cx > factor)
|
293
291
|
end
|
294
292
|
|
295
293
|
cumn = 0
|
@@ -311,6 +309,8 @@ module TDigest
|
|
311
309
|
max = max.nil? ? nil : max[1]
|
312
310
|
nearest = find_nearest(x)
|
313
311
|
|
312
|
+
@n += n
|
313
|
+
|
314
314
|
if nearest && nearest.mean == x
|
315
315
|
_add_weight(nearest, x, n)
|
316
316
|
elsif nearest == min
|
@@ -320,7 +320,7 @@ module TDigest
|
|
320
320
|
else
|
321
321
|
p = nearest.mean_cumn.to_f / @n
|
322
322
|
max_n = (4 * @n * @delta * p * (1 - p)).floor
|
323
|
-
if
|
323
|
+
if max_n - nearest.n >= n
|
324
324
|
_add_weight(nearest, x, n)
|
325
325
|
else
|
326
326
|
_new_centroid(x, n, nearest.cumn)
|
@@ -333,17 +333,14 @@ module TDigest
|
|
333
333
|
# it may be due to values being inserted in sorted order.
|
334
334
|
# We combat that by replaying the centroids in random order,
|
335
335
|
# which is what compress! does
|
336
|
-
if @centroids.size > (@k / @delta)
|
337
|
-
compress!
|
338
|
-
end
|
336
|
+
compress! if @centroids.size > (@k / @delta)
|
339
337
|
|
340
338
|
nil
|
341
339
|
end
|
342
340
|
|
343
341
|
def _new_centroid(x, n, cumn)
|
344
|
-
c = Centroid.new(
|
342
|
+
c = Centroid.new(x, n, cumn)
|
345
343
|
@centroids[x] = c
|
346
|
-
@n += n
|
347
344
|
c
|
348
345
|
end
|
349
346
|
end
|
data/lib/tdigest/version.rb
CHANGED
data/lib/tdigest.rb
CHANGED
data/tdigest.gemspec
CHANGED
@@ -1,29 +1,37 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'tdigest/version'
|
5
6
|
|
7
|
+
java = (ENV['RUBY_PLATFORM'] == 'java')
|
8
|
+
|
6
9
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
10
|
+
spec.name = 'tdigest'
|
8
11
|
spec.version = TDigest::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
12
|
+
spec.authors = ['Sebastian Wallin']
|
13
|
+
spec.email = ['sebastian.wallin@gmail.com']
|
11
14
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
15
|
+
spec.summary = 'TDigest for Ruby'
|
16
|
+
spec.description = "Ruby implementation of Dunning's T-Digest for streaming quantile approximation"
|
17
|
+
spec.homepage = 'https://github.com/castle/tdigest'
|
18
|
+
spec.license = 'MIT'
|
16
19
|
|
17
20
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
21
|
+
spec.bindir = 'exe'
|
19
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
23
|
+
spec.require_paths = ['lib']
|
24
|
+
spec.platform = java ? 'java' : 'ruby'
|
21
25
|
|
22
|
-
|
26
|
+
if java
|
27
|
+
spec.add_runtime_dependency 'rbtree-jruby', '~> 0.2.1'
|
28
|
+
else
|
29
|
+
spec.add_runtime_dependency 'rbtree3', '~> 0.6.0'
|
30
|
+
end
|
23
31
|
|
24
|
-
spec.add_development_dependency 'bundler', '~> 1
|
25
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
-
spec.add_development_dependency 'minitest', '~> 5.8.3'
|
32
|
+
spec.add_development_dependency 'bundler', '~> 2.1'
|
27
33
|
spec.add_development_dependency 'coveralls', '~> 0.8.10'
|
28
|
-
spec.add_development_dependency '
|
34
|
+
spec.add_development_dependency 'minitest', '~> 5.8'
|
35
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
36
|
+
spec.add_development_dependency 'simplecov', '~> 0.16.0'
|
29
37
|
end
|
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tdigest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sebastian Wallin
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rbtree3
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.6.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '1
|
33
|
+
version: '2.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1
|
40
|
+
version: '2.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.8.10
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.8.10
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: 5.8
|
61
|
+
version: '5.8'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 5.8
|
68
|
+
version: '5.8'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 12.3.3
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 12.3.3
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: simplecov
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.
|
89
|
+
version: 0.16.0
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0.
|
96
|
+
version: 0.16.0
|
97
97
|
description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
|
98
98
|
email:
|
99
99
|
- sebastian.wallin@gmail.com
|
@@ -101,9 +101,9 @@ executables: []
|
|
101
101
|
extensions: []
|
102
102
|
extra_rdoc_files: []
|
103
103
|
files:
|
104
|
+
- ".github/workflows/specs.yml"
|
104
105
|
- ".gitignore"
|
105
106
|
- ".ruby-version"
|
106
|
-
- ".travis.yml"
|
107
107
|
- Gemfile
|
108
108
|
- LICENSE.txt
|
109
109
|
- README.md
|
@@ -119,7 +119,7 @@ homepage: https://github.com/castle/tdigest
|
|
119
119
|
licenses:
|
120
120
|
- MIT
|
121
121
|
metadata: {}
|
122
|
-
post_install_message:
|
122
|
+
post_install_message:
|
123
123
|
rdoc_options: []
|
124
124
|
require_paths:
|
125
125
|
- lib
|
@@ -134,9 +134,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
|
-
|
138
|
-
|
139
|
-
signing_key:
|
137
|
+
rubygems_version: 3.3.26
|
138
|
+
signing_key:
|
140
139
|
specification_version: 4
|
141
|
-
summary:
|
140
|
+
summary: TDigest for Ruby
|
142
141
|
test_files: []
|