tdigest 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/specs.yml +22 -0
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/README.md +5 -9
- data/Rakefile +7 -5
- data/bin/console +4 -3
- data/lib/tdigest/centroid.rb +2 -0
- data/lib/tdigest/tdigest.rb +20 -20
- data/lib/tdigest/version.rb +3 -1
- data/lib/tdigest.rb +3 -3
- data/tdigest.gemspec +24 -16
- metadata +25 -26
- data/.travis.yml +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2b5f999c42d1051e26230facc685f22b8d7cc1c6c31af9e5680a202c7cb4f653
|
4
|
+
data.tar.gz: 5580752fe6f646fdc1774e4f5eb04c6170f2ce9dbdb325de3daae950fe4895eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d0eb2f8fd8a1645b035cb9245ca5659de2059e0fcc4899159fd8a600b8d23f13b919638d53787622956b11ced97fefd4898b6caa264f47429bfcc35a6a1d214
|
7
|
+
data.tar.gz: cbf7ce26a01bfcf8cff1496b31921227a97ebf21f74a7877a8c978475f69da5dc409763d308fb852488f05fa9cd3d4537ea309c05614d6c81c09db4488d81cc9
|
@@ -0,0 +1,22 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
ruby-version: [2.7, 3.1]
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v3
|
13
|
+
- name: Set up Ruby ${{ matrix.ruby-version }}
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby-version }}
|
17
|
+
- name: Install dependencies
|
18
|
+
run: |
|
19
|
+
gem update --system
|
20
|
+
bundle install
|
21
|
+
- name: Run tests
|
22
|
+
run: bundle exec rake test
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.1.3
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
#
|
1
|
+
# t-digest Ruby
|
2
2
|
|
3
|
+
[![Ruby CI](https://github.com/castle/tdigest/actions/workflows/specs.yml/badge.svg?branch=master)](https://github.com/castle/tdigest/actions/workflows/specs.yml)
|
3
4
|
[![Gem Version](https://badge.fury.io/rb/tdigest.svg)](https://badge.fury.io/rb/tdigest)
|
4
|
-
[![Build Status](https://travis-ci.org/castle/tdigest.svg?branch=master)](https://travis-ci.org/castle/tdigest)
|
5
|
-
[![Coverage Status](https://coveralls.io/repos/castle/tdigest/badge.svg?branch=master&service=github)](https://coveralls.io/github/castle/tdigest?branch=master)
|
6
5
|
|
7
6
|
Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
|
8
7
|
|
@@ -37,12 +36,11 @@ puts td.p_rank(0.95)
|
|
37
36
|
|
38
37
|
#### Serialization
|
39
38
|
|
40
|
-
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest
|
39
|
+
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest persistence in [Chapter 3 in the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
|
41
40
|
|
42
41
|
**Standard encoding**
|
43
42
|
|
44
|
-
This encoding uses 8-byte Double for the means and a 4-byte
|
45
|
-
Size per centroid is a fixed 12-bytes.
|
43
|
+
This encoding uses an 8-byte double for the means and a 4-byte integer for the counts. Size per centroid is a fixed 12 bytes.
|
46
44
|
|
47
45
|
```ruby
|
48
46
|
bytes = tdigest.as_bytes
|
@@ -50,8 +48,7 @@ bytes = tdigest.as_bytes
|
|
50
48
|
|
51
49
|
**Compressed encoding**
|
52
50
|
|
53
|
-
This encoding uses delta encoding with 4-byte floats for the means and variable
|
54
|
-
length encoding for the counts. Size per centroid is between 5-12 bytes.
|
51
|
+
This encoding uses delta encoding with 4-byte floats for the means and variable-length encoding for the counts. Size per centroid is between 5 and 12 bytes.
|
55
52
|
|
56
53
|
```ruby
|
57
54
|
bytes = tdigest.as_small_bytes
|
@@ -79,4 +76,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/castle
|
|
79
76
|
## License
|
80
77
|
|
81
78
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
82
|
-
|
data/Rakefile
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rake/testtask'
|
3
5
|
|
4
6
|
Rake::TestTask.new(:test) do |t|
|
5
|
-
t.libs <<
|
6
|
-
t.libs <<
|
7
|
+
t.libs << 'test'
|
8
|
+
t.libs << 'lib'
|
7
9
|
t.test_files = FileList['test/**/*_test.rb']
|
8
10
|
end
|
9
11
|
|
10
|
-
task :
|
12
|
+
task default: :test
|
data/bin/console
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
-
require
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'tdigest'
|
5
6
|
|
6
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
8
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +11,5 @@ require "tdigest"
|
|
10
11
|
# require "pry"
|
11
12
|
# Pry.start
|
12
13
|
|
13
|
-
require
|
14
|
+
require 'irb'
|
14
15
|
IRB.start
|
data/lib/tdigest/centroid.rb
CHANGED
data/lib/tdigest/tdigest.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'rbtree'
|
2
4
|
require 'tdigest/centroid'
|
3
5
|
|
@@ -20,10 +22,8 @@ module TDigest
|
|
20
22
|
def +(other)
|
21
23
|
# Uses delta, k and cx from the caller
|
22
24
|
t = self.class.new(@delta, @k, @cx)
|
23
|
-
data =
|
24
|
-
|
25
|
-
t.push_centroid(data.delete_at(rand(data.length)))
|
26
|
-
end
|
25
|
+
data = centroids.values + other.centroids.values
|
26
|
+
t.push_centroid(data.delete_at(rand(data.length))) until data.empty?
|
27
27
|
t
|
28
28
|
end
|
29
29
|
|
@@ -56,7 +56,7 @@ module TDigest
|
|
56
56
|
arr << b
|
57
57
|
n = n >> 7
|
58
58
|
k += 1
|
59
|
-
|
59
|
+
raise 'Unreasonable large number' if k > 6
|
60
60
|
end
|
61
61
|
arr << n
|
62
62
|
end
|
@@ -77,7 +77,7 @@ module TDigest
|
|
77
77
|
def bound_mean_cumn(cumn)
|
78
78
|
last_c = nil
|
79
79
|
bounds = []
|
80
|
-
|
80
|
+
@centroids.each do |_k, v|
|
81
81
|
if v.mean_cumn == cumn
|
82
82
|
bounds << v
|
83
83
|
break
|
@@ -164,8 +164,9 @@ module TDigest
|
|
164
164
|
p = [p] unless is_array
|
165
165
|
p.map! do |item|
|
166
166
|
unless (0..1).include? item
|
167
|
-
|
167
|
+
raise ArgumentError, "p should be in [0,1], got #{item}"
|
168
168
|
end
|
169
|
+
|
169
170
|
if size == 0
|
170
171
|
nil
|
171
172
|
else
|
@@ -219,7 +220,7 @@ module TDigest
|
|
219
220
|
case format
|
220
221
|
when VERBOSE_ENCODING
|
221
222
|
array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
|
222
|
-
means, counts = array.each_slice(size).to_a
|
223
|
+
means, counts = array.each_slice(size).to_a unless array.empty?
|
223
224
|
when SMALL_ENCODING
|
224
225
|
means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
|
225
226
|
# Decode delta encoding of means
|
@@ -237,7 +238,8 @@ module TDigest
|
|
237
238
|
z = 0x7f & v
|
238
239
|
shift = 7
|
239
240
|
while (v & 0x80) != 0
|
240
|
-
|
241
|
+
raise 'Shift too large in decode' if shift > 28
|
242
|
+
|
241
243
|
v = counts_bytes.shift || 0
|
242
244
|
z += (v & 0x7f) << shift
|
243
245
|
shift += 7
|
@@ -245,9 +247,9 @@ module TDigest
|
|
245
247
|
counts << z
|
246
248
|
end
|
247
249
|
# This shouldn't happen
|
248
|
-
|
250
|
+
raise 'Mismatch' unless counts.size == means.size
|
249
251
|
else
|
250
|
-
|
252
|
+
raise 'Unknown compression format'
|
251
253
|
end
|
252
254
|
if means && counts
|
253
255
|
means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
|
@@ -281,11 +283,11 @@ module TDigest
|
|
281
283
|
def _cumulate(exact = false, force = false)
|
282
284
|
unless force
|
283
285
|
factor = if @last_cumulate == 0
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
return if @n == @last_cumulate || (!exact && @cx && @cx >
|
286
|
+
Float::INFINITY
|
287
|
+
else
|
288
|
+
(@n.to_f / @last_cumulate)
|
289
|
+
end
|
290
|
+
return if @n == @last_cumulate || (!exact && @cx && @cx > factor)
|
289
291
|
end
|
290
292
|
|
291
293
|
cumn = 0
|
@@ -318,7 +320,7 @@ module TDigest
|
|
318
320
|
else
|
319
321
|
p = nearest.mean_cumn.to_f / @n
|
320
322
|
max_n = (4 * @n * @delta * p * (1 - p)).floor
|
321
|
-
if
|
323
|
+
if max_n - nearest.n >= n
|
322
324
|
_add_weight(nearest, x, n)
|
323
325
|
else
|
324
326
|
_new_centroid(x, n, nearest.cumn)
|
@@ -331,9 +333,7 @@ module TDigest
|
|
331
333
|
# it may be due to values being inserted in sorted order.
|
332
334
|
# We combat that by replaying the centroids in random order,
|
333
335
|
# which is what compress! does
|
334
|
-
if @centroids.size > (@k / @delta)
|
335
|
-
compress!
|
336
|
-
end
|
336
|
+
compress! if @centroids.size > (@k / @delta)
|
337
337
|
|
338
338
|
nil
|
339
339
|
end
|
data/lib/tdigest/version.rb
CHANGED
data/lib/tdigest.rb
CHANGED
data/tdigest.gemspec
CHANGED
@@ -1,29 +1,37 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'tdigest/version'
|
5
6
|
|
7
|
+
java = (ENV['RUBY_PLATFORM'] == 'java')
|
8
|
+
|
6
9
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
10
|
+
spec.name = 'tdigest'
|
8
11
|
spec.version = TDigest::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
12
|
+
spec.authors = ['Sebastian Wallin']
|
13
|
+
spec.email = ['sebastian.wallin@gmail.com']
|
11
14
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
15
|
+
spec.summary = 'TDigest for Ruby'
|
16
|
+
spec.description = "Ruby implementation of Dunning's T-Digest for streaming quantile approximation"
|
17
|
+
spec.homepage = 'https://github.com/castle/tdigest'
|
18
|
+
spec.license = 'MIT'
|
16
19
|
|
17
20
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
21
|
+
spec.bindir = 'exe'
|
19
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
23
|
+
spec.require_paths = ['lib']
|
24
|
+
spec.platform = java ? 'java' : 'ruby'
|
21
25
|
|
22
|
-
|
26
|
+
if java
|
27
|
+
spec.add_runtime_dependency 'rbtree-jruby', '~> 0.2.1'
|
28
|
+
else
|
29
|
+
spec.add_runtime_dependency 'rbtree3', '~> 0.6.0'
|
30
|
+
end
|
23
31
|
|
24
|
-
spec.add_development_dependency 'bundler', '~> 1
|
25
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
-
spec.add_development_dependency 'minitest', '~> 5.8.3'
|
32
|
+
spec.add_development_dependency 'bundler', '~> 2.1'
|
27
33
|
spec.add_development_dependency 'coveralls', '~> 0.8.10'
|
28
|
-
spec.add_development_dependency '
|
34
|
+
spec.add_development_dependency 'minitest', '~> 5.8'
|
35
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
36
|
+
spec.add_development_dependency 'simplecov', '~> 0.16.0'
|
29
37
|
end
|
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tdigest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sebastian Wallin
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rbtree3
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.6.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '1
|
33
|
+
version: '2.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1
|
40
|
+
version: '2.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.8.10
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.8.10
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: 5.8
|
61
|
+
version: '5.8'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 5.8
|
68
|
+
version: '5.8'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 12.3.3
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 12.3.3
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: simplecov
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.
|
89
|
+
version: 0.16.0
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0.
|
96
|
+
version: 0.16.0
|
97
97
|
description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
|
98
98
|
email:
|
99
99
|
- sebastian.wallin@gmail.com
|
@@ -101,9 +101,9 @@ executables: []
|
|
101
101
|
extensions: []
|
102
102
|
extra_rdoc_files: []
|
103
103
|
files:
|
104
|
+
- ".github/workflows/specs.yml"
|
104
105
|
- ".gitignore"
|
105
106
|
- ".ruby-version"
|
106
|
-
- ".travis.yml"
|
107
107
|
- Gemfile
|
108
108
|
- LICENSE.txt
|
109
109
|
- README.md
|
@@ -119,7 +119,7 @@ homepage: https://github.com/castle/tdigest
|
|
119
119
|
licenses:
|
120
120
|
- MIT
|
121
121
|
metadata: {}
|
122
|
-
post_install_message:
|
122
|
+
post_install_message:
|
123
123
|
rdoc_options: []
|
124
124
|
require_paths:
|
125
125
|
- lib
|
@@ -134,9 +134,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
|
-
|
138
|
-
|
139
|
-
signing_key:
|
137
|
+
rubygems_version: 3.3.26
|
138
|
+
signing_key:
|
140
139
|
specification_version: 4
|
141
|
-
summary:
|
140
|
+
summary: TDigest for Ruby
|
142
141
|
test_files: []
|