tdigest 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/specs.yml +22 -0
- data/.ruby-version +1 -1
- data/Gemfile +2 -0
- data/README.md +5 -9
- data/Rakefile +7 -5
- data/bin/console +4 -3
- data/lib/tdigest/centroid.rb +2 -0
- data/lib/tdigest/tdigest.rb +20 -20
- data/lib/tdigest/version.rb +3 -1
- data/lib/tdigest.rb +3 -3
- data/tdigest.gemspec +24 -16
- metadata +25 -26
- data/.travis.yml +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2b5f999c42d1051e26230facc685f22b8d7cc1c6c31af9e5680a202c7cb4f653
|
4
|
+
data.tar.gz: 5580752fe6f646fdc1774e4f5eb04c6170f2ce9dbdb325de3daae950fe4895eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d0eb2f8fd8a1645b035cb9245ca5659de2059e0fcc4899159fd8a600b8d23f13b919638d53787622956b11ced97fefd4898b6caa264f47429bfcc35a6a1d214
|
7
|
+
data.tar.gz: cbf7ce26a01bfcf8cff1496b31921227a97ebf21f74a7877a8c978475f69da5dc409763d308fb852488f05fa9cd3d4537ea309c05614d6c81c09db4488d81cc9
|
@@ -0,0 +1,22 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
strategy:
|
9
|
+
matrix:
|
10
|
+
ruby-version: [2.7, 3.1]
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v3
|
13
|
+
- name: Set up Ruby ${{ matrix.ruby-version }}
|
14
|
+
uses: ruby/setup-ruby@v1
|
15
|
+
with:
|
16
|
+
ruby-version: ${{ matrix.ruby-version }}
|
17
|
+
- name: Install dependencies
|
18
|
+
run: |
|
19
|
+
gem update --system
|
20
|
+
bundle install
|
21
|
+
- name: Run tests
|
22
|
+
run: bundle exec rake test
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.1.3
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
-
#
|
1
|
+
# t-digest Ruby
|
2
2
|
|
3
|
+
[](https://github.com/castle/tdigest/actions/workflows/specs.yml)
|
3
4
|
[](https://badge.fury.io/rb/tdigest)
|
4
|
-
[](https://travis-ci.org/castle/tdigest)
|
5
|
-
[](https://coveralls.io/github/castle/tdigest?branch=master)
|
6
5
|
|
7
6
|
Ruby implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest) data structure.
|
8
7
|
|
@@ -37,12 +36,11 @@ puts td.p_rank(0.95)
|
|
37
36
|
|
38
37
|
#### Serialization
|
39
38
|
|
40
|
-
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest
|
39
|
+
This gem offers the same serialization options as the original [Java implementation](https://github.com/tdunning/t-digest). You can read more about T-digest persistence in [Chapter 3 in the paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf).
|
41
40
|
|
42
41
|
**Standard encoding**
|
43
42
|
|
44
|
-
This encoding uses 8-byte Double for the means and a 4-byte
|
45
|
-
Size per centroid is a fixed 12-bytes.
|
43
|
+
This encoding uses 8-byte Double for the means and a 4-byte integer for counts. Size per centroid is a fixed 12-bytes.
|
46
44
|
|
47
45
|
```ruby
|
48
46
|
bytes = tdigest.as_bytes
|
@@ -50,8 +48,7 @@ bytes = tdigest.as_bytes
|
|
50
48
|
|
51
49
|
**Compressed encoding**
|
52
50
|
|
53
|
-
This encoding uses delta encoding with 4-byte floats for the means and variable
|
54
|
-
length encoding for the counts. Size per centroid is between 5-12 bytes.
|
51
|
+
This encoding uses delta encoding with 4-byte floats for the means and variable length encoding for the counts. Size per centroid is between 5-12 bytes.
|
55
52
|
|
56
53
|
```ruby
|
57
54
|
bytes = tdigest.as_small_bytes
|
@@ -79,4 +76,3 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/castle
|
|
79
76
|
## License
|
80
77
|
|
81
78
|
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
82
|
-
|
data/Rakefile
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rake/testtask'
|
3
5
|
|
4
6
|
Rake::TestTask.new(:test) do |t|
|
5
|
-
t.libs <<
|
6
|
-
t.libs <<
|
7
|
+
t.libs << 'test'
|
8
|
+
t.libs << 'lib'
|
7
9
|
t.test_files = FileList['test/**/*_test.rb']
|
8
10
|
end
|
9
11
|
|
10
|
-
task :
|
12
|
+
task default: :test
|
data/bin/console
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
-
require
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'tdigest'
|
5
6
|
|
6
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
8
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +11,5 @@ require "tdigest"
|
|
10
11
|
# require "pry"
|
11
12
|
# Pry.start
|
12
13
|
|
13
|
-
require
|
14
|
+
require 'irb'
|
14
15
|
IRB.start
|
data/lib/tdigest/centroid.rb
CHANGED
data/lib/tdigest/tdigest.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'rbtree'
|
2
4
|
require 'tdigest/centroid'
|
3
5
|
|
@@ -20,10 +22,8 @@ module TDigest
|
|
20
22
|
def +(other)
|
21
23
|
# Uses delta, k and cx from the caller
|
22
24
|
t = self.class.new(@delta, @k, @cx)
|
23
|
-
data =
|
24
|
-
|
25
|
-
t.push_centroid(data.delete_at(rand(data.length)))
|
26
|
-
end
|
25
|
+
data = centroids.values + other.centroids.values
|
26
|
+
t.push_centroid(data.delete_at(rand(data.length))) until data.empty?
|
27
27
|
t
|
28
28
|
end
|
29
29
|
|
@@ -56,7 +56,7 @@ module TDigest
|
|
56
56
|
arr << b
|
57
57
|
n = n >> 7
|
58
58
|
k += 1
|
59
|
-
|
59
|
+
raise 'Unreasonable large number' if k > 6
|
60
60
|
end
|
61
61
|
arr << n
|
62
62
|
end
|
@@ -77,7 +77,7 @@ module TDigest
|
|
77
77
|
def bound_mean_cumn(cumn)
|
78
78
|
last_c = nil
|
79
79
|
bounds = []
|
80
|
-
|
80
|
+
@centroids.each do |_k, v|
|
81
81
|
if v.mean_cumn == cumn
|
82
82
|
bounds << v
|
83
83
|
break
|
@@ -164,8 +164,9 @@ module TDigest
|
|
164
164
|
p = [p] unless is_array
|
165
165
|
p.map! do |item|
|
166
166
|
unless (0..1).include? item
|
167
|
-
|
167
|
+
raise ArgumentError, "p should be in [0,1], got #{item}"
|
168
168
|
end
|
169
|
+
|
169
170
|
if size == 0
|
170
171
|
nil
|
171
172
|
else
|
@@ -219,7 +220,7 @@ module TDigest
|
|
219
220
|
case format
|
220
221
|
when VERBOSE_ENCODING
|
221
222
|
array = bytes[start_idx..-1].unpack("d#{size}L#{size}")
|
222
|
-
means, counts = array.each_slice(size).to_a
|
223
|
+
means, counts = array.each_slice(size).to_a unless array.empty?
|
223
224
|
when SMALL_ENCODING
|
224
225
|
means = bytes[start_idx..(start_idx + 4 * size)].unpack("f#{size}")
|
225
226
|
# Decode delta encoding of means
|
@@ -237,7 +238,8 @@ module TDigest
|
|
237
238
|
z = 0x7f & v
|
238
239
|
shift = 7
|
239
240
|
while (v & 0x80) != 0
|
240
|
-
|
241
|
+
raise 'Shift too large in decode' if shift > 28
|
242
|
+
|
241
243
|
v = counts_bytes.shift || 0
|
242
244
|
z += (v & 0x7f) << shift
|
243
245
|
shift += 7
|
@@ -245,9 +247,9 @@ module TDigest
|
|
245
247
|
counts << z
|
246
248
|
end
|
247
249
|
# This shouldn't happen
|
248
|
-
|
250
|
+
raise 'Mismatch' unless counts.size == means.size
|
249
251
|
else
|
250
|
-
|
252
|
+
raise 'Unknown compression format'
|
251
253
|
end
|
252
254
|
if means && counts
|
253
255
|
means.zip(counts).each { |val| tdigest.push(val[0], val[1]) }
|
@@ -281,11 +283,11 @@ module TDigest
|
|
281
283
|
def _cumulate(exact = false, force = false)
|
282
284
|
unless force
|
283
285
|
factor = if @last_cumulate == 0
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
return if @n == @last_cumulate || (!exact && @cx && @cx >
|
286
|
+
Float::INFINITY
|
287
|
+
else
|
288
|
+
(@n.to_f / @last_cumulate)
|
289
|
+
end
|
290
|
+
return if @n == @last_cumulate || (!exact && @cx && @cx > factor)
|
289
291
|
end
|
290
292
|
|
291
293
|
cumn = 0
|
@@ -318,7 +320,7 @@ module TDigest
|
|
318
320
|
else
|
319
321
|
p = nearest.mean_cumn.to_f / @n
|
320
322
|
max_n = (4 * @n * @delta * p * (1 - p)).floor
|
321
|
-
if
|
323
|
+
if max_n - nearest.n >= n
|
322
324
|
_add_weight(nearest, x, n)
|
323
325
|
else
|
324
326
|
_new_centroid(x, n, nearest.cumn)
|
@@ -331,9 +333,7 @@ module TDigest
|
|
331
333
|
# it may be due to values being inserted in sorted order.
|
332
334
|
# We combat that by replaying the centroids in random order,
|
333
335
|
# which is what compress! does
|
334
|
-
if @centroids.size > (@k / @delta)
|
335
|
-
compress!
|
336
|
-
end
|
336
|
+
compress! if @centroids.size > (@k / @delta)
|
337
337
|
|
338
338
|
nil
|
339
339
|
end
|
data/lib/tdigest/version.rb
CHANGED
data/lib/tdigest.rb
CHANGED
data/tdigest.gemspec
CHANGED
@@ -1,29 +1,37 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'tdigest/version'
|
5
6
|
|
7
|
+
java = (ENV['RUBY_PLATFORM'] == 'java')
|
8
|
+
|
6
9
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
10
|
+
spec.name = 'tdigest'
|
8
11
|
spec.version = TDigest::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
12
|
+
spec.authors = ['Sebastian Wallin']
|
13
|
+
spec.email = ['sebastian.wallin@gmail.com']
|
11
14
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.license =
|
15
|
+
spec.summary = 'TDigest for Ruby'
|
16
|
+
spec.description = "Ruby implementation of Dunning's T-Digest for streaming quantile approximation"
|
17
|
+
spec.homepage = 'https://github.com/castle/tdigest'
|
18
|
+
spec.license = 'MIT'
|
16
19
|
|
17
20
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
21
|
+
spec.bindir = 'exe'
|
19
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
23
|
+
spec.require_paths = ['lib']
|
24
|
+
spec.platform = java ? 'java' : 'ruby'
|
21
25
|
|
22
|
-
|
26
|
+
if java
|
27
|
+
spec.add_runtime_dependency 'rbtree-jruby', '~> 0.2.1'
|
28
|
+
else
|
29
|
+
spec.add_runtime_dependency 'rbtree3', '~> 0.6.0'
|
30
|
+
end
|
23
31
|
|
24
|
-
spec.add_development_dependency 'bundler', '~> 1
|
25
|
-
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
-
spec.add_development_dependency 'minitest', '~> 5.8.3'
|
32
|
+
spec.add_development_dependency 'bundler', '~> 2.1'
|
27
33
|
spec.add_development_dependency 'coveralls', '~> 0.8.10'
|
28
|
-
spec.add_development_dependency '
|
34
|
+
spec.add_development_dependency 'minitest', '~> 5.8'
|
35
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
36
|
+
spec.add_development_dependency 'simplecov', '~> 0.16.0'
|
29
37
|
end
|
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tdigest
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sebastian Wallin
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rbtree3
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: 0.6.0
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: 0.6.0
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '1
|
33
|
+
version: '2.1'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '1
|
40
|
+
version: '2.1'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.8.10
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.8.10
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: 5.8
|
61
|
+
version: '5.8'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: 5.8
|
68
|
+
version: '5.8'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
75
|
+
version: 12.3.3
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 12.3.3
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: simplecov
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.
|
89
|
+
version: 0.16.0
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 0.
|
96
|
+
version: 0.16.0
|
97
97
|
description: Ruby implementation of Dunning's T-Digest for streaming quantile approximation
|
98
98
|
email:
|
99
99
|
- sebastian.wallin@gmail.com
|
@@ -101,9 +101,9 @@ executables: []
|
|
101
101
|
extensions: []
|
102
102
|
extra_rdoc_files: []
|
103
103
|
files:
|
104
|
+
- ".github/workflows/specs.yml"
|
104
105
|
- ".gitignore"
|
105
106
|
- ".ruby-version"
|
106
|
-
- ".travis.yml"
|
107
107
|
- Gemfile
|
108
108
|
- LICENSE.txt
|
109
109
|
- README.md
|
@@ -119,7 +119,7 @@ homepage: https://github.com/castle/tdigest
|
|
119
119
|
licenses:
|
120
120
|
- MIT
|
121
121
|
metadata: {}
|
122
|
-
post_install_message:
|
122
|
+
post_install_message:
|
123
123
|
rdoc_options: []
|
124
124
|
require_paths:
|
125
125
|
- lib
|
@@ -134,9 +134,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
134
134
|
- !ruby/object:Gem::Version
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
|
-
|
138
|
-
|
139
|
-
signing_key:
|
137
|
+
rubygems_version: 3.3.26
|
138
|
+
signing_key:
|
140
139
|
specification_version: 4
|
141
|
-
summary:
|
140
|
+
summary: TDigest for Ruby
|
142
141
|
test_files: []
|