fast_statistics 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.rspec +3 -0
- data/.travis.yml +5 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +21 -0
- data/README.md +164 -0
- data/Rakefile +36 -0
- data/benchmark/base.rb +125 -0
- data/benchmark/benchmark.rb +103 -0
- data/benchmark/helpers.rb +12 -0
- data/bin/console +14 -0
- data/bin/rake +29 -0
- data/bin/rspec +29 -0
- data/bin/setup +8 -0
- data/ext/fast_statistics/array_2d.cpp +188 -0
- data/ext/fast_statistics/array_2d.h +60 -0
- data/ext/fast_statistics/debug.h +36 -0
- data/ext/fast_statistics/extconf.rb +21 -0
- data/ext/fast_statistics/fast_statistics.cpp +190 -0
- data/ext/fast_statistics/fast_statistics.h +14 -0
- data/fast_statistics.gemspec +27 -0
- data/lib/fast_statistics.rb +5 -0
- data/lib/fast_statistics/version.rb +3 -0
- metadata +71 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 77ab0c69365318bdd1f68d562c0460ed573d12bf1301542e65fe7ff204f70118
|
4
|
+
data.tar.gz: ff64565f990ea2d6f77c52afbb63d7271f25737744be191146d5935d69727a30
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 898331717252505cff2e972ee2fb380709b070209197bf56374fe3c398aa906945c71a543f435d493aa5f5c5f7e7c65100979772f56bd5d28eb68967788fa6d3
|
7
|
+
data.tar.gz: e725e1d41c7749b8bc14eca78e46f574d477e12ddce3c8cd09817af82b4581ca3dcf24f2f2e2ea06128c2c65fe906ba365c38c7e911d20210b474b698065a01b
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
/.bundle/
|
2
|
+
/.yardoc
|
3
|
+
/_yardoc/
|
4
|
+
/coverage/
|
5
|
+
/doc/
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/tmp/
|
9
|
+
|
10
|
+
Gemfile.lock
|
11
|
+
|
12
|
+
*.so
|
13
|
+
*.o
|
14
|
+
compile_flags.txt
|
15
|
+
compile_commands.json
|
16
|
+
ext/fast_statistics/Makefile
|
17
|
+
ext/fast_statistics/mkmf.log
|
18
|
+
.clang-format
|
19
|
+
.cache
|
20
|
+
|
21
|
+
squiggles.txt
|
22
|
+
README.md.html
|
data/.rspec
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
source "https://rubygems.org"
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in fast_statistics.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem "rake", "~> 13.0"
|
7
|
+
gem "rake-compiler", "~> 1.1"
|
8
|
+
gem "rspec", "~> 3.0"
|
9
|
+
|
10
|
+
# Dependencies for running benchmarks
|
11
|
+
gem "benchmark-ips"
|
12
|
+
gem "terminal-table"
|
13
|
+
gem "descriptive_statistics"
|
14
|
+
gem "ruby_native_statistics"
|
15
|
+
gem "numo-narray"
|
16
|
+
gem "nmatrix"
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2020 Martin Nyaga
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
# FastStatistics
|
2
|
+

|
3
|
+
|
4
|
+
A high performance native ruby extension (written in C++) for computation of
|
5
|
+
descriptive statistics.
|
6
|
+
|
7
|
+
## Overview
|
8
|
+
This gem provides fast computation of descriptive statistics (min, max, mean,
|
9
|
+
median, 1st and 3rd quartiles, population standard deviation) for a multivariate
|
10
|
+
dataset (represented as a 2D array) in ruby.
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
gem 'fast_statistics'
|
18
|
+
```
|
19
|
+
|
20
|
+
And then execute:
|
21
|
+
|
22
|
+
$ bundle install
|
23
|
+
|
24
|
+
Or install it yourself as:
|
25
|
+
|
26
|
+
$ gem install fast_statistics
|
27
|
+
|
28
|
+
## Usage
|
29
|
+
|
30
|
+
Given you have some multivariate (2-dimensional) data:
|
31
|
+
```ruby
|
32
|
+
data = [
|
33
|
+
[0.6269, 0.3783, 0.1477, 0.2374],
|
34
|
+
[0.4209, 0.1055, 0.8000, 0.2023],
|
35
|
+
[0.1124, 0.1021, 0.1936, 0.8566],
|
36
|
+
[0.6454, 0.5362, 0.4567, 0.8309],
|
37
|
+
[0.4828, 0.1572, 0.5706, 0.4085],
|
38
|
+
[0.5594, 0.0979, 0.4078, 0.5885],
|
39
|
+
[0.8659, 0.5346, 0.5566, 0.6166],
|
40
|
+
[0.7256, 0.5841, 0.8546, 0.3918]
|
41
|
+
]
|
42
|
+
```
|
43
|
+
|
44
|
+
You can compute descriptive statistics for all the inner arrays as follows:
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
require "fast_statistics"
|
48
|
+
|
49
|
+
FastStatistics::Array2D.new(data).descriptive_statistics
|
50
|
+
# Result:
|
51
|
+
#
|
52
|
+
# [{:min=>0.1477,
|
53
|
+
# :max=>0.6269,
|
54
|
+
# :mean=>0.347575,
|
55
|
+
# :median=>0.30785,
|
56
|
+
# :q1=>0.214975,
|
57
|
+
# :q3=>0.44045,
|
58
|
+
# :standard_deviation=>0.18100761551658537},
|
59
|
+
# {:min=>0.1055,
|
60
|
+
# :max=>0.8,
|
61
|
+
# :mean=>0.38217500000000004,
|
62
|
+
# :median=>0.3116,
|
63
|
+
# :q1=>0.1781,
|
64
|
+
# :q3=>0.515675,
|
65
|
+
# :standard_deviation=>0.26691825878909076},
|
66
|
+
# ...,
|
67
|
+
# {:min=>0.3918,
|
68
|
+
# :max=>0.8546,
|
69
|
+
# :mean=>0.639025,
|
70
|
+
# :median=>0.6548499999999999,
|
71
|
+
# :q1=>0.536025,
|
72
|
+
# :q3=>0.75785,
|
73
|
+
# :standard_deviation=>0.1718318709523935}]
|
74
|
+
```
|
75
|
+
|
76
|
+
## Benchmarks
|
77
|
+
|
78
|
+
Some alternatives compared are:
|
79
|
+
- [descriptive_statistics](https://github.com/thirtysixthspan/descriptive_statistics)
|
80
|
+
- [ruby-native-statistics](https://github.com/corybuecker/ruby-native-statistics)
|
81
|
+
- [Numo::NArray](https://github.com/ruby-numo/numo-narray)
|
82
|
+
- Hand-written ruby (using the same algorithm implemented in C++ in this gem)
|
83
|
+
|
84
|
+
Benchmarked on my machine (8th gen i7, sse2), this gem is **~11x**
|
85
|
+
faster than an optimal algorithm in hand-written ruby, and **~4.7x** faster than
|
86
|
+
the next fastest available native ruby extension (that I tested).
|
87
|
+
|
88
|
+
You can review the benchmark implementations at `benchmark/benchmark.rb` and run the
|
89
|
+
benchmark with `rake benchmark`.
|
90
|
+
|
91
|
+
Results:
|
92
|
+
```
|
93
|
+
Comparing calculated statistics with 10 values for 8 variables...
|
94
|
+
Test passed, results are equal to 6 decimal places!
|
95
|
+
|
96
|
+
Benchmarking with 100,000 values for 12 variables...
|
97
|
+
Warming up --------------------------------------
|
98
|
+
descriptive_statistics 1.000 i/100ms
|
99
|
+
Custom ruby 1.000 i/100ms
|
100
|
+
narray 1.000 i/100ms
|
101
|
+
ruby_native_statistics 1.000 i/100ms
|
102
|
+
FastStatistics 3.000 i/100ms
|
103
|
+
Calculating -------------------------------------
|
104
|
+
descriptive_statistics 0.473 (± 0.0%) i/s - 3.000 in 6.354555s
|
105
|
+
Custom ruby 2.518 (± 0.0%) i/s - 13.000 in 5.169084s
|
106
|
+
narray 4.231 (± 0.0%) i/s - 22.000 in 5.210299s
|
107
|
+
ruby_native_statistics 5.962 (± 0.0%) i/s - 30.000 in 5.041869s
|
108
|
+
FastStatistics 28.417 (±10.6%) i/s - 141.000 in 5.012229s
|
109
|
+
|
110
|
+
Comparison:
|
111
|
+
FastStatistics: 28.4 i/s
|
112
|
+
ruby_native_statistics: 6.0 i/s - 4.77x (± 0.00) slower
|
113
|
+
narray: 4.2 i/s - 6.72x (± 0.00) slower
|
114
|
+
Custom ruby: 2.5 i/s - 11.29x (± 0.00) slower
|
115
|
+
descriptive_statistics: 0.5 i/s - 60.09x (± 0.00) slower
|
116
|
+
```
|
117
|
+
|
118
|
+
## Background & Implementation
|
119
|
+
|
120
|
+
The inspiration for this gem was a use-case in an analytics ruby application,
|
121
|
+
where we frequently had to compute descriptive statistics for fairly large
|
122
|
+
multivariate datasets. Calculations in ruby were not fast enough, so I
|
123
|
+
first explored performing the computations natively in [this
|
124
|
+
repository](https://github.com/Martin-Nyaga/ruby-ffi-simd). The results were
|
125
|
+
promising, so I decided to package it as a ruby gem.
|
126
|
+
|
127
|
+
The following factors combined help this gem achieve high performance compared
|
128
|
+
to available native alternatives and hand-written computations in ruby:
|
129
|
+
|
130
|
+
- It is written in C++ and so can leverage the speed of native execution.
|
131
|
+
- It minimises the number of operations by calculating the statistics in as few
|
132
|
+
operations as possible (1 sort + 2 loops). Most native alternatives don't
|
133
|
+
provide a built in way to get all these statistics at once. Instead, they only
|
134
|
+
provide APIs where you make single calls for individual statistics. Through
|
135
|
+
such an API, building this set of summary statistics typically ends up looping
|
136
|
+
through the data more times than is necessary.
|
137
|
+
- This gem uses explicit 128-bit-wide SIMD intrinsics (on platforms where they
|
138
|
+
are available) to parallelize computations for 2 variables at the same time
|
139
|
+
where possible, giving an additional speed advantage while still being single
|
140
|
+
threaded.
|
141
|
+
|
142
|
+
That said, there are some limitations in the current implementation:
|
143
|
+
- The variables in the 2D array must all have the same number of data points
|
144
|
+
(inner arrays must have the same length) and contain only numbers (i.e. no
|
145
|
+
`nil` awareness is present).
|
146
|
+
- There is currently no API to calculate single statistics (although this may be
|
147
|
+
made available in the future).
|
148
|
+
|
149
|
+
This is an early release and should be considered unstable, at least until I'm
|
150
|
+
confident in the stability & performance in a real world application setting
|
151
|
+
(let me know in [the Welcome discussion
|
152
|
+
thread](https://github.com/Martin-Nyaga/fast_statistics/discussions/1) if you
|
153
|
+
use it!). I'm also not really an expert in C++, so reviews & suggestions are
|
154
|
+
welcome.
|
155
|
+
|
156
|
+
## Contributing
|
157
|
+
|
158
|
+
Bug reports and pull requests are welcome on GitHub at
|
159
|
+
https://github.com/Martin-Nyaga/fast_statistics.
|
160
|
+
|
161
|
+
## License
|
162
|
+
|
163
|
+
The gem is available as open source under the terms of the [MIT
|
164
|
+
License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/extensiontask"
|
3
|
+
|
4
|
+
task default: :spec

# Compile the native extension into lib/fast_statistics
Rake::ExtensionTask.new("fast_statistics") do |ext|
  ext.lib_dir = "lib/fast_statistics"
end

# Rspec: the task is only defined when rspec can be loaded
begin
  require "rspec/core/rake_task"
  RSpec::Core::RakeTask.new(:spec, [:spec] => [:clean, :compile])
rescue LoadError
end

# Benchmark: first check that all implementations agree, then time them
task benchmark: [:clean, :compile] do
  require_relative "./benchmark/benchmark"
  bench = DescriptiveStatsBenchmark
  bench.compare_results!
  bench.benchmark_ips!
end

# Profile: run the extension once over a random 12 x 100,000 dataset
task profile: [:clean, :compile] do
  require "fast_statistics"
  $stdout.sync = true

  variables = 12
  length = 100_000
  data = Array.new(variables) { Array.new(length) { rand } }
  FastStatistics::Array2D.new(data, dtype: :float).mean.to_a
  2.times { puts }
end
|
data/benchmark/base.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "benchmark"
|
4
|
+
require "benchmark/ips"
|
5
|
+
require "terminal-table"
|
6
|
+
|
7
|
+
# Small harness for comparing several implementations of the same
# computation. Subclasses register named blocks with +benchmark+; the harness
# can then check that all blocks return matching results and time them.
class BaseBenchmark
  # A registered benchmark: a display name plus the block to execute.
  TestCase = Struct.new(:name, :block) do
    # Calls the registered block with the dataset and returns its result.
    def run(data)
      block.call(data)
    end
  end

  # Raised when two implementations disagree beyond the allowed delta.
  class TestFailure < StandardError
  end

  class << self
    # Registers +block+ under +name+ for the class it is called on.
    def benchmark(name, &block)
      tests.push(TestCase.new(name, block))
    end

    # Benchmarks registered on this class. Kept per class (not in a class
    # variable) so separate benchmark suites do not leak into each other.
    def tests
      @tests ||= []
    end

    # Runs every registered benchmark on one small random dataset and checks
    # that all results agree to +precision+ decimal places. Exits the process
    # with status 1 on a mismatch.
    def compare_results!(data_points: 10, precision: 6)
      data = generate_data(data_points)
      puts("Comparing calculated statistics with #{format_number(data_points)} values for #{data.length} variables...")

      test_results = tests.map { |test| test.run(data) }

      # Uncomment to print results
      # test_results.zip(tests) do |results, test|
      #   print_results(test.name, results, precision)
      # end
      assert_values_within_delta(test_results, 10 ** -precision)
      puts("Test passed, results are equal to #{precision} decimal places!")
      puts
    rescue TestFailure
      puts("Test results did not match!")
      exit(1)
    end

    # Times every registered benchmark with Benchmark.bmbm.
    def benchmark!(data_points: 100_000, variables_count: 12)
      data = generate_data(data_points, variables_count)
      puts("Benchmarking with #{format_number(data_points)} values for #{data.length} variables...")

      ::Benchmark.bmbm do |x|
        tests.each do |test|
          x.report(test.name) { test.run(data) }
        end
      end
    end

    # Times every registered benchmark with benchmark-ips and prints a
    # comparison table.
    def benchmark_ips!(data_points: 100_000, variables_count: 12)
      data = generate_data(data_points, variables_count)
      puts("Benchmarking with #{format_number(data_points)} values for #{data.length} variables...")

      ::Benchmark.ips do |x|
        tests.each do |test|
          x.report(test.name) { test.run(data) }
        end

        # Requested once, after all reports are registered.
        x.compare!
      end
    end

    private

    # Builds +variables+ arrays of +length+ random floats each.
    def generate_data(length, variables = 8)
      Array.new(variables) { Array.new(length) { rand } }
    end

    # Prints one implementation's results as a table (debugging aid).
    def print_results(title, results, precision)
      headers = results[0].keys
      values = results.map { |r| r.values.map { |v| "%.#{precision}f" % v } }
      table = Terminal::Table.new(headings: headers, rows: values)
      puts("#{title}:")
      puts(table)
    end

    # Compares every pair of result sets element by element. Hash elements are
    # compared key by key, anything else is compared directly.
    # Returns true, or raises TestFailure on the first mismatch.
    def assert_values_within_delta(values, delta)
      values.combination(2).each do |expected, actual|
        raise TestFailure, "Results don't match!" unless expected.length == actual.length

        expected.each_with_index do |expected_result, i|
          actual_result = actual[i]

          if actual_result.is_a?(Hash) && expected_result.is_a?(Hash)
            expected_result.each_key do |k|
              assert_in_delta(expected_result[k], actual_result[k], delta)
            end
          else
            assert_in_delta(expected_result, actual_result, delta)
          end
        end
      end

      true
    end

    # Returns true when the two numbers differ by less than +delta+, raises
    # TestFailure otherwise.
    def assert_in_delta(expected, actual, delta)
      raise TestFailure, "Results don't match!" unless (expected - actual).abs < delta

      true
    end

    # Formats an integer with thousands separators, e.g. 100000 -> "100,000".
    def format_number(number)
      number.to_s.reverse.gsub(/(\d{3})(?=\d)/, "\\1,").reverse
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
|
5
|
+
require_relative "./base"
|
6
|
+
require_relative "./helpers"
|
7
|
+
|
8
|
+
require "fast_statistics"
|
9
|
+
require "descriptive_statistics/safe"
|
10
|
+
require "ruby_native_statistics"
|
11
|
+
require "numo/narray"
|
12
|
+
|
13
|
+
# Benchmarks several ways of computing the same descriptive statistics
# (mean, min, max, median, q1, q3, population standard deviation) for every
# variable of a 2D dataset. Each block receives the full dataset and returns
# one hash per variable.
class DescriptiveStatsBenchmark < BaseBenchmark
  class << self
    include Helpers
  end

  benchmark "descriptive_statistics" do |data|
    data.map do |arr|
      stats = DescriptiveStatistics::Stats.new(arr)
      {
        mean: stats.mean,
        min: stats.min,
        max: stats.max,
        median: stats.median,
        q1: stats.percentile(25),
        q3: stats.percentile(75),
        standard_deviation: stats.standard_deviation
      }
    end
  end

  benchmark "Custom ruby" do |data|
    data.map do |arr|
      # Sort a copy: the same dataset is passed to every benchmark and to
      # every iteration, so sorting in place would hand pre-sorted input to
      # everything that runs afterwards.
      sorted = arr.sort

      length = sorted.length
      total = sorted.inject(0) { |acc, x| acc + x }
      mean = total / length
      variance = sorted.inject(0) { |acc, x| acc + ((x - mean) ** 2) / length }

      {
        mean: mean,
        min: sorted.first,
        max: sorted.last,
        median: percentile(50, sorted, length),
        q1: percentile(25, sorted, length),
        q3: percentile(75, sorted, length),
        standard_deviation: Math.sqrt(variance)
      }
    end
  end

  benchmark "narray" do |data|
    data.map do |arr|
      # Keep the sorted array: the result of #sort was previously discarded,
      # so order statistics were read from the unsorted data.
      narr = Numo::DFloat.cast(arr).sort
      length = arr.length
      mean = narr.mean
      variance = 0
      narr.each { |x| variance += ((x - mean) ** 2) / length }

      {
        mean: mean,
        min: narr[0],
        max: narr[length - 1],
        median: percentile(50, narr, length),
        q1: percentile(25, narr, length),
        q3: percentile(75, narr, length),
        standard_deviation: Math.sqrt(variance)
      }
    end
  end

  benchmark "ruby_native_statistics" do |data|
    data.map do |arr|
      {
        mean: arr.mean,
        min: arr.min,
        max: arr.max,
        median: arr.median,
        q1: arr.percentile(0.25),
        q3: arr.percentile(0.75),
        standard_deviation: arr.stdevp
      }
    end
  end

  benchmark "FastStatistics" do |data|
    FastStatistics::Array2D.new(data).descriptive_statistics
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Helpers
  # Linear-interpolation percentile of an already sorted collection.
  #
  # @param p [Numeric] percentile to compute, from 0 to 100
  # @param arr [#[]] sorted values, indexable by integer position
  # @param len [Integer] number of values in +arr+
  # @return [Numeric, nil] the interpolated value (nil for an empty Array)
  # @raise [ArgumentError] when +p+ is outside 0..100
  def percentile(p, arr, len)
    raise ArgumentError, "percentile must be between 0 and 100" unless (0..100).cover?(p)
    return arr[len - 1] if p == 100

    rank = p / 100.0 * (len - 1)
    index = rank.floor
    lower = arr[index]
    # With a single value there is no upper neighbour to interpolate towards.
    return lower if index + 1 >= len

    lower + (arr[index + 1] - lower) * (rank - index)
  end
end
|
12
|
+
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "fast_statistics"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/rake
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# This file was generated by Bundler.
|
6
|
+
#
|
7
|
+
# The application 'rake' is installed as part of a gem, and
|
8
|
+
# this file is here to facilitate running it.
|
9
|
+
#
|
10
|
+
|
11
|
+
require "pathname"
|
12
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
|
13
|
+
Pathname.new(__FILE__).realpath)
|
14
|
+
|
15
|
+
bundle_binstub = File.expand_path("../bundle", __FILE__)
|
16
|
+
|
17
|
+
if File.file?(bundle_binstub)
|
18
|
+
if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
|
19
|
+
load(bundle_binstub)
|
20
|
+
else
|
21
|
+
abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
|
22
|
+
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
require "rubygems"
|
27
|
+
require "bundler/setup"
|
28
|
+
|
29
|
+
load Gem.bin_path("rake", "rake")
|
data/bin/rspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# This file was generated by Bundler.
|
6
|
+
#
|
7
|
+
# The application 'rspec' is installed as part of a gem, and
|
8
|
+
# this file is here to facilitate running it.
|
9
|
+
#
|
10
|
+
|
11
|
+
require "pathname"
|
12
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
|
13
|
+
Pathname.new(__FILE__).realpath)
|
14
|
+
|
15
|
+
bundle_binstub = File.expand_path("../bundle", __FILE__)
|
16
|
+
|
17
|
+
if File.file?(bundle_binstub)
|
18
|
+
if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
|
19
|
+
load(bundle_binstub)
|
20
|
+
else
|
21
|
+
abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
|
22
|
+
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
require "rubygems"
|
27
|
+
require "bundler/setup"
|
28
|
+
|
29
|
+
load Gem.bin_path("rspec-core", "rspec")
|
data/bin/setup
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
#include "array_2d.h"
|
2
|
+
#include "debug.h"
|
3
|
+
|
4
|
+
namespace array_2d
|
5
|
+
{
|
6
|
+
|
7
|
+
DFloat::~DFloat()
|
8
|
+
{
|
9
|
+
free(entries);
|
10
|
+
delete[] stats;
|
11
|
+
}
|
12
|
+
|
13
|
+
DFloat::DFloat(VALUE arrays)
|
14
|
+
{
|
15
|
+
cols = rb_array_len(arrays);
|
16
|
+
rows = rb_array_len(rb_ary_entry(arrays, 0));
|
17
|
+
entries = (double*)malloc(cols * rows * sizeof(double));
|
18
|
+
stats = NULL;
|
19
|
+
|
20
|
+
for (int j = 0; j < cols; j++) {
|
21
|
+
for (int i = 0; i < rows; i++) {
|
22
|
+
entries[j * rows + i] = (double)NUM2DBL(rb_ary_entry(rb_ary_entry(arrays, j), i));
|
23
|
+
}
|
24
|
+
}
|
25
|
+
}
|
26
|
+
|
27
|
+
inline void
|
28
|
+
DFloat::sort(double* col)
|
29
|
+
{
|
30
|
+
std::sort(col, col + rows);
|
31
|
+
}
|
32
|
+
|
33
|
+
inline double
|
34
|
+
DFloat::percentile(double* col, double pct)
|
35
|
+
{
|
36
|
+
if (pct == 1.0) return col[rows - 1];
|
37
|
+
double rank = pct * (double)(rows - 1);
|
38
|
+
int floored_rank = floor(rank);
|
39
|
+
double lower = col[floored_rank];
|
40
|
+
double upper = col[floored_rank + 1];
|
41
|
+
return lower + (upper - lower) * (rank - floored_rank);
|
42
|
+
}
|
43
|
+
|
44
|
+
inline double
|
45
|
+
DFloat::sum(double* col)
|
46
|
+
{
|
47
|
+
double sum = 0.0;
|
48
|
+
for (int row = 0; row < rows; row++) {
|
49
|
+
sum += col[row];
|
50
|
+
}
|
51
|
+
return sum;
|
52
|
+
}
|
53
|
+
|
54
|
+
inline double
|
55
|
+
DFloat::standard_deviation(double* col, double mean)
|
56
|
+
{
|
57
|
+
double variance = 0.0f;
|
58
|
+
for (int i = 0; i < rows; i++) {
|
59
|
+
double value = col[i];
|
60
|
+
double deviation = value - mean;
|
61
|
+
double sqr_deviation = deviation * deviation;
|
62
|
+
variance += (sqr_deviation / (double)rows);
|
63
|
+
}
|
64
|
+
double result = sqrt(variance);
|
65
|
+
return result;
|
66
|
+
}
|
67
|
+
|
68
|
+
Stats*
|
69
|
+
DFloat::descriptive_statistics()
|
70
|
+
{
|
71
|
+
stats = new Stats[cols];
|
72
|
+
|
73
|
+
for (int col = 0; col < cols; col++) {
|
74
|
+
Stats var_stats;
|
75
|
+
double* col_arr = base_ptr(col);
|
76
|
+
|
77
|
+
sort(col_arr);
|
78
|
+
|
79
|
+
var_stats.min = col_arr[0];
|
80
|
+
var_stats.max = col_arr[rows - 1];
|
81
|
+
var_stats.median = percentile(col_arr, 0.5);
|
82
|
+
var_stats.q1 = percentile(col_arr, 0.25);
|
83
|
+
var_stats.q3 = percentile(col_arr, 0.75);
|
84
|
+
double total = sum(col_arr);
|
85
|
+
var_stats.mean = total / (double)rows;
|
86
|
+
var_stats.standard_deviation = standard_deviation(col_arr, var_stats.mean);
|
87
|
+
|
88
|
+
stats[col] = var_stats;
|
89
|
+
}
|
90
|
+
|
91
|
+
return stats;
|
92
|
+
}
|
93
|
+
|
94
|
+
#ifdef HAVE_XMMINTRIN_H
|
95
|
+
inline double
|
96
|
+
DFloat::safe_entry(int col, int row)
|
97
|
+
{
|
98
|
+
if (col < cols) {
|
99
|
+
return *(base_ptr(col) + row);
|
100
|
+
} else {
|
101
|
+
return 0;
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
105
|
+
inline void
|
106
|
+
DFloat::sort_columns(int start_col, int pack_size)
|
107
|
+
{
|
108
|
+
for (int i = 0; i < pack_size; i++) {
|
109
|
+
if ((start_col + i) < cols) {
|
110
|
+
double* col_arr = base_ptr(start_col + i);
|
111
|
+
sort(col_arr);
|
112
|
+
}
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
inline __m128d
|
117
|
+
DFloat::pack(int start_col, int row)
|
118
|
+
{
|
119
|
+
__m128d packed = _mm_set_pd(safe_entry(start_col + 1, row), safe_entry(start_col + 0, row));
|
120
|
+
return packed;
|
121
|
+
}
|
122
|
+
|
123
|
+
inline __m128d
|
124
|
+
DFloat::percentile_packed(int start_col, float pct)
|
125
|
+
{
|
126
|
+
if (pct == 1.0) return pack(start_col, rows - 1);
|
127
|
+
double rank = pct * (double)(rows - 1);
|
128
|
+
int floored_rank = floor(rank);
|
129
|
+
__m128d lower = pack(start_col, floored_rank);
|
130
|
+
__m128d upper = pack(start_col, floored_rank + 1);
|
131
|
+
__m128d upper_minus_lower = _mm_sub_pd(upper, lower);
|
132
|
+
__m128d rank_minus_floored_rank = _mm_sub_pd(_mm_set_pd1(rank), _mm_set_pd1((float)floored_rank));
|
133
|
+
return _mm_add_pd(lower, _mm_mul_pd(upper_minus_lower, rank_minus_floored_rank));
|
134
|
+
}
|
135
|
+
|
136
|
+
Stats*
|
137
|
+
DFloat::descriptive_statistics_packed()
|
138
|
+
{
|
139
|
+
stats = new Stats[cols];
|
140
|
+
const int simd_pack_size = 2;
|
141
|
+
|
142
|
+
__m128d lengths = _mm_set_pd1((double)rows);
|
143
|
+
for (int col = 0; col < cols; col += simd_pack_size) {
|
144
|
+
sort_columns(col, simd_pack_size);
|
145
|
+
|
146
|
+
__m128d mins = pack(col, 0);
|
147
|
+
__m128d maxes = pack(col, rows - 1);
|
148
|
+
__m128d sums = _mm_setzero_pd();
|
149
|
+
for (int row_index = 0; row_index < rows; row_index++) {
|
150
|
+
__m128d packed = pack(col, row_index);
|
151
|
+
sums = _mm_add_pd(sums, packed);
|
152
|
+
}
|
153
|
+
__m128d means = _mm_div_pd(sums, lengths);
|
154
|
+
|
155
|
+
__m128d medians = percentile_packed(col, 0.5f);
|
156
|
+
__m128d q1s = percentile_packed(col, 0.25f);
|
157
|
+
__m128d q3s = percentile_packed(col, 0.75f);
|
158
|
+
|
159
|
+
__m128d variances = _mm_setzero_pd();
|
160
|
+
for (int row_index = 0; row_index < rows; row_index++) {
|
161
|
+
__m128d packed = pack(col, row_index);
|
162
|
+
__m128d deviation = _mm_sub_pd(packed, means);
|
163
|
+
__m128d sqr_deviation = _mm_mul_pd(deviation, deviation);
|
164
|
+
variances = _mm_add_pd(variances, _mm_div_pd(sqr_deviation, lengths));
|
165
|
+
}
|
166
|
+
__m128d stdevs = _mm_sqrt_pd(variances);
|
167
|
+
|
168
|
+
for (int simd_slot_index = 0; simd_slot_index < simd_pack_size; simd_slot_index++) {
|
169
|
+
if ((col + simd_slot_index) < cols) {
|
170
|
+
Stats var_stats;
|
171
|
+
var_stats.min = MM_GET_INDEX(mins, simd_slot_index);
|
172
|
+
var_stats.max = MM_GET_INDEX(maxes, simd_slot_index);
|
173
|
+
var_stats.mean = MM_GET_INDEX(means, simd_slot_index);
|
174
|
+
var_stats.median = MM_GET_INDEX(medians, simd_slot_index);
|
175
|
+
var_stats.q1 = MM_GET_INDEX(q1s, simd_slot_index);
|
176
|
+
var_stats.q3 = MM_GET_INDEX(q3s, simd_slot_index);
|
177
|
+
var_stats.standard_deviation = MM_GET_INDEX(stdevs, simd_slot_index);
|
178
|
+
|
179
|
+
stats[col + simd_slot_index] = var_stats;
|
180
|
+
}
|
181
|
+
}
|
182
|
+
}
|
183
|
+
|
184
|
+
return stats;
|
185
|
+
}
|
186
|
+
|
187
|
+
#endif
|
188
|
+
} // namespace array_2d
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#ifndef ARRAY_2D_H
|
2
|
+
#define ARRAY_2D_H
|
3
|
+
#include <algorithm>
|
4
|
+
#include "ruby.h"
|
5
|
+
|
6
|
+
#ifdef HAVE_XMMINTRIN_H
|
7
|
+
#include <xmmintrin.h>
|
8
|
+
#define MM_GET_INDEX(packed, index) *(((double*)&packed) + index);
|
9
|
+
#endif
|
10
|
+
|
11
|
+
namespace array_2d
|
12
|
+
{
|
13
|
+
|
14
|
+
struct Stats {
|
15
|
+
double min;
|
16
|
+
double max;
|
17
|
+
double mean;
|
18
|
+
double median;
|
19
|
+
double q1;
|
20
|
+
double q3;
|
21
|
+
double standard_deviation;
|
22
|
+
|
23
|
+
Stats()
|
24
|
+
{
|
25
|
+
min = 0.0, max = 0.0, mean = 0.0, median = 0.0, q1 = 0.0, q3 = 0.0, standard_deviation = 0.0;
|
26
|
+
};
|
27
|
+
};
|
28
|
+
|
29
|
+
class DFloat
|
30
|
+
{
|
31
|
+
inline double* base_ptr(int col) { return entries + (col * rows); }
|
32
|
+
inline void sort(double* col);
|
33
|
+
inline double percentile(double* col, double pct);
|
34
|
+
inline double sum(double* col);
|
35
|
+
inline double standard_deviation(double* col, double mean);
|
36
|
+
|
37
|
+
#ifdef HAVE_XMMINTRIN_H
|
38
|
+
inline double safe_entry(int col, int row);
|
39
|
+
inline void sort_columns(int start_col, int pack_size);
|
40
|
+
inline __m128d percentile_packed(int start_col, float pct);
|
41
|
+
inline __m128d pack(int start_col, int row);
|
42
|
+
#endif
|
43
|
+
|
44
|
+
public:
|
45
|
+
int cols;
|
46
|
+
int rows;
|
47
|
+
double* entries;
|
48
|
+
Stats* stats;
|
49
|
+
|
50
|
+
DFloat(VALUE ruby_arr);
|
51
|
+
~DFloat();
|
52
|
+
|
53
|
+
Stats* descriptive_statistics();
|
54
|
+
|
55
|
+
#ifdef HAVE_XMMINTRIN_H
|
56
|
+
Stats* descriptive_statistics_packed();
|
57
|
+
#endif
|
58
|
+
};
|
59
|
+
} // namespace array_2d
|
60
|
+
#endif
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#ifdef DEBUG
|
2
|
+
#ifndef DEBUG_H
|
3
|
+
#define DEBUG_H
|
4
|
+
|
5
|
+
#include <x86intrin.h>
|
6
|
+
#include <stdio.h>
|
7
|
+
#include <string.h>
|
8
|
+
|
9
|
+
#define CONCAT_(a, b) a##b
|
10
|
+
#define CONCAT(a,b) CONCAT_(a, b)
|
11
|
+
#define PROFILE DebugTimer CONCAT(Timer, __COUNTER__)(__func__, __LINE__);
|
12
|
+
|
13
|
+
// Scoped cycle counter: reads the time-stamp counter on construction and, on
// destruction, prints the elapsed count labelled "<function>_<line>".
struct DebugTimer {
  char* name;
  unsigned long long counter;

  DebugTimer(const char* function_name, int line_number) {
    const size_t name_capacity = 200;
    name = (char*)malloc(name_capacity * sizeof(char));

    // A single bounded write: over-long function names are truncated instead
    // of overflowing the buffer, and the line number keeps all its digits.
    if (name != NULL) {
      snprintf(name, name_capacity, "%s_%d", function_name, line_number);
    }

    counter = __rdtsc();
  }

  ~DebugTimer() {
    printf("\n%30s:\t %-10llu", name != NULL ? name : "?", __rdtsc() - counter);
    free(name);
    fflush(stdout);
  }
};
|
33
|
+
#endif
|
34
|
+
#else
|
35
|
+
#define PROFILE();
|
36
|
+
#endif
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "mkmf"
|
2
|
+
|
3
|
+
# Enable debug compile using DEBUG env var
if ENV["DEBUG"]
  puts "Compiling in debug mode..."
  CONFIG["debugflags"] = "-g"
  CONFIG["optflags"] = "-O0"
  $defs << "-DDEBUG"
end

# Disable warnings by stripping these flags from the configured warning flags.
# CONFIG["warnflags"] may be unset on some toolchains, in which case there is
# nothing to strip.
if (warnflags = CONFIG["warnflags"])
  [
    / -Wdeclaration-after-statement/,
    / -Wimplicit-int/,
    / -Wimplicit-function-declaration/,
  ].each { |flag| warnflags.slice!(flag) }
end

# Defines HAVE_XMMINTRIN_H when the header is available
have_header("xmmintrin.h")
create_makefile("fast_statistics/fast_statistics")
|
@@ -0,0 +1,190 @@
|
|
1
|
+
#include "fast_statistics.h"
|
2
|
+
#include "array_2d.h"
|
3
|
+
|
4
|
+
using namespace array_2d;
|
5
|
+
|
6
|
+
static VALUE mFastStatistics;
|
7
|
+
static VALUE cArray2D;
|
8
|
+
|
9
|
+
// Helper
|
10
|
+
/*
 * Converts `num_variables` Stats records into a Ruby Array of Hashes, one Hash
 * per record, keyed by :min, :max, :mean, :median, :q1, :q3 and
 * :standard_deviation (inserted in that order).
 *
 * NOTE(review): `stats` is only read here and never released; who owns that
 * buffer is not visible from this file -- confirm it is freed elsewhere.
 */
VALUE
build_results_hashes(Stats* stats, int num_variables)
{
  VALUE results = rb_ary_new();

  // Intern every key once, outside the loop.
  const VALUE key_min = rb_sym("min");
  const VALUE key_max = rb_sym("max");
  const VALUE key_mean = rb_sym("mean");
  const VALUE key_median = rb_sym("median");
  const VALUE key_q1 = rb_sym("q1");
  const VALUE key_q3 = rb_sym("q3");
  const VALUE key_standard_deviation = rb_sym("standard_deviation");

  int index = 0;
  while (index < num_variables) {
    const Stats& entry = stats[index];
    VALUE row = rb_hash_new();

    rb_hash_aset(row, key_min, DBL2NUM(entry.min));
    rb_hash_aset(row, key_max, DBL2NUM(entry.max));
    rb_hash_aset(row, key_mean, DBL2NUM(entry.mean));
    rb_hash_aset(row, key_median, DBL2NUM(entry.median));
    rb_hash_aset(row, key_q1, DBL2NUM(entry.q1));
    rb_hash_aset(row, key_q3, DBL2NUM(entry.q3));
    rb_hash_aset(row, key_standard_deviation, DBL2NUM(entry.standard_deviation));

    rb_ary_push(results, row);
    ++index;
  }

  return results;
}
|
40
|
+
|
41
|
+
// Common
|
42
|
+
void
|
43
|
+
free_wrapped_array(void* array)
|
44
|
+
{
|
45
|
+
((DFloat*)array)->~DFloat();
|
46
|
+
}
|
47
|
+
|
48
|
+
size_t
|
49
|
+
wrapped_array_size(const void* data)
|
50
|
+
{
|
51
|
+
DFloat* array = (DFloat*)data;
|
52
|
+
size_t size = sizeof(array->entries) + sizeof(*array);
|
53
|
+
return size;
|
54
|
+
}
|
55
|
+
|
56
|
+
// Typed-data descriptor shared by every Array2D instance. It is built by an
// immediately invoked lambda so each field can be assigned by name.
static rb_data_type_t dfloat_wrapper = [] {
  rb_data_type_t type_info{};  // value-initialised: all unset fields are zero

  type_info.wrap_struct_name = "dfloat";

  // No mark callback is registered for the wrapped struct.
  type_info.function.dmark = NULL;
  type_info.function.dfree = free_wrapped_array;
  type_info.function.dsize = wrapped_array_size;

  type_info.data = NULL;
  type_info.flags = RUBY_TYPED_FREE_IMMEDIATELY;

  return type_info;
}();
|
64
|
+
|
65
|
+
/*
 * Allocator for FastStatistics::Array2D.
 *
 * Reserves storage for one DFloat and wraps it in a typed-data object;
 * `initialize` later placement-constructs the DFloat inside this storage.
 *
 * The buffer must therefore be sizeof(DFloat) bytes: the previous
 * pointer-sized allocation was overrun as soon as the object was constructed.
 * It is zero-filled so that an object which is collected before `initialize`
 * has run does not hand uninitialised bytes to the free callback.
 * NOTE(review): ~DFloat is not visible here -- confirm it tolerates a
 * zero-filled object.
 */
VALUE
cArray2D_alloc(VALUE self)
{
  void* dfloat = calloc(1, sizeof(DFloat));
  if (dfloat == NULL) {
    rb_memerror();
  }

  return TypedData_Wrap_Struct(self, &dfloat_wrapper, dfloat);
}
|
72
|
+
|
73
|
+
inline bool
|
74
|
+
cArray2D_check_array_args(VALUE arrays)
|
75
|
+
{
|
76
|
+
if (TYPE(arrays) == T_ARRAY && TYPE(rb_ary_entry(arrays, 0)) == T_ARRAY) {
|
77
|
+
return true;
|
78
|
+
} else {
|
79
|
+
Check_Type(arrays, T_ARRAY);
|
80
|
+
Check_Type(rb_ary_entry(arrays, 0), T_ARRAY);
|
81
|
+
return false;
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
//{{{ Unpacked
|
86
|
+
/*
 * def self.simd_enabled?
 *
 * Bound when the extension is built without xmmintrin.h; always false.
 */
VALUE
simd_disabled(VALUE self)
{
  (void)self;  // receiver is not used
  return Qfalse;
}
|
91
|
+
|
92
|
+
/*
|
93
|
+
* def initialize(arrays)
|
94
|
+
*/
|
95
|
+
VALUE
|
96
|
+
cArray2D_initialize_unpacked(VALUE self, VALUE arrays)
|
97
|
+
{
|
98
|
+
// Typecheck 2d array
|
99
|
+
if (cArray2D_check_array_args(arrays)) {
|
100
|
+
// Initialize dfloat structure to store Dfloat in type wrapper
|
101
|
+
void* dfloat;
|
102
|
+
UNWRAP_DFLOAT(self, dfloat);
|
103
|
+
new (dfloat) DFloat(arrays);
|
104
|
+
}
|
105
|
+
return self;
|
106
|
+
}
|
107
|
+
|
108
|
+
/*
|
109
|
+
* Unpacked descriptive statistics
|
110
|
+
*
|
111
|
+
* def descriptive_statistics
|
112
|
+
*/
|
113
|
+
VALUE
|
114
|
+
cArray2D_descriptive_statistics_unpacked(VALUE self)
|
115
|
+
{
|
116
|
+
void* dfloat_untyped;
|
117
|
+
UNWRAP_DFLOAT(self, dfloat_untyped);
|
118
|
+
|
119
|
+
DFloat* dfloat = ((DFloat*)dfloat_untyped);
|
120
|
+
Stats* stats = dfloat->descriptive_statistics();
|
121
|
+
return build_results_hashes(stats, dfloat->cols);
|
122
|
+
}
|
123
|
+
//}}}
|
124
|
+
|
125
|
+
// Packed
|
126
|
+
#ifdef HAVE_XMMINTRIN_H
|
127
|
+
/*
 * def self.simd_enabled?
 *
 * Bound when the extension is built with xmmintrin.h available; always true.
 */
extern "C" VALUE
simd_enabled(VALUE self)
{
  (void)self;  // receiver is not used
  return Qtrue;
}
|
132
|
+
|
133
|
+
/*
|
134
|
+
* def initialize(arrays)
|
135
|
+
*/
|
136
|
+
VALUE
|
137
|
+
cArray2D_initialize_packed(VALUE self, VALUE arrays)
|
138
|
+
{
|
139
|
+
// Typecheck 2d array
|
140
|
+
if (cArray2D_check_array_args(arrays)) {
|
141
|
+
// Initialize dfloat structure to store Dfloat in type wrapper
|
142
|
+
void* dfloat;
|
143
|
+
UNWRAP_DFLOAT(self, dfloat);
|
144
|
+
new (dfloat) DFloat(arrays);
|
145
|
+
}
|
146
|
+
return self;
|
147
|
+
}
|
148
|
+
|
149
|
+
/*
|
150
|
+
* Packed descriptive statistics
|
151
|
+
*
|
152
|
+
* def descriptive_statistics
|
153
|
+
*/
|
154
|
+
VALUE
|
155
|
+
cArray2D_descriptive_statistics_packed(VALUE self)
|
156
|
+
{
|
157
|
+
void* dfloat_untyped;
|
158
|
+
UNWRAP_DFLOAT(self, dfloat_untyped);
|
159
|
+
|
160
|
+
DFloat* dfloat = ((DFloat*)dfloat_untyped);
|
161
|
+
Stats* stats = dfloat->descriptive_statistics_packed();
|
162
|
+
return build_results_hashes(stats, dfloat->cols);
|
163
|
+
}
|
164
|
+
#endif
|
165
|
+
|
166
|
+
extern "C" void
|
167
|
+
Init_fast_statistics(void)
|
168
|
+
{
|
169
|
+
mFastStatistics = rb_define_module("FastStatistics");
|
170
|
+
cArray2D = rb_define_class_under(mFastStatistics, "Array2D", rb_cData);
|
171
|
+
rb_define_alloc_func(cArray2D, cArray2D_alloc);
|
172
|
+
|
173
|
+
#ifdef HAVE_XMMINTRIN_H
|
174
|
+
rb_define_singleton_method(mFastStatistics, "simd_enabled?", RUBY_METHOD_FUNC(simd_enabled), 0);
|
175
|
+
rb_define_method(cArray2D, "initialize", RUBY_METHOD_FUNC(cArray2D_initialize_packed), 1);
|
176
|
+
rb_define_method(
|
177
|
+
cArray2D,
|
178
|
+
"descriptive_statistics",
|
179
|
+
RUBY_METHOD_FUNC(cArray2D_descriptive_statistics_packed),
|
180
|
+
0);
|
181
|
+
#else
|
182
|
+
rb_define_singleton_method(mFastStatistics, "simd_enabled?", RUBY_METHOD_FUNC(simd_disabled), 0);
|
183
|
+
rb_define_method(cArray2D, "initialize", RUBY_METHOD_FUNC(cArray2D_initialize_unpacked), 1);
|
184
|
+
rb_define_method(
|
185
|
+
cArray2D,
|
186
|
+
"descriptive_statistics",
|
187
|
+
RUBY_METHOD_FUNC(cArray2D_descriptive_statistics_unpacked),
|
188
|
+
0);
|
189
|
+
#endif
|
190
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#ifndef FAST_STATISTICS_H
|
2
|
+
#define FAST_STATISTICS_H
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
#ifdef HAVE_XMMINTRIN_H
|
6
|
+
#include <xmmintrin.h>
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#include "debug.h"
|
10
|
+
|
11
|
+
// Shorthand: interns the C string `str` and returns the matching Ruby Symbol.
#define rb_sym(str) ID2SYM(rb_intern(str))
|
12
|
+
// Stores the pointer wrapped by `obj` into `var` via TypedData_Get_Struct,
// checked against dfloat_wrapper (which must be in scope where the macro is
// expanded). The expansion already ends with ';'.
#define UNWRAP_DFLOAT(obj, var) TypedData_Get_Struct((obj), void*, &dfloat_wrapper, (var));
|
13
|
+
|
14
|
+
#endif
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require_relative 'lib/fast_statistics/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name    = "fast_statistics"
  spec.version = FastStatistics::VERSION
  spec.authors = ["Martin Nyaga"]
  spec.email   = ["martin@martinnyaga.com"]

  # Description and licensing
  spec.summary     = "Fast computation of descriptive statistics in ruby"
  spec.description = "Fast computation of descriptive statistics in ruby using native code and SIMD"
  spec.homepage    = "https://github.com/martin-nyaga/ruby-ffi-simd"
  spec.license     = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  # Links shown on the gem's registry page
  spec.metadata["homepage_uri"]    = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/martin-nyaga/fast_statistics"
  spec.metadata["changelog_uri"]   = "https://github.com/martin-nyaga/fast_statistics"

  # Package every file tracked by git, except test, spec and feature
  # directories. The listing is taken from the directory holding this gemspec.
  gem_root = File.dirname(File.expand_path(__FILE__))
  excluded = %r{^(test|spec|features)/}
  spec.files = Dir.chdir(gem_root) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(excluded) }
  end
  spec.require_paths = ["lib"]

  # Native extension, built at install time
  spec.extensions = ["ext/fast_statistics/extconf.rb"]
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fast_statistics
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Martin Nyaga
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-03-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Fast computation of descriptive statistics in ruby using native code
|
14
|
+
and SIMD
|
15
|
+
email:
|
16
|
+
- martin@martinnyaga.com
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/fast_statistics/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- ".gitignore"
|
23
|
+
- ".rspec"
|
24
|
+
- ".travis.yml"
|
25
|
+
- Gemfile
|
26
|
+
- LICENSE.txt
|
27
|
+
- README.md
|
28
|
+
- Rakefile
|
29
|
+
- benchmark/base.rb
|
30
|
+
- benchmark/benchmark.rb
|
31
|
+
- benchmark/helpers.rb
|
32
|
+
- bin/console
|
33
|
+
- bin/rake
|
34
|
+
- bin/rspec
|
35
|
+
- bin/setup
|
36
|
+
- ext/fast_statistics/array_2d.cpp
|
37
|
+
- ext/fast_statistics/array_2d.h
|
38
|
+
- ext/fast_statistics/debug.h
|
39
|
+
- ext/fast_statistics/extconf.rb
|
40
|
+
- ext/fast_statistics/fast_statistics.cpp
|
41
|
+
- ext/fast_statistics/fast_statistics.h
|
42
|
+
- fast_statistics.gemspec
|
43
|
+
- lib/fast_statistics.rb
|
44
|
+
- lib/fast_statistics/version.rb
|
45
|
+
homepage: https://github.com/martin-nyaga/ruby-ffi-simd
|
46
|
+
licenses:
|
47
|
+
- MIT
|
48
|
+
metadata:
|
49
|
+
homepage_uri: https://github.com/martin-nyaga/ruby-ffi-simd
|
50
|
+
source_code_uri: https://github.com/martin-nyaga/fast_statistics
|
51
|
+
changelog_uri: https://github.com/martin-nyaga/fast_statistics
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.3.0
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubygems_version: 3.1.4
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: Fast computation of descriptive statistics in ruby
|
71
|
+
test_files: []
|