daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,202 @@
|
|
1
|
+
module DaruLite
|
2
|
+
module Maths
|
3
|
+
module Statistics
|
4
|
+
module DataFrame
|
5
|
+
# @!method mean
#   Mean of every numeric vector.
# @!method variance_sample
#   Sample variance of every numeric vector.
# @!method range
#   Range of every numeric vector.
# @!method median
#   Median of every numeric vector.
# @!method mode
#   Mode of every numeric vector.
# @!method std
#   Sample standard deviation of every numeric vector.
# @!method sum
#   Sum of every numeric vector.
# @!method count
#   Number of non-nil values in each vector.
# @!method min
#   Minimum value of every numeric vector.
# @!method product
#   Product of every numeric vector.
%i[mean variance_sample range median mode std sum count min product].each do |stat_name|
  # Each generated method takes no arguments and simply hands its own name
  # to compute_stats.
  define_method(stat_name) { compute_stats(stat_name) }
end
|
30
|
+
|
31
|
+
# Maximum of each numeric vector, or the row(s) located at the maximum of
# one named vector.
#
# @param opts [Hash] options
# @option opts [Object] :vector name of a vector. When given, the index of
#   that vector's +max_index+ result is splatted into +row[...]+ and the
#   outcome of that lookup is returned.
# @return the value of +compute_stats(:max)+ when no +:vector+ option is
#   supplied, otherwise whatever +row[...]+ returns for the selected labels.
def max(opts = {})
  return compute_stats(:max) unless opts[:vector]

  max_labels = self[opts[:vector]].max_index.index.to_a
  row[*max_labels]
end
|
39
|
+
|
40
|
+
# @!method cumsum
#   Cumulative sum of each numeric vector.
# @!method standardize
#   Standardize each vector.
# @!method acf(max_lags)
#   Autocorrelation coefficient.
#   @param max_lags [Integer] (nil) Number of initial lags
# @!method ema(n,wilder)
#   Exponential moving average.
#   @param n [Integer] (10) Loopback length.
#   @param wilder [TrueClass, FalseClass, NilClass] (false) If true,
#     1/n value is used for smoothing; if false, uses 2/(n+1) value.
# @!method rolling_mean(n)
#   Moving average.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_median(n)
#   Moving median.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_max(n)
#   Moving maximum.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_min(n)
#   Moving minimum.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_count(n)
#   Moving count of non-missing values.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_std(n)
#   Moving standard deviation.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_variance(n)
#   Moving variance.
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_sum(n)
#   Moving sum.
#   @param n [Integer] Loopback length (presumably defaults to 10 like the
#     other rolling_* methods - TODO confirm against the vector implementation).
%i[
  cumsum standardize acf ema rolling_mean rolling_median rolling_max
  rolling_min rolling_count rolling_std rolling_variance rolling_sum
].each do |vector_method|
  # Positional arguments are forwarded untouched to the same-named method
  # on every numeric vector via apply_method_to_numerics.
  define_method(vector_method) { |*args| apply_method_to_numerics(vector_method, *args) }
end
|
81
|
+
|
82
|
+
# Build a one-shot summary of every numeric vector in the dataframe.
#
# == Arguments
#
# +methods+ - An array of aggregation method names (symbols) to send to each
# numeric vector. Default is [:count, :mean, :std, :min, :max]. The methods
# are applied in the given order, and that same array becomes the index of
# the resulting dataframe.
def describe(methods = nil)
  methods ||= %i[count mean std min max]

  # One column per numeric vector, one entry per requested statistic.
  summary = numeric_vectors.each_with_object({}) do |name, columns|
    columns[name] = methods.map { |stat| self[name].send(stat) }
  end

  DaruLite::DataFrame.new(summary, index: methods)
end
|
99
|
+
|
100
|
+
# The percent_change method computes the percent change over
|
101
|
+
# the given number of periods for numeric vectors.
|
102
|
+
#
|
103
|
+
# @param [Integer] periods (1) number of nils to insert at the beginning.
|
104
|
+
#
|
105
|
+
# @example
|
106
|
+
#
|
107
|
+
# df = DaruLite::DataFrame.new({
|
108
|
+
# 'col0' => [1,2,3,4,5,6],
|
109
|
+
# 'col2' => ['a','b','c','d','e','f'],
|
110
|
+
# 'col1' => [11,22,33,44,55,66]
|
111
|
+
# },
|
112
|
+
# index: ['one', 'two', 'three', 'four', 'five', 'six'],
|
113
|
+
# order: ['col0', 'col1', 'col2'])
|
114
|
+
# df.percent_change
|
115
|
+
# #=>
|
116
|
+
# # <DaruLite::DataFrame:23513280 @rows: 6 @cols: 2>
|
117
|
+
# # col0 col1
|
118
|
+
# # one
|
119
|
+
# # two 1.0 1.0
|
120
|
+
# # three 0.5 0.5
|
121
|
+
# # four 0.3333333333333333 0.3333333333333333
|
122
|
+
# # five 0.25 0.25
|
123
|
+
# # six 0.2 0.2
|
124
|
+
def percent_change(periods = 1)
  # Only the names of the numeric vectors are needed from only_numerics.
  numeric_names = only_numerics.vectors.to_a

  # Start from an empty frame that shares this frame's index and name.
  # NOTE(review): @order is not assigned anywhere in the visible code, so it
  # is presumably nil here - confirm before relying on it.
  changes = DaruLite::DataFrame.new({}, order: @order, index: @index, name: @name)

  # Fill in one column per numeric vector, delegating the calculation to the
  # vector's own percent_change, and hand back the populated frame.
  numeric_names.each_with_object(changes) do |name, frame|
    frame[name] = self[name].percent_change(periods)
  end
end
|
132
|
+
|
133
|
+
# Sample variance-covariance matrix of the numeric vectors.
#
# Diagonal cells are taken from each vector's own +variance+; every other
# cell comes from +vector_cov+. Results for a pair are memoised under both
# key orders so each unordered pair is computed only once.
#
# @return the value of +DaruLite::DataFrame.rows+ built from the matrix,
#   using +numeric_vectors+ for both index and order.
def covariance
  # On a cache miss, compute the pair once and store it as [col, row] and
  # [row, col]; the block's value (the covariance) is what the lookup returns.
  pair_cov = Hash.new do |memo, (col, row)|
    memo[[row, col]] = (memo[[col, row]] = vector_cov(self[row], self[col]))
  end
  names = numeric_vectors

  matrix = names.map do |row|
    names.map do |col|
      if row == col
        self[row].variance
      else
        pair_cov[[col, row]]
      end
    end
  end

  DaruLite::DataFrame.rows(matrix, index: numeric_vectors, order: numeric_vectors)
end

alias cov covariance
|
152
|
+
|
153
|
+
# Correlation matrix of the numeric vectors.
#
# The covariance matrix is divided element by element by the product
# <tt>std.to_matrix.transpose * std.to_matrix</tt>.
# NOTE(review): this presumably relies on +std.to_matrix+ being a single-row
# matrix so that the product is the n x n table of sd(i) * sd(j) - confirm.
#
# @return the value of +DaruLite::DataFrame.rows+ built from the result,
#   using +numeric_vectors+ for both index and order.
def correlation
  sd_matrix = std.to_matrix
  cov_matrix = cov.to_matrix
  sd_products = sd_matrix.transpose * sd_matrix
  corr_rows = cov_matrix.elementwise_division(sd_products).to_a

  DaruLite::DataFrame.rows(corr_rows, index: numeric_vectors, order: numeric_vectors)
end

alias corr correlation
|
165
|
+
|
166
|
+
private
|
167
|
+
|
168
|
+
# Send +method+ (with +args+) to every numeric vector and collect the
# results into a new dataframe.
#
# @param method [Symbol] name of the vector method to invoke
# @param args [Array] positional arguments forwarded to that method
# @return [DaruLite::DataFrame] frame that reuses this frame's index, is
#   ordered by the numeric vector names, and is built with +clone: false+
def apply_method_to_numerics(method, *args)
  # Keep only the names whose backing vector reports itself as numeric.
  numeric_names = @vectors.to_a.select { |name| @data[@vectors[name]].numeric? }
  results = numeric_names.map { |name| @data[@vectors[name]].send(method, *args) }

  DaruLite::DataFrame.new(results, index: @index, order: numeric_names, clone: false)
end
|
175
|
+
|
176
|
+
# Sample covariance of two positionally aligned vectors, computed over
# pairwise-complete observations.
#
# Only positions where BOTH vectors hold a usable value contribute, and the
# divisor is (number of such pairs - 1). Dividing by the raw vector size
# would understate the covariance whenever values are missing.
#
# With exactly one complete pair the divisor is zero, so the result is
# whatever dividing the (float) zero sum by 0 yields; with no complete pairs
# the result is 0.
#
# @param v1a [DaruLite::Vector] first vector
# @param v2a [DaruLite::Vector] second vector, aligned by position with +v1a+
# @return [Numeric] cross-deviation sum divided by (complete pairs - 1)
def vector_cov(v1a, v2a)
  xs, ys = pairwise_complete_values(v1a, v2a)
  cross_deviation_sum(xs, ys) / (xs.size - 1)
end

# Sum over complete pairs of (x - mean_x) * (y - mean_y).
#
# @param v1 [DaruLite::Vector] first vector
# @param v2 [DaruLite::Vector] second vector, aligned by position with +v1+
# @return [Numeric] the cross-deviation sum (0 when there are no complete pairs)
def sum_of_squares(v1, v2)
  xs, ys = pairwise_complete_values(v1, v2)
  cross_deviation_sum(xs, ys)
end

# Cross-deviation sum of two equally sized plain arrays of present values.
def cross_deviation_sum(xs, ys)
  return 0 if xs.empty?

  # Means are taken through DaruLite::Vector so they are computed exactly as
  # they are for vectors without missing values.
  m1 = DaruLite::Vector.new(xs).mean
  m2 = DaruLite::Vector.new(ys).mean
  xs.zip(ys).inject(0) { |acc, (x, y)| acc + ((x - m1) * (y - m2)) }
end

# Split two vectors into parallel arrays holding only the positions where
# neither side is missing.
#
# A value counts as missing when it is nil or NaN.
# NOTE(review): this is meant to mirror DaruLite::MISSING_VALUES, which is
# presumably [nil, Float::NAN] - confirm.
def pairwise_complete_values(v1, v2)
  complete = v1.to_a.zip(v2.to_a).reject do |pair|
    pair.any? { |value| value.nil? || (value.respond_to?(:nan?) && value.nan?) }
  end
  # transpose on an empty array would give [], not the two arrays callers
  # destructure.
  complete.empty? ? [[], []] : complete.transpose
end
|
189
|
+
|
190
|
+
# Apply one statistic to every numeric vector.
#
# @param method [Symbol] name of the vector method to send
# @return [DaruLite::Vector] vector built from a hash that maps each numeric
#   vector name to its statistic, and named after +method+
def compute_stats(method)
  per_vector = {}
  numeric_vectors.each { |name| per_vector[name] = self[name].send(method) }

  DaruLite::Vector.new(per_vector, name: method)
end

# Alternative names for the generated statistic methods.
alias sds std
alias variance variance_sample
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|