daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
module DaruLite
  module Maths
    module Statistics
      # Column-wise statistics for a dataframe-like host object.
      #
      # The methods below call +self[]+, +row+, +numeric_vectors+ and
      # +only_numerics+ and read +@vectors+, +@data+, +@index+, +@order+ and
      # +@name+, so the including class has to provide all of them.
      module DataFrame
        # @!method mean
        #   Calculate mean of numeric vectors
        # @!method variance_sample
        #   Calculate sample variance of numeric vectors
        # @!method range
        #   Calculate range of numeric vectors
        # @!method median
        #   Calculate median of numeric vectors
        # @!method mode
        #   Calculate mode of numeric vectors
        # @!method std
        #   Calculate sample standard deviation of numeric vectors
        # @!method sum
        #   Calculate sum of numeric vectors
        # @!method count
        #   Count the number of non-nil values in each vector
        # @!method min
        #   Calculate the minimum value of each numeric vector
        # @!method product
        #   Compute the product of each numeric vector
        %i[mean variance_sample range median mode std sum count min product].each do |meth|
          define_method(meth) do
            compute_stats meth
          end
        end

        # Public shorthands. They are declared here, above +private+, so that
        # their position in the file matches their (public) visibility.
        alias sds std
        alias variance variance_sample

        # Calculate the maximum value of each numeric vector.
        #
        # @param opts [Hash] when +opts[:vector]+ is given, the row(s) found at
        #   the index label(s) reported by that vector's +max_index+ are
        #   returned instead of the per-vector maxima.
        def max(opts = {})
          if opts[:vector]
            row[*self[opts[:vector]].max_index.index.to_a]
          else
            compute_stats :max
          end
        end

        # @!method cumsum
        #   Calculate cumulative sum of each numeric Vector
        # @!method standardize
        #   Standardize each Vector
        # @!method acf(max_lags)
        #   Calculate Autocorrelation coefficient
        #   @param max_lags [Integer] (nil) Number of initial lags
        # @!method ema(n,wilder)
        #   Calculate exponential moving average.
        #   @param n [Integer] (10) Loopback length.
        #   @param wilder [TrueClass, FalseClass, NilClass] (false) If true,
        #     1/n value is used for smoothing; if false, uses 2/(n+1) value.
        # @!method rolling_mean(n)
        #   Calculate moving averages
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_median(n)
        #   Calculate moving median
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_max(n)
        #   Calculate moving max
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_min(n)
        #   Calculate moving min
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_count(n)
        #   Calculate moving non-missing count
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_std(n)
        #   Calculate moving standard deviation
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_variance(n)
        #   Calculate moving variance
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_sum(n)
        #   Calculate moving sum
        #   @param n [Integer] Loopback length, forwarded to the vector-level
        #     method of the same name.
        %i[
          cumsum standardize acf ema rolling_mean rolling_median rolling_max
          rolling_min rolling_count rolling_std rolling_variance rolling_sum
        ].each do |meth|
          define_method(meth) do |*args|
            apply_method_to_numerics meth, *args
          end
        end

        # Create a summary of count, mean, standard deviation, min and max of
        # each numeric vector in the dataframe in one shot.
        #
        # == Arguments
        #
        # +methods+ - An array with aggregation methods specified as symbols to
        # be applied to numeric vectors. Default is [:count, :mean, :std, :min,
        # :max]. Methods will be applied in the specified order.
        def describe(methods = nil)
          methods ||= %i[count mean std min max]

          description_hash = numeric_vectors.each_with_object({}) do |vec, hash|
            column = self[vec]
            hash[vec] = methods.map { |m| column.send(m) }
          end
          DaruLite::DataFrame.new(description_hash, index: methods)
        end

        # The percent_change method computes the percent change over
        # the given number of periods for numeric vectors.
        #
        # @param periods [Integer] (1) number of periods to compare across;
        #   forwarded unchanged to each vector's +percent_change+.
        #
        # @example
        #
        #   df = DaruLite::DataFrame.new({
        #        'col0' => [1,2,3,4,5,6],
        #        'col2' => ['a','b','c','d','e','f'],
        #        'col1' => [11,22,33,44,55,66]
        #        },
        #        index: ['one', 'two', 'three', 'four', 'five', 'six'],
        #        order: ['col0', 'col1', 'col2'])
        #   df.percent_change
        #   #=>
        #   # <DaruLite::DataFrame:23513280 @rows: 6 @cols: 2>
        #   #              col0               col1
        #   # one
        #   # two          1.0                1.0
        #   # three        0.5                0.5
        #   # four         0.3333333333333333 0.3333333333333333
        #   # five         0.25               0.25
        #   # six          0.2                0.2
        def percent_change(periods = 1)
          df_numeric = only_numerics.vectors.to_a
          df = DaruLite::DataFrame.new({}, order: @order, index: @index, name: @name)
          df_numeric.each do |vec|
            df[vec] = self[vec].percent_change periods
          end
          df
        end

        # Calculate sample variance-covariance between the numeric vectors.
        #
        # Diagonal cells hold each vector's own +variance+. Off-diagonal cells
        # are computed once per unordered pair and only over the observations
        # that are present in both vectors.
        def covariance
          vectors = numeric_vectors
          # cov(a, b) == cov(b, a), so every computed value is stored under
          # both key orders and the second lookup is a plain hash hit.
          cache = Hash.new do |h, (col, row)|
            value = vector_cov(self[row], self[col])
            h[[col, row]] = value
            h[[row, col]] = value
          end

          mat_rows = vectors.map do |row|
            vectors.map do |col|
              row == col ? self[row].variance : cache[[col, row]]
            end
          end

          DaruLite::DataFrame.rows(mat_rows, index: vectors, order: vectors)
        end

        alias cov covariance

        # Calculate the correlation between the numeric vectors.
        #
        # Every covariance cell is divided by the product of the two
        # corresponding standard deviations.
        def correlation
          vectors = numeric_vectors
          standard_deviation = std.to_matrix
          # transpose * itself: cell (i, j) is std(i) * std(j).
          std_products = standard_deviation.transpose * standard_deviation
          corr_arry = cov.to_matrix.elementwise_division(std_products).to_a

          DaruLite::DataFrame.rows(corr_arry, index: vectors, order: vectors)
        end

        alias corr correlation

        private

        # Send +method+ (with +args+) to every numeric vector and collect the
        # results in a new dataframe that keeps the current index and the
        # names of the numeric vectors, in their original order.
        def apply_method_to_numerics(method, *args)
          numerics = @vectors.to_a.map { |n| [n, @data[@vectors[n]]] }
                             .select { |_n, v| v.numeric? }
          computed = numerics.map { |_n, v| v.send(method, *args) }

          DaruLite::DataFrame.new(computed, index: @index, order: numerics.map(&:first), clone: false)
        end

        # Sample covariance of two vectors.
        #
        # Numerator and denominator are both taken over the same set of
        # observations: the positions at which neither vector holds a missing
        # value. Returns Float::NAN when fewer than two such positions exist,
        # because a sample covariance is undefined in that case.
        def vector_cov(v1a, v2a)
          pairs = complete_observation_pairs(v1a, v2a)
          return Float::NAN if pairs.size < 2

          cross_deviation_sum(pairs) / (pairs.size - 1)
        end

        # Sum of the products of deviations from the mean, taken over the
        # observations that are present in both vectors.
        def sum_of_squares(v1, v2)
          cross_deviation_sum(complete_observation_pairs(v1, v2))
        end

        # Position-aligned [v1 value, v2 value] pairs, skipping every position
        # where either side is missing, so the remaining pairs stay aligned.
        def complete_observation_pairs(v1, v2)
          v1.to_a.zip(v2.to_a).reject do |pair|
            pair.any? { |value| missing_observation?(value) }
          end
        end

        # True for anything listed in DaruLite::MISSING_VALUES and for any NaN.
        # The explicit nan? check is needed because NaN never compares equal
        # to itself, so a computed NaN is not found by Array#include?.
        def missing_observation?(value)
          DaruLite::MISSING_VALUES.include?(value) ||
            (value.respond_to?(:nan?) && value.nan?)
        end

        # Sum over all pairs of (a - mean_a) * (b - mean_b).
        def cross_deviation_sum(pairs)
          return 0 if pairs.empty?

          count = pairs.size
          m1 = pairs.sum(&:first).fdiv(count)
          m2 = pairs.sum(&:last).fdiv(count)
          pairs.inject(0) { |ac, (a, b)| ac + ((a - m1) * (b - m2)) }
        end

        # Apply +method+ to every numeric vector and return the results as a
        # vector keyed by vector name and named after the statistic.
        def compute_stats(method)
          DaruLite::Vector.new(
            numeric_vectors.each_with_object({}) do |vec, hash|
              hash[vec] = self[vec].send(method)
            end, name: method
          )
        end
      end
    end
  end
end
|