daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,202 @@
1
module DaruLite
  module Maths
    module Statistics
      # Column-wise statistics for a dataframe-like host object.
      #
      # Every method works on the host's numeric vectors only. The including
      # object is expected to provide +numeric_vectors+, +only_numerics+,
      # +row+ and +[]+, together with the +@vectors+, +@data+, +@index+ and
      # +@name+ instance variables (presumably DaruLite::DataFrame -- the
      # include site is not part of this file).
      module DataFrame
        # @!method mean
        #   Calculate mean of numeric vectors
        # @!method variance_sample
        #   Calculate sample variance of numeric vectors
        # @!method range
        #   Calculate range of numeric vectors
        # @!method median
        #   Calculate median of numeric vectors
        # @!method mode
        #   Calculate mode of numeric vectors
        # @!method std
        #   Calculate sample standard deviation of numeric vectors
        # @!method sum
        #   Calculate sum of numeric vectors
        # @!method count
        #   Count the number of non-nil values in each vector
        # @!method min
        #   Calculate the minimum value of each numeric vector
        # @!method product
        #   Compute the product of each numeric vector
        %i[mean variance_sample range median mode std sum count min product].each do |meth|
          define_method(meth) { compute_stats meth }
        end

        # Public shorthands for the statistics generated above. +alias+ copies
        # the visibility of the aliased method, so both remain public.
        alias sds std
        alias variance variance_sample

        # Calculate the maximum value of each numeric vector.
        #
        # @param opts [Hash] options
        # @option opts [Object] :vector name of a vector. When given, the
        #   lookup +row[...]+ is performed with the index entries reported by
        #   that vector's +max_index+, instead of computing per-vector maxima.
        # @return the per-vector maxima (a DaruLite::Vector named +:max+), or
        #   the result of the +row+ lookup when +:vector+ is given.
        def max(opts = {})
          return compute_stats(:max) unless opts[:vector]

          row[*self[opts[:vector]].max_index.index.to_a]
        end

        # @!method cumsum
        #   Calculate cumulative sum of each numeric Vector
        # @!method standardize
        #   Standardize each Vector
        # @!method acf(max_lags)
        #   Calculate Autocorrelation coefficient
        #   @param max_lags [Integer] (nil) Number of initial lags
        # @!method ema(n,wilder)
        #   Calculate exponential moving average.
        #   @param n [Integer] (10) Loopback length.
        #   @param wilder [TrueClass, FalseClass, NilClass] (false) If true,
        #     1/n value is used for smoothing; if false, uses 2/(n+1) value.
        # @!method rolling_mean(n)
        #   Calculate moving averages
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_median(n)
        #   Calculate moving median
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_max(n)
        #   Calculate moving max
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_min(n)
        #   Calculate moving min
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_count(n)
        #   Calculate moving non-missing count
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_std(n)
        #   Calculate moving standard deviation
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_variance(n)
        #   Calculate moving variance
        #   @param n [Integer] (10) Loopback length. Default to 10.
        # @!method rolling_sum(n)
        #   Calculate moving sum
        #   @param n [Integer] Loopback length, forwarded to the vector's
        #     +rolling_sum+ (presumably defaults to 10 like the other
        #     rolling_* methods -- confirm against Vector).
        %i[
          cumsum standardize acf ema rolling_mean rolling_median rolling_max
          rolling_min rolling_count rolling_std rolling_variance rolling_sum
        ].each do |meth|
          define_method(meth) do |*args|
            apply_method_to_numerics meth, *args
          end
        end

        # Create a summary of count, mean, standard deviation, min and max of
        # each numeric vector in the dataframe in one shot.
        #
        # == Arguments
        #
        # +methods+ - An array with aggregation methods specified as symbols to
        # be applied to numeric vectors. Default is [:count, :mean, :std, :min,
        # :max]. Methods will be applied in the specified order.
        #
        # @return [DaruLite::DataFrame] one column per numeric vector, indexed
        #   by the aggregation method names.
        def describe(methods = nil)
          methods ||= %i[count mean std min max]

          description_hash = numeric_vectors.each_with_object({}) do |vec, hash|
            hash[vec] = methods.map { |m| self[vec].send(m) }
          end
          DaruLite::DataFrame.new(description_hash, index: methods)
        end

        # The percent_change method computes the percent change over
        # the given number of periods for numeric vectors.
        #
        # @param [Integer] periods (1) number of periods, forwarded unchanged
        #   to each vector's +percent_change+.
        # @return [DaruLite::DataFrame] a new dataframe holding only the
        #   numeric vectors, sharing this dataframe's index and name.
        #
        # @example
        #
        #   df = DaruLite::DataFrame.new({
        #     'col0' => [1,2,3,4,5,6],
        #     'col2' => ['a','b','c','d','e','f'],
        #     'col1' => [11,22,33,44,55,66]
        #     },
        #     index: ['one', 'two', 'three', 'four', 'five', 'six'],
        #     order: ['col0', 'col1', 'col2'])
        #   df.percent_change
        #   #=>
        #   # <DaruLite::DataFrame:23513280 @rows: 6 @cols: 2>
        #   #                        col0                col1
        #   #   one
        #   #   two                   1.0                 1.0
        #   #   three                 0.5                 0.5
        #   #   four   0.3333333333333333  0.3333333333333333
        #   #   five                 0.25                0.25
        #   #   six                   0.2                 0.2
        def percent_change(periods = 1)
          df_numeric = only_numerics.vectors.to_a
          # NOTE(review): @order is not assigned anywhere in this module; if the
          # host never sets it this passes +order: nil+ -- confirm intended.
          df = DaruLite::DataFrame.new({}, order: @order, index: @index, name: @name)
          df_numeric.each do |vec|
            df[vec] = self[vec].percent_change periods
          end
          df
        end

        # Calculate sample variance-covariance between the numeric vectors.
        #
        # @return [DaruLite::DataFrame] square dataframe whose index and
        #   vectors are both the numeric vector names.
        def covariance
          vectors = numeric_vectors

          # Covariance is symmetric: each computed pair is stored under both
          # key orders so the mirrored cell is served from the cache.
          cache = Hash.new do |h, (col, row)|
            value = vector_cov(self[row], self[col])
            h[[col, row]] = value
            h[[row, col]] = value
          end

          mat_rows = vectors.map do |row|
            vectors.map do |col|
              # The diagonal is the vector's own variance.
              row == col ? self[row].variance : cache[[col, row]]
            end
          end

          DaruLite::DataFrame.rows(mat_rows, index: vectors, order: vectors)
        end

        alias cov covariance

        # Calculate the correlation between the numeric vectors.
        #
        # The covariance matrix is divided element by element by the product
        # +std^T * std+ (presumably +std.to_matrix+ is a 1 x n row matrix, so
        # the product is the n x n table of std(i) * std(j) -- confirm).
        #
        # @return [DaruLite::DataFrame] square dataframe whose index and
        #   vectors are both the numeric vector names.
        def correlation
          vectors = numeric_vectors
          standard_deviation = std.to_matrix
          corr_arry = cov
                      .to_matrix
                      .elementwise_division(standard_deviation.transpose * standard_deviation)
                      .to_a

          DaruLite::DataFrame.rows(corr_arry, index: vectors, order: vectors)
        end

        alias corr correlation

        private

        # Send +method+ (with +args+) to every numeric vector and wrap the
        # results in a new dataframe that keeps this dataframe's index.
        #
        # @param method [Symbol] vector method to invoke
        # @param args [Array] arguments forwarded to that method
        # @return [DaruLite::DataFrame]
        def apply_method_to_numerics(method, *args)
          # Pair every vector name with its data, then keep the numeric ones.
          numerics = @vectors.to_a
                             .map { |n| [n, @data[@vectors[n]]] }
                             .select { |_n, v| v.numeric? }
          computed = numerics.map { |_n, v| v.send(method, *args) }

          DaruLite::DataFrame.new(computed, index: @index, order: numerics.map(&:first), clone: false)
        end

        # Sample covariance of two vectors.
        #
        # NOTE(review): the divisor uses the size of the unfiltered +v1a+,
        # whereas #sum_of_squares drops missing values first; the two agree
        # only when neither vector holds missing values -- confirm intended.
        def vector_cov(v1a, v2a)
          sum_of_squares(v1a, v2a) / (v1a.size - 1)
        end

        # Sum of the products of deviations from the mean of +v1+ and +v2+,
        # computed after removing DaruLite::MISSING_VALUES from each vector
        # independently and re-indexing both.
        #
        # NOTE(review): because the vectors are filtered separately, values
        # are paired by position after filtering; vectors with missing values
        # in different places would be misaligned -- confirm against callers.
        def sum_of_squares(v1, v2)
          v1a = v1.reject_values(*DaruLite::MISSING_VALUES)
          v2a = v2.reject_values(*DaruLite::MISSING_VALUES)
          v1a.reset_index!
          v2a.reset_index!
          m1 = v1a.mean
          m2 = v2a.mean
          v1a.size.times.inject(0) { |ac, i| ac + ((v1a[i] - m1) * (v2a[i] - m2)) }
        end

        # Apply the statistic +method+ to every numeric vector.
        #
        # @param method [Symbol] statistic to send to each numeric vector
        # @return [DaruLite::Vector] vector named +method+, keyed by the
        #   numeric vector names.
        def compute_stats(method)
          stats = numeric_vectors.each_with_object({}) do |vec, hash|
            hash[vec] = self[vec].send(method)
          end
          DaruLite::Vector.new(stats, name: method)
        end
      end
    end
  end
end