daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,202 @@
1
+ module DaruLite
2
+ module Maths
3
+ module Statistics
4
+ module DataFrame
5
# @!method mean
#   Calculate mean of numeric vectors
# @!method variance_sample
#   Calculate sample variance of numeric vectors
# @!method range
#   Calculate range of numeric vectors
# @!method median
#   Calculate median of numeric vectors
# @!method mode
#   Calculate mode of numeric vectors
# @!method std
#   Calculate sample standard deviation of numeric vectors
# @!method sum
#   Calculate sum of numeric vectors
# @!method count
#   Count the number of non-nil values in each vector
# @!method min
#   Calculate the minimum value of each numeric vector
# @!method product
#   Compute the product of each numeric vector
#
# Each of these readers takes no arguments and simply forwards its own
# name to +compute_stats+, which applies it to every numeric vector.
%i[mean variance_sample range median mode std sum count min product].each do |stat|
  define_method(stat) { compute_stats(stat) }
end
30
+
31
# Calculate the maximum value of each numeric vector.
#
# Without options this forwards to +compute_stats(:max)+. When
# +opts[:vector]+ is given, the named vector's +max_index+ is looked up and
# the index labels of that result are used to fetch the matching row(s)
# through the +row+ accessor.
#
# @param opts [Hash] optional settings
# @option opts [Object] :vector name of the vector whose maximum selects
#   the row(s) to return
def max(opts = {})
  target = opts[:vector]
  return compute_stats(:max) unless target

  labels = self[target].max_index.index.to_a
  row[*labels]
end
39
+
40
# @!method cumsum
#   Calculate cumulative sum of each numeric Vector
# @!method standardize
#   Standardize each Vector
# @!method acf(max_lags)
#   Calculate Autocorrelation coefficient
#   @param max_lags [Integer] (nil) Number of initial lags
# @!method ema(n,wilder)
#   Calculate exponential moving average.
#   @param n [Integer] (10) Loopback length.
#   @param wilder [TrueClass, FalseClass, NilClass] (false) If true,
#     1/n value is used for smoothing; if false, uses 2/(n+1) value.
# @!method rolling_mean(n)
#   Calculate moving averages
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_median(n)
#   Calculate moving median
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_max(n)
#   Calculate moving max
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_min(n)
#   Calculate moving min
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_count(n)
#   Calculate moving non-missing count
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_std(n)
#   Calculate moving standard deviation
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_variance(n)
#   Calculate moving variance
#   @param n [Integer] (10) Loopback length. Default to 10.
# @!method rolling_sum(n)
#   Calculate moving sum
#   @param n [Integer] Loopback length; presumably defaults to 10 like the
#     other rolling_* methods - confirm against the Vector implementation.
#
# Every method listed below accepts any arguments and hands them, together
# with its own name, to +apply_method_to_numerics+.
%i[
  cumsum standardize acf ema rolling_mean rolling_median rolling_max
  rolling_min rolling_count rolling_std rolling_variance rolling_sum
].each do |transform|
  define_method(transform) { |*params| apply_method_to_numerics(transform, *params) }
end
81
+
82
# Create a summary of count, mean, standard deviation, min and max of
# each numeric vector in the dataframe in one shot.
#
# == Arguments
#
# +methods+ - An array with aggregation methods specified as symbols to
# be applied to numeric vectors. Default is [:count, :mean, :std, :min,
# :max]. Methods will be applied in the specified order.
#
# == Returns
#
# A DaruLite::DataFrame built from a hash that maps every numeric vector
# name to its list of aggregates, with +methods+ as the index.
def describe(methods = nil)
  methods ||= %i[count mean std min max]

  description_hash = numeric_vectors.each_with_object({}) do |vec, summary|
    column = self[vec] # look the vector up once, not once per aggregation
    summary[vec] = methods.map { |m| column.send(m) }
  end
  DaruLite::DataFrame.new(description_hash, index: methods)
end
99
+
100
# The percent_change method computes the percent change over
# the given number of periods for numeric vectors.
#
# A new, initially empty DataFrame is created with this frame's index and
# name; each numeric vector's own +percent_change+ result is then assigned
# into it under the same vector name.
#
# @param [Integer] periods (1) number of nils to insert at the beginning.
#
# @example
#
#   df = DaruLite::DataFrame.new({
#          'col0' => [1,2,3,4,5,6],
#          'col2' => ['a','b','c','d','e','f'],
#          'col1' => [11,22,33,44,55,66]
#        },
#        index: ['one', 'two', 'three', 'four', 'five', 'six'],
#        order: ['col0', 'col1', 'col2'])
#   df.percent_change
#   #=>
#   # <DaruLite::DataFrame:23513280 @rows: 6 @cols: 2>
#   #                       col0                col1
#   #   one
#   #   two                  1.0                 1.0
#   #   three                0.5                 0.5
#   #   four  0.3333333333333333  0.3333333333333333
#   #   five                0.25                0.25
#   #   six                  0.2                 0.2
def percent_change(periods = 1)
  numeric_names = only_numerics.vectors.to_a
  # NOTE(review): @order is not assigned or read anywhere else in this
  # module (the other helpers use @vectors); it may simply be nil here -
  # confirm against the DataFrame class before relying on it.
  result = DaruLite::DataFrame.new({}, order: @order, index: @index, name: @name)
  numeric_names.each_with_object(result) do |name, frame|
    frame[name] = self[name].percent_change(periods)
  end
end
132
+
133
# Calculate sample variance-covariance between the numeric vectors.
#
# Diagonal cells are taken from each vector's own +variance+. Off-diagonal
# cells come from +vector_cov+; every computed value is stored under both
# key orders in a memo hash, so each unordered pair is computed only once.
# The resulting rows are wrapped with DaruLite::DataFrame.rows, using the
# numeric vector names as both index and order.
def covariance
  pairwise = Hash.new do |memo, (left, right)|
    shared = vector_cov(self[right], self[left])
    memo[[left, right]] = shared
    memo[[right, left]] = shared
  end
  names = numeric_vectors

  matrix = names.map do |row_name|
    names.map do |col_name|
      if row_name == col_name
        self[row_name].variance
      else
        pairwise[[col_name, row_name]]
      end
    end
  end

  DaruLite::DataFrame.rows(matrix, index: numeric_vectors, order: numeric_vectors)
end

alias cov covariance
152
+
153
# Calculate the correlation between the numeric vectors.
#
# The covariance matrix is divided element by element by the matrix
# product of the transposed standard deviations with the standard
# deviations. (Presumably +std.to_matrix+ is a single-row matrix, which
# would make that product the table of pairwise std products - confirm.)
# The result is wrapped with DaruLite::DataFrame.rows, using the numeric
# vector names as both index and order.
def correlation
  sd_matrix = std.to_matrix
  cov_matrix = cov.to_matrix
  sd_products = sd_matrix.transpose * sd_matrix
  coefficients = cov_matrix.elementwise_division(sd_products).to_a

  DaruLite::DataFrame.rows(coefficients, index: numeric_vectors, order: numeric_vectors)
end

alias corr correlation
165
+
166
+ private
167
+
168
# Send +method+ (with +args+) to every vector that reports itself as
# numeric and collect the results into a new DataFrame.
#
# The new frame keeps this frame's index, is ordered by the names of the
# vectors that were processed, and is built with +clone: false+.
#
# @param method [Symbol] name of the Vector method to invoke
# @param args [Array] arguments forwarded unchanged to that method
def apply_method_to_numerics(method, *args)
  names = []
  results = []

  @vectors.to_a.each do |name|
    vector = @data[@vectors[name]]
    next unless vector.numeric?

    names << name
    results << vector.send(method, *args)
  end

  DaruLite::DataFrame.new(results, index: @index, order: names, clone: false)
end
175
+
176
# Sample covariance of two equally long vectors.
#
# Only observations where BOTH vectors hold a usable value take part
# (pairwise deletion), and the divisor is the number of such pairs minus
# one. Dividing by the raw vector size instead would count entries that
# never contributed to the sum.
#
# @param v1a [#to_a] first vector
# @param v2a [#to_a] second vector
# @return [Numeric] the covariance; Float::NAN when fewer than two
#   complete pairs exist
def vector_cov(v1a, v2a)
  pairs = cov_complete_pairs(v1a, v2a)
  return Float::NAN if pairs.size < 2

  cov_cross_deviation_sum(pairs) / (pairs.size - 1)
end

# Sum over all complete pairs of (x - mean_x) * (y - mean_y).
#
# @param v1 [#to_a] first vector
# @param v2 [#to_a] second vector
# @return [Numeric] the sum of cross deviations (0 when no pair is complete)
def sum_of_squares(v1, v2)
  cov_cross_deviation_sum(cov_complete_pairs(v1, v2))
end

# Pair the two vectors position by position and drop every pair in which
# either side is missing, so the surviving values stay aligned.
def cov_complete_pairs(v1, v2)
  v1.to_a.zip(v2.to_a).reject do |pair|
    pair.any? { |value| cov_missing?(value) }
  end
end

# True for anything listed in DaruLite::MISSING_VALUES. NaN is tested
# explicitly as well because NaN never equals itself, so a computed NaN
# would not be found by +include?+.
def cov_missing?(value)
  DaruLite::MISSING_VALUES.include?(value) || (value.is_a?(Float) && value.nan?)
end

# Sum of the products of deviations from the respective means, for an
# array of [x, y] pairs.
def cov_cross_deviation_sum(pairs)
  return 0 if pairs.empty?

  count = pairs.size.to_f
  mean_x = pairs.map(&:first).inject(:+) / count
  mean_y = pairs.map(&:last).inject(:+) / count
  pairs.inject(0) { |acc, (x, y)| acc + ((x - mean_x) * (y - mean_y)) }
end
189
+
190
# Apply +method+ to every numeric vector and gather the results.
#
# @param method [Symbol] name of the Vector method to invoke
# @return a DaruLite::Vector built from a hash of vector name => result,
#   named after +method+
def compute_stats(method)
  per_vector = numeric_vectors.map { |name| [name, self[name].send(method)] }.to_h
  DaruLite::Vector.new(per_vector, name: method)
end

# Alternate names for +std+ and +variance_sample+.
# NOTE(review): these aliases sit below the +private+ marker; an alias is
# expected to keep the visibility of the method it copies, so they should
# stay public - confirm.
alias sds std
alias variance variance_sample
199
+ end
200
+ end
201
+ end
202
+ end