daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,655 @@
1
# Specs for DaruLite::Core::GroupBy: grouping a DataFrame by one or more
# vectors, then inspecting, slicing, reducing and aggregating the groups.
describe DaruLite::Core::GroupBy do
  # Shared fixture: an eight-row frame grouped by one (:a), two (:a, :b) and
  # three (:a, :b, :c) vectors, plus the indexes the grouped results are
  # compared against throughout this file.
  before do
    @df = DaruLite::DataFrame.new({
      a: %w[foo bar foo bar foo bar foo foo],
      b: %w[one one two three two two one three],
      c: [1, 2, 3, 1, 3, 6, 3, 8],
      d: [11, 22, 33, 44, 55, 66, 77, 88]
    }, order: [:a, :b, :c, :d])

    @sl_group = @df.group_by(:a)
    @dl_group = @df.group_by([:a, :b])
    @tl_group = @df.group_by([:a, :b, :c])

    @sl_index = DaruLite::Index.new(['bar', 'foo'])
    @dl_multi_index = DaruLite::MultiIndex.from_tuples([
      ['bar', 'one'],
      ['bar', 'three'],
      ['bar', 'two'],
      ['foo', 'one'],
      ['foo', 'three'],
      ['foo', 'two']
    ])
    @tl_multi_index = DaruLite::MultiIndex.from_tuples([
      ['bar', 'one', 2],
      ['bar', 'three', 1],
      ['bar', 'two', 6],
      ['foo', 'one', 1],
      ['foo', 'one', 3],
      ['foo', 'three', 8],
      ['foo', 'two', 3]
    ])
  end

  context 'with nil values' do
    before do
      @df[:w_nils] = DaruLite::Vector.new([11, nil, 33, nil, nil, 66, 77, 88])
    end

    it 'groups by nil values' do
      expect(@df.group_by(:w_nils).groups[[nil]]).to eq([1, 3, 4])
    end

    it 'uses a multi-index when nils are part of the grouping keys' do
      # multi_indexed_grouping? is invoked through send, as in the original spec.
      expect(@df.group_by(:a, :w_nils).send(:multi_indexed_grouping?)).to be true
    end
  end

  context '#initialize' do
    let(:df_emp) do
      DaruLite::DataFrame.new(
        employee: %w[John Jane Mark John Jane Mark],
        month: %w[June June June July July July],
        salary: [1000, 500, 700, 1200, 600, 600]
      )
    end

    let(:employee_grp) { df_emp.group_by(:employee).df }
    let(:mi_single) do
      DaruLite::MultiIndex.from_tuples([
        ['Jane', 1], ['Jane', 4], ['John', 0],
        ['John', 3], ['Mark', 2], ['Mark', 5]
      ])
    end

    let(:emp_month_grp) { df_emp.group_by([:employee, :month]).df }
    let(:mi_double) do
      DaruLite::MultiIndex.from_tuples([
        ['Jane', 'July', 4], ['Jane', 'June', 1], ['John', 'July', 3],
        ['John', 'June', 0], ['Mark', 'July', 5], ['Mark', 'June', 2]
      ])
    end

    let(:emp_month_salary_grp) { df_emp.group_by([:employee, :month, :salary]).df }
    let(:mi_triple) do
      DaruLite::MultiIndex.from_tuples([
        ['Jane', 'July', 600, 4], ['Jane', 'June', 500, 1],
        ['John', 'July', 1200, 3], ['John', 'June', 1000, 0],
        ['Mark', 'July', 600, 5], ['Mark', 'June', 700, 2]
      ])
    end

    it 'groups by a single tuple' do
      expect(@sl_group.groups).to eq({
        ['bar'] => [1, 3, 5],
        ['foo'] => [0, 2, 4, 6, 7]
      })
    end

    it 'returns dataframe with MultiIndex, groups by single layer hierarchy' do
      expected = DaruLite::DataFrame.new({
        month: ['June', 'July', 'June', 'July', 'June', 'July'],
        salary: [500, 600, 1000, 1200, 700, 600]
      }, index: mi_single)

      expect(employee_grp).to eq(expected)
    end

    it 'returns dataframe with MultiIndex, groups by double layer hierarchy' do
      expected = DaruLite::DataFrame.new({
        salary: [600, 500, 1200, 1000, 600, 700]
      }, index: mi_double)

      expect(emp_month_grp).to eq(expected)
    end

    it 'returns dataframe with MultiIndex, groups by triple layer hierarchy' do
      # Every vector is consumed by the grouping, so no data columns remain.
      expect(emp_month_salary_grp).to eq(DaruLite::DataFrame.new({}, index: mi_triple))
    end

    it 'groups by a double layer hierarchy' do
      expect(@dl_group.groups).to eq({
        ['foo', 'one'] => [0, 6],
        ['bar', 'one'] => [1],
        ['foo', 'two'] => [2, 4],
        ['bar', 'three'] => [3],
        ['bar', 'two'] => [5],
        ['foo', 'three'] => [7]
      })
    end

    it 'groups by a triple layer hierarchy' do
      expect(@tl_group.groups).to eq({
        ['bar', 'one', 2] => [1],
        ['bar', 'three', 1] => [3],
        ['bar', 'two', 6] => [5],
        ['foo', 'one', 1] => [0],
        ['foo', 'one', 3] => [6],
        ['foo', 'three', 8] => [7],
        ['foo', 'two', 3] => [2, 4]
      })
    end

    it 'raises error if a non-existent vector is passed as args' do
      # NOTE(review): a bare raise_error matches any exception; pin the
      # expected error class once it is confirmed against the implementation.
      expect { @df.group_by([:a, :ted]) }.to raise_error
    end
  end

  context '#size' do
    it 'returns a vector containing the size of each group' do
      expect(@dl_group.size).to eq(DaruLite::Vector.new([1, 1, 1, 2, 1, 2], index: @dl_multi_index))
    end

    it 'returns an empty vector if given an empty dataframe' do
      df = DaruLite::DataFrame.new({ a: [], b: [] })
      expect(df.group_by(:a).size).to eq(DaruLite::Vector.new([]))
    end
  end

  context '#get_group' do
    it 'returns the whole sub-group for single layer grouping' do
      expected = DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'bar'],
        b: ['one', 'three', 'two'],
        c: [2, 1, 6],
        d: [22, 44, 66]
      }, index: [1, 3, 5])

      expect(@sl_group.get_group(['bar'])).to eq(expected)
    end

    it 'returns the whole sub-group for double layer grouping' do
      expected = DaruLite::DataFrame.new({
        a: ['bar'],
        b: ['one'],
        c: [2],
        d: [22]
      }, index: [1])

      expect(@dl_group.get_group(['bar', 'one'])).to eq(expected)
    end

    it 'returns the whole sub-group for triple layer grouping' do
      expected = DaruLite::DataFrame.new({
        a: ['foo', 'foo'],
        b: ['two', 'two'],
        c: [3, 3],
        d: [33, 55]
      }, index: [2, 4])

      expect(@tl_group.get_group(['foo', 'two', 3])).to eq(expected)
    end

    it 'raises error for incomplete specification' do
      # NOTE(review): bare raise_error accepts any exception class; tighten
      # once the intended error is confirmed.
      expect { @tl_group.get_group(['foo']) }.to raise_error
    end

    it 'raises error for over specification' do
      # NOTE(review): bare raise_error accepts any exception class; tighten
      # once the intended error is confirmed.
      expect { @sl_group.get_group(['bar', 'one']) }.to raise_error
    end
  end

  context '#each_group' do
    it 'enumerates groups' do
      ret = []
      @dl_group.each_group { |g| ret << g }

      expect(ret.count).to eq 6
      expect(ret).to all be_a(DaruLite::DataFrame)
      expect(ret.first).to eq(DaruLite::DataFrame.new({
        a: ['bar'],
        b: ['one'],
        c: [2],
        d: [22]
      }, index: [1]))
    end
  end

  context '#each_group without block' do
    it 'enumerates groups' do
      enum = @dl_group.each_group

      expect(enum.count).to eq 6
      expect(enum).to all be_a(DaruLite::DataFrame)
      expect(enum.to_a.last).to eq(DaruLite::DataFrame.new({
        a: ['foo', 'foo'],
        b: ['two', 'two'],
        c: [3, 3],
        d: [33, 55]
      }, index: [2, 4]))
    end
  end

  context '#first' do
    it 'gets the first row from each group' do
      expect(@dl_group.first).to eq(DaruLite::DataFrame.new({
        a: %w[bar bar bar foo foo foo],
        b: %w[one three two one three two],
        c: [2, 1, 6, 1, 8, 3],
        d: [22, 44, 66, 11, 88, 33]
      }, index: [1, 3, 5, 0, 7, 2]))
    end
  end

  context '#last' do
    it 'gets the last row from each group' do
      expect(@dl_group.last).to eq(DaruLite::DataFrame.new({
        a: %w[bar bar bar foo foo foo],
        b: %w[one three two one three two],
        c: [2, 1, 6, 3, 8, 3],
        d: [22, 44, 66, 77, 88, 55]
      }, index: [1, 3, 5, 6, 7, 4]))
    end
  end

  context '#mean' do
    it 'computes mean of the numeric columns of a single layer group' do
      expect(@sl_group.mean).to eq(DaruLite::DataFrame.new({
        c: [3.0, 3.6],
        d: [44.0, 52.8]
      }, index: @sl_index))
    end

    it 'computes mean of the numeric columns of a double layer group' do
      expect(@dl_group.mean).to eq(DaruLite::DataFrame.new({
        c: [2, 1, 6, 2, 8, 3],
        d: [22, 44, 66, 44, 88, 44]
      }, index: @dl_multi_index))
    end

    it 'computes mean of the numeric columns of a triple layer group' do
      expect(@tl_group.mean).to eq(DaruLite::DataFrame.new({
        d: [22, 44, 66, 11, 77, 88, 44]
      }, index: @tl_multi_index))
    end
  end

  context '#sum' do
    it 'calculates the sum of the numeric columns of a single layer group' do
      expect(@sl_group.sum).to eq(DaruLite::DataFrame.new({
        c: [9, 18],
        d: [132, 264]
      }, index: @sl_index))
    end

    it 'calculates the sum of the numeric columns of a double layer group' do
      expect(@dl_group.sum).to eq(DaruLite::DataFrame.new({
        c: [2, 1, 6, 4, 8, 6],
        d: [22, 44, 66, 88, 88, 88]
      }, index: @dl_multi_index))
    end

    it 'calculates the sum of the numeric columns of a triple layer group' do
      expect(@tl_group.sum).to eq(DaruLite::DataFrame.new({
        d: [22, 44, 66, 11, 77, 88, 88]
      }, index: @tl_multi_index))
    end
  end

  # Smoke checks: each statistic must come back indexed by the grouping keys.
  # The method name is part of the description so the four generated examples
  # are distinguishable in the report.
  [:median, :std, :max, :min].each do |numeric_method|
    it "indexes the #{numeric_method} result by the grouping keys at every depth" do
      expect(@sl_group.send(numeric_method).index).to eq @sl_index
      expect(@dl_group.send(numeric_method).index).to eq @dl_multi_index
      expect(@tl_group.send(numeric_method).index).to eq @tl_multi_index
    end
  end

  # The examples below are intentionally declared without a body: RSpec
  # reports a block-less example as pending, whereas an empty block would
  # pass and hide the missing coverage.
  context '#product' do
    it 'calculates product for single layer groups' # TODO

    it 'calculates product for double layer groups' # TODO

    it 'calculates product for triple layer groups' # TODO
  end

  context '#count' do
    it 'counts the number of elements in a single layer group' do
      expect(@sl_group.count).to eq(DaruLite::DataFrame.new({
        b: [3, 5],
        c: [3, 5],
        d: [3, 5]
      }, index: @sl_index))
    end

    it 'counts the number of elements in a double layer group' do
      expect(@dl_group.count).to eq(DaruLite::DataFrame.new({
        c: [1, 1, 1, 2, 1, 2],
        d: [1, 1, 1, 2, 1, 2]
      }, index: @dl_multi_index))
    end

    it 'counts the number of elements in a triple layer group' do
      expect(@tl_group.count).to eq(DaruLite::DataFrame.new({
        d: [1, 1, 1, 1, 1, 1, 2]
      }, index: @tl_multi_index))
    end
  end

  # Value-level specs for the statistics below are still to be written; the
  # block-less examples are reported as pending until then.
  context '#std' do
    it 'calculates sample standard deviation for single layer groups' # TODO

    it 'calculates sample standard deviation for double layer groups' # TODO

    it 'calculates sample standard deviation for triple layer groups' # TODO
  end

  context '#max' do
    it 'calculates max value for single layer groups' # TODO

    it 'calculates max value for double layer groups' # TODO

    it 'calculates max value for triple layer groups' # TODO
  end

  context '#min' do
    it 'calculates min value for single layer groups' # TODO

    it 'calculates min value for double layer groups' # TODO

    it 'calculates min value for triple layer groups' # TODO
  end

  context '#median' do
    it 'calculates median for single layer groups' # TODO

    it 'calculates median for double layer groups' # TODO

    it 'calculates median for triple layer groups' # TODO
  end

  context '#head' do
    it 'returns first n rows of each single layer group' do
      expect(@sl_group.head(2)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'foo', 'foo'],
        b: ['one', 'three', 'one', 'two'],
        c: [2, 1, 1, 3],
        d: [22, 44, 11, 33]
      }, index: [1, 3, 0, 2]))
    end

    it 'returns first n rows of each double layer group' do
      expect(@dl_group.head(2)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'bar', 'foo', 'foo', 'foo', 'foo', 'foo'],
        b: ['one', 'three', 'two', 'one', 'one', 'three', 'two', 'two'],
        c: [2, 1, 6, 1, 3, 8, 3, 3],
        d: [22, 44, 66, 11, 77, 88, 33, 55]
      }, index: [1, 3, 5, 0, 6, 7, 2, 4]))
    end

    it 'returns first n rows of each triple layer group' do
      expect(@tl_group.head(1)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'bar', 'foo', 'foo', 'foo', 'foo'],
        b: ['one', 'three', 'two', 'one', 'one', 'three', 'two'],
        c: [2, 1, 6, 1, 3, 8, 3],
        d: [22, 44, 66, 11, 77, 88, 33]
      }, index: [1, 3, 5, 0, 6, 7, 2]))
    end
  end

  context '#tail' do
    it 'returns last n rows of each single layer group' do
      expect(@sl_group.tail(1)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'foo'],
        b: ['two', 'three'],
        c: [6, 8],
        d: [66, 88]
      }, index: [5, 7]))
    end

    it 'returns last n rows of each double layer group' do
      expect(@dl_group.tail(2)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'bar', 'foo', 'foo', 'foo', 'foo', 'foo'],
        b: ['one', 'three', 'two', 'one', 'one', 'three', 'two', 'two'],
        c: [2, 1, 6, 1, 3, 8, 3, 3],
        d: [22, 44, 66, 11, 77, 88, 33, 55]
      }, index: [1, 3, 5, 0, 6, 7, 2, 4]))
    end

    it 'returns last n rows of each triple layer group' do
      expect(@tl_group.tail(1)).to eq(DaruLite::DataFrame.new({
        a: ['bar', 'bar', 'bar', 'foo', 'foo', 'foo', 'foo'],
        b: ['one', 'three', 'two', 'one', 'one', 'three', 'two'],
        c: [2, 1, 6, 1, 3, 8, 3],
        d: [22, 44, 66, 11, 77, 88, 55]
      }, index: [1, 3, 5, 0, 6, 7, 4]))
    end
  end

  context '#[]' do
    # Described so the pending entry is identifiable in the test report.
    pending 'is not specified yet'
  end

  context '#reduce' do
    # Folds the :b value of every row in a group onto the accumulator.
    let(:string_concat) { ->(acc, row) { acc + row[:b] } }

    it 'returns a vector that concatenates strings in a group' do
      expected = DaruLite::Vector.new(['onethreetwo', 'onetwotwoonethree'], index: @sl_index)

      expect(@sl_group.reduce('', &string_concat)).to eq(expected)
    end

    it 'works with multi-indexes' do
      expected = DaruLite::Vector.new(
        ['one', 'three', 'two', 'oneone', 'three', 'twotwo'],
        index: @dl_multi_index
      )

      expect(@dl_group.reduce('', &string_concat)).to eq(expected)
    end
  end

  context 'groups by first vector if no vector mentioned' do
    subject { @df.group_by }

    it { is_expected.to be_a DaruLite::Core::GroupBy }
    its(:groups) { is_expected.to eq @sl_group.groups }
    its(:size) { is_expected.to eq @sl_group.size }
  end

  context 'group and sum with numeric indices' do
    let(:df) do
      DaruLite::DataFrame.new({ g: ['a', 'a', 'a'], num: [1, 2, 3] }, index: [2, 12, 23])
    end

    subject { df.group_by([:g]).sum }

    it { is_expected.to eq DaruLite::DataFrame.new({ num: [6] }, index: ['a']) }
  end

  context 'when dataframe tuples contain nils in mismatching positions' do
    let(:df) do
      DaruLite::DataFrame.new(
        {
          'string1' => ['Color', 'Color', 'Color', 'Color', nil, 'Color', 'Color', ' Black and White'],
          'string2' => ['Test', 'test2', nil, 'test3', nil, 'test', 'test3', 'test5'],
          'num' => [1, nil, 3, 4, 5, 6, 7, nil]
        }
      )
    end

    it 'groups by without errors' do
      # A negated raise_error must not name a class: with a class given, the
      # expectation would still pass if a different exception were raised.
      expect { df.group_by(df.vectors.map(&:to_s)) }.not_to raise_error
    end
  end

  context '#aggregate' do
    let(:dataframe) do
      DaruLite::DataFrame.new({
        employee: %w[John Jane Mark John Jane Mark],
        month: %w[June June June July July July],
        salary: [1000, 500, 700, 1200, 600, 600]
      })
    end

    context 'group and aggregate sum for particular single vector' do
      subject { dataframe.group_by([:employee]).aggregate(salary: :sum) }

      it do
        is_expected.to eq(
          DaruLite::DataFrame.new({ salary: [1100, 2200, 1300] }, index: ['Jane', 'John', 'Mark'])
        )
      end
    end

    context 'group and aggregate sum and lambda function for vectors' do
      subject do
        dataframe.group_by([:employee]).aggregate(
          salary: :sum,
          month: ->(vec) { vec.to_a.join('/') }
        )
      end

      it do
        is_expected.to eq(
          DaruLite::DataFrame.new(
            {
              salary: [1100, 2200, 1300],
              month: ['June/July', 'June/July', 'June/July']
            },
            index: ['Jane', 'John', 'Mark'],
            order: [:salary, :month]
          )
        )
      end
    end

    context 'group and aggregate sum and lambda functions on dataframe' do
      subject do
        dataframe.group_by([:employee]).aggregate(
          salary: :sum,
          month: ->(vec) { vec.to_a.join('/') },
          mean_salary: ->(df) { df.salary.mean },
          periods: ->(df) { df.size }
        )
      end

      it do
        is_expected.to eq(
          DaruLite::DataFrame.new(
            {
              salary: [1100, 2200, 1300],
              month: ['June/July', 'June/July', 'June/July'],
              mean_salary: [550.0, 1100.0, 650.0],
              periods: [2, 2, 2]
            },
            index: ['Jane', 'John', 'Mark'],
            order: [:salary, :month, :mean_salary, :periods]
          )
        )
      end
    end

    context 'group_by and aggregate on mixed MultiIndex' do
      let(:df) do
        DaruLite::DataFrame.new(
          name: ['Ram', 'Krishna', 'Ram', 'Krishna', 'Krishna'],
          visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore']
        )
      end

      # Same data minus the first row, so 'Ram' occurs exactly once.
      let(:df_mixed) do
        DaruLite::DataFrame.new(
          name: ['Krishna', 'Ram', 'Krishna', 'Krishna'],
          visited: ['Delhi', 'Mumbai', 'Raipur', 'Banglore']
        )
      end

      it 'group_by' do
        expected = DaruLite::DataFrame.new(
          { visited: ['Delhi', 'Raipur', 'Banglore', 'Hyderabad', 'Mumbai'] },
          index: DaruLite::MultiIndex.from_tuples(
            [['Krishna', 1], ['Krishna', 3], ['Krishna', 4], ['Ram', 0], ['Ram', 2]]
          )
        )

        expect(df.group_by(:name).df).to eq(expected)
      end

      it 'group_by and aggregate' do
        aggregated = df.group_by(:name).aggregate(visited: ->(vec) { vec.to_a.join(',') })

        expect(aggregated).to eq(
          DaruLite::DataFrame.new(
            { visited: ['Delhi,Raipur,Banglore', 'Hyderabad,Mumbai'] },
            index: ['Krishna', 'Ram']
          )
        )
      end

      it 'group_by and aggregate when anyone index is not multiple times' do
        aggregated = df_mixed.group_by(:name).aggregate(visited: ->(vec) { vec.to_a.join(',') })

        expect(aggregated).to eq(
          DaruLite::DataFrame.new(
            { visited: ['Delhi,Raipur,Banglore', 'Mumbai'] },
            index: ['Krishna', 'Ram']
          )
        )
      end
    end

    let(:spending_df) do
      DaruLite::DataFrame.rows(
        [
          [2010, 'dev', 50, 1],
          [2010, 'dev', 150, 1],
          [2010, 'dev', 200, 1],
          [2011, 'dev', 50, 1],
          [2012, 'dev', 150, 1],

          [2011, 'office', 300, 1],

          [2010, 'market', 50, 1],
          [2011, 'market', 500, 1],
          [2012, 'market', 500, 1],
          [2012, 'market', 300, 1],

          [2012, 'R&D', 10, 1]
        ],
        order: [:year, :category, :spending, :nb_spending]
      )
    end

    let(:multi_index_year_category) do
      DaruLite::MultiIndex.from_tuples([
        [2010, 'dev'], [2010, 'market'],
        [2011, 'dev'], [2011, 'market'], [2011, 'office'],
        [2012, 'R&D'], [2012, 'dev'], [2012, 'market']
      ])
    end

    context 'group_by and aggregate on multiple elements' do
      it 'does aggregate' do
        expected = DaruLite::DataFrame.new(
          { spending: [400, 50, 50, 500, 300, 10, 150, 800] },
          index: multi_index_year_category
        )

        expect(spending_df.group_by([:year, :category]).aggregate(spending: :sum)).to eq(expected)
      end

      it 'works as older methods' do
        older_way = spending_df.group_by([:year, :category]).sum

        newer_way = spending_df.group_by([:year, :category])
                               .aggregate(spending: :sum, nb_spending: :sum)
        expect(newer_way).to eq(older_way)

        contrived_way = spending_df.group_by([:year, :category]).aggregate(
          spending: :sum,
          nb_spending_lambda: ->(df) { df[:nb_spending].sum }
        )
        # NOTE(review): the return value is discarded, so this presumably
        # relies on rename_vectors renaming in place - confirm.
        contrived_way.rename_vectors(nb_spending_lambda: :nb_spending)
        expect(contrived_way).to eq(older_way)
      end

      context 'can aggregate on MultiIndex' do
        let(:multi_indexed_aggregated_df) do
          spending_df.group_by([:year, :category]).aggregate(spending: :sum)
        end
        let(:index_year) { DaruLite::Index.new([2010, 2011, 2012]) }
        let(:index_category) { DaruLite::Index.new(['dev', 'market', 'office', 'R&D']) }

        it 'aggregates by default on the last layer of MultiIndex' do
          expect(multi_indexed_aggregated_df.aggregate(spending: :sum)).to eq(
            DaruLite::DataFrame.new({ spending: [450, 850, 960] }, index: index_year)
          )
        end

        it 'can aggregate on the first layer of MultiIndex' do
          expect(multi_indexed_aggregated_df.aggregate({ spending: :sum }, 0)).to eq(
            DaruLite::DataFrame.new({ spending: [600, 1350, 300, 10] }, index: index_category)
          )
        end

        it 'does coercion: when one layer is remaining, MultiIndex is coerced in Index that does not aggregate anymore' do
          df_with_simple_index = multi_indexed_aggregated_df.aggregate(spending: :sum)

          expect(df_with_simple_index.aggregate(spending: :sum)).to eq(df_with_simple_index)
        end
      end
    end
  end
end