daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,655 @@
1
+ describe DaruLite::Core::GroupBy do
2
+ before do
3
+ @df = DaruLite::DataFrame.new({
4
+ a: %w{foo bar foo bar foo bar foo foo},
5
+ b: %w{one one two three two two one three},
6
+ c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
7
+ d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
8
+ }, order: [:a, :b, :c, :d])
9
+
10
+ @sl_group = @df.group_by(:a)
11
+ @dl_group = @df.group_by([:a, :b])
12
+ @tl_group = @df.group_by([:a,:b,:c])
13
+
14
+ @sl_index = DaruLite::Index.new(['bar', 'foo'])
15
+ @dl_multi_index = DaruLite::MultiIndex.from_tuples([
16
+ ['bar', 'one'],
17
+ ['bar', 'three'],
18
+ ['bar', 'two'],
19
+ ['foo', 'one'],
20
+ ['foo', 'three'],
21
+ ['foo', 'two']
22
+ ])
23
+ @tl_multi_index = DaruLite::MultiIndex.from_tuples([
24
+ ['bar', 'one' , 2],
25
+ ['bar', 'three', 1],
26
+ ['bar', 'two' , 6],
27
+ ['foo', 'one' , 1],
28
+ ['foo', 'one' , 3],
29
+ ['foo', 'three', 8],
30
+ ['foo', 'two' , 3]
31
+ ])
32
+
33
+ end
34
+
35
+ context 'with nil values' do
36
+ before do
37
+ @df[:w_nils] = DaruLite::Vector.new([11 ,nil ,33 ,nil ,nil ,66 ,77 ,88])
38
+ end
39
+
40
+ it 'groups by nil values' do
41
+ expect(@df.group_by(:w_nils).groups[[nil]]).to eq([1,3,4])
42
+ end
43
+
44
+ it "uses a multi-index when nils are part of the grouping keys" do
45
+ expect(@df.group_by(:a, :w_nils).send(:multi_indexed_grouping?)).to be true
46
+ end
47
+ end
48
+
49
+ context "#initialize" do
50
+ let(:df_emp) { DaruLite::DataFrame.new(
51
+ employee: %w[John Jane Mark John Jane Mark],
52
+ month: %w[June June June July July July],
53
+ salary: [1000, 500, 700, 1200, 600, 600]
54
+ ) }
55
+ let(:employee_grp) { df_emp.group_by(:employee).df }
56
+ let(:mi_single) { DaruLite::MultiIndex.from_tuples([
57
+ ['Jane', 1], ['Jane', 4], ['John', 0],
58
+ ['John', 3], ['Mark', 2], ['Mark', 5]
59
+ ]
60
+ )}
61
+
62
+ let(:emp_month_grp) { df_emp.group_by([:employee, :month]).df }
63
+ let(:mi_double) { DaruLite::MultiIndex.from_tuples([
64
+ ['Jane', 'July', 4], ['Jane', 'June', 1], ['John', 'July', 3],
65
+ ['John', 'June', 0], ['Mark', 'July', 5], ['Mark', 'June', 2]
66
+ ]
67
+ )}
68
+
69
+ let(:emp_month_salary_grp) {
70
+ df_emp.group_by([:employee, :month, :salary]).df }
71
+ let(:mi_triple) { DaruLite::MultiIndex.from_tuples([
72
+ ['Jane', 'July', 600, 4], ['Jane', 'June', 500, 1],
73
+ ['John', 'July', 1200, 3], ['John', 'June', 1000, 0],
74
+ ['Mark', 'July', 600, 5], ['Mark', 'June', 700, 2]
75
+ ]
76
+ )}
77
+
78
+ it "groups by a single tuple" do
79
+ expect(@sl_group.groups).to eq({
80
+ ['bar'] => [1,3,5],
81
+ ['foo'] => [0,2,4,6,7]
82
+ })
83
+ end
84
+
85
+ it "returns dataframe with MultiIndex, groups by single layer hierarchy" do
86
+ expect(employee_grp).to eq(DaruLite::DataFrame.new({
87
+ month: ["June", "July", "June", "July", "June", "July"],
88
+ salary: [500, 600, 1000, 1200, 700, 600]
89
+ }, index: mi_single))
90
+ end
91
+
92
+ it "returns dataframe with MultiIndex, groups by double layer hierarchy" do
93
+ expect(emp_month_grp).to eq(DaruLite::DataFrame.new({
94
+ salary: [600, 500, 1200, 1000, 600, 700]
95
+ }, index: mi_double))
96
+ end
97
+
98
+ it "returns dataframe with MultiIndex, groups by triple layer hierarchy" do
99
+ expect(emp_month_salary_grp).to eq(DaruLite::DataFrame.new({
100
+ }, index: mi_triple))
101
+ end
102
+
103
+ it "groups by a double layer hierarchy" do
104
+ expect(@dl_group.groups).to eq({
105
+ ['foo', 'one'] => [0,6],
106
+ ['bar', 'one'] => [1],
107
+ ['foo', 'two'] => [2,4],
108
+ ['bar', 'three'] => [3],
109
+ ['bar', 'two'] => [5],
110
+ ['foo', 'three'] => [7]
111
+ })
112
+ end
113
+
114
+ it "groups by a triple layer hierarchy" do
115
+ expect(@tl_group.groups).to eq({
116
+ ['bar', 'one' , 2] => [1],
117
+ ['bar', 'three', 1] => [3],
118
+ ['bar', 'two' , 6] => [5],
119
+ ['foo', 'one' , 1] => [0],
120
+ ['foo', 'one' , 3] => [6],
121
+ ['foo', 'three', 8] => [7],
122
+ ['foo', 'two' , 3] => [2,4]
123
+ })
124
+ end
125
+
126
+ it "raises error if a non-existent vector is passed as args" do
127
+ expect {
128
+ @df.group_by([:a, :ted]) # :ted is not a vector of @df
129
+ }.to raise_error(ArgumentError) # NOTE(review): assumes group_by rejects missing vectors with ArgumentError - confirm; a bare raise_error would also pass on NoMethodError
130
+ end
131
+ end
132
+
133
+ context "#size" do
134
+ it "returns a vector containing the size of each group" do
135
+ expect(@dl_group.size).to eq(DaruLite::Vector.new([1,1,1,2,1,2], index: @dl_multi_index))
136
+ end
137
+
138
+ it "returns an empty vector if given an empty dataframe" do
139
+ df = DaruLite::DataFrame.new({ a: [], b: [] })
140
+ expect(df.group_by(:a).size).to eq(DaruLite::Vector.new([]))
141
+ end
142
+ end
143
+
144
+ context "#get_group" do
145
+ it "returns the whole sub-group for single layer grouping" do
146
+ expect(@sl_group.get_group(['bar'])).to eq(DaruLite::DataFrame.new({
147
+ a: ['bar', 'bar', 'bar'],
148
+ b: ['one', 'three', 'two'],
149
+ c: [2,1,6],
150
+ d: [22,44,66]
151
+ }, index: [1,3,5]
152
+ ))
153
+ end
154
+
155
+ it "returns the whole sub-group for double layer grouping" do
156
+ expect(@dl_group.get_group(['bar', 'one'])).to eq(DaruLite::DataFrame.new({
157
+ a: ['bar'],
158
+ b: ['one'],
159
+ c: [2],
160
+ d: [22]
161
+ }, index: [1]
162
+ ))
163
+ end
164
+
165
+ it "returns the whole sub-group for triple layer grouping" do
166
+ expect(@tl_group.get_group(['foo','two',3])).to eq(DaruLite::DataFrame.new({
167
+ a: ['foo', 'foo'],
168
+ b: ['two', 'two'],
169
+ c: [3,3],
170
+ d: [33,55]
171
+ }, index: [2,4]
172
+ ))
173
+ end
174
+
175
+ it "raises error for incomplete specification" do
176
+ expect {
177
+ @tl_group.get_group(['foo'])
178
+ }.to raise_error
179
+ end
180
+
181
+ it "raises error for over specification" do
182
+ expect {
183
+ @sl_group.get_group(['bar', 'one'])
184
+ }.to raise_error
185
+ end
186
+ end
187
+
188
+ context '#each_group' do
189
+ it 'enumerates groups' do
190
+ ret = []
191
+ @dl_group.each_group { |g| ret << g }
192
+ expect(ret.count).to eq 6
193
+ expect(ret).to all be_a(DaruLite::DataFrame)
194
+ expect(ret.first).to eq(DaruLite::DataFrame.new({
195
+ a: ['bar'],
196
+ b: ['one'],
197
+ c: [2],
198
+ d: [22]
199
+ }, index: [1]
200
+ ))
201
+ end
202
+ end
203
+
204
+ context '#each_group without block' do
205
+ it 'enumerates groups' do
206
+ enum = @dl_group.each_group
207
+
208
+ expect(enum.count).to eq 6
209
+ expect(enum).to all be_a(DaruLite::DataFrame)
210
+ expect(enum.to_a.last).to eq(DaruLite::DataFrame.new({
211
+ a: ['foo', 'foo'],
212
+ b: ['two', 'two'],
213
+ c: [3, 3],
214
+ d: [33, 55]
215
+ }, index: [2, 4]
216
+ ))
217
+ end
218
+ end
219
+
220
+ context '#first' do
221
+ it 'gets the first row from each group' do
222
+ expect(@dl_group.first).to eq(DaruLite::DataFrame.new({
223
+ a: %w{bar bar bar foo foo foo },
224
+ b: %w{one three two one three two },
225
+ c: [2 ,1 ,6 ,1 ,8 ,3 ],
226
+ d: [22 ,44 ,66 ,11 ,88 ,33 ]
227
+ }, index: [1,3,5,0,7,2]))
228
+ end
229
+ end
230
+
231
+ context '#last' do
232
+ it 'gets the last row from each group' do
233
+ expect(@dl_group.last).to eq(DaruLite::DataFrame.new({
234
+ a: %w{bar bar bar foo foo foo },
235
+ b: %w{one three two one three two },
236
+ c: [2 ,1 ,6 ,3 ,8 ,3 ],
237
+ d: [22 ,44 ,66 ,77 ,88 ,55 ]
238
+ }, index: [1,3,5,6,7,4]))
239
+ end
240
+ end
241
+
242
+ context "#mean" do
243
+ it "computes mean of the numeric columns of a single layer group" do
244
+ expect(@sl_group.mean).to eq(DaruLite::DataFrame.new({
245
+ :c => [3.0, 3.6],
246
+ :d => [44.0, 52.8]
247
+ }, index: @sl_index
248
+ ))
249
+ end
250
+
251
+ it "computes mean of the numeric columns of a double layer group" do
252
+ expect(@dl_group.mean).to eq(DaruLite::DataFrame.new({
253
+ c: [2,1,6,2,8,3],
254
+ d: [22,44,66,44,88,44]
255
+ }, index: @dl_multi_index))
256
+ end
257
+
258
+ it "computes mean of the numeric columns of a triple layer group" do
259
+ expect(@tl_group.mean).to eq(DaruLite::DataFrame.new({
260
+ d: [22,44,66,11,77,88,44]
261
+ }, index: @tl_multi_index
262
+ ))
263
+ end
264
+ end
265
+
266
+ context "#sum" do
267
+ it "calculates the sum of the numeric columns of a single layer group" do
268
+ expect(@sl_group.sum).to eq(DaruLite::DataFrame.new({
269
+ c: [9, 18],
270
+ d: [132, 264]
271
+ }, index: @sl_index
272
+ ))
273
+ end
274
+
275
+ it "calculates the sum of the numeric columns of a double layer group" do
276
+ expect(@dl_group.sum).to eq(DaruLite::DataFrame.new({
277
+ c: [2,1,6,4,8,6],
278
+ d: [22,44,66,88,88,88]
279
+ }, index: @dl_multi_index))
280
+ end
281
+
282
+ it "calculates the sum of the numeric columns of a triple layer group" do
283
+ expect(@tl_group.sum).to eq(DaruLite::DataFrame.new({
284
+ d: [22,44,66,11,77,88,88]
285
+ }, index: @tl_multi_index))
286
+ end
287
+ end
288
+
289
+ %i[median std max min].each do |numeric_method| # generates one example per aggregation method
290
+ it "##{numeric_method} indexes the result by the grouping keys" do # method name in the description makes failures attributable
291
+ expect(@sl_group.public_send(numeric_method).index).to eq @sl_index
292
+ expect(@dl_group.public_send(numeric_method).index).to eq @dl_multi_index
293
+ expect(@tl_group.public_send(numeric_method).index).to eq @tl_multi_index
294
+ end
295
+ end
296
+
297
+ context "#product" do # placeholders: skipped so they report as pending instead of passing with no assertions
298
+ it "calculates product for single layer groups" do
299
+ skip 'TODO: assert product values for single layer groups'
300
+ end
301
+
302
+ it "calculates product for double layer groups" do
303
+ skip 'TODO: assert product values for double layer groups'
304
+ end
305
+
306
+ it "calculates product for triple layer groups" do
307
+ skip 'TODO: assert product values for triple layer groups'
308
+ end
309
+ end
310
+
311
+ context "#count" do
312
+ it "counts the number of elements in a single layer group" do
313
+ expect(@sl_group.count).to eq(DaruLite::DataFrame.new({
314
+ b: [3,5],
315
+ c: [3,5],
316
+ d: [3,5]
317
+ }, index: @sl_index))
318
+ end
319
+
320
+ it "counts the number of elements in a double layer group" do
321
+ expect(@dl_group.count).to eq(DaruLite::DataFrame.new({
322
+ c: [1,1,1,2,1,2],
323
+ d: [1,1,1,2,1,2]
324
+ }, index: @dl_multi_index))
325
+ end
326
+
327
+ it "counts the number of elements in a triple layer group" do
328
+ expect(@tl_group.count).to eq(DaruLite::DataFrame.new({
329
+ d: [1,1,1,1,1,1,2]
330
+ }, index: @tl_multi_index))
331
+ end
332
+ end
333
+
334
+ context "#std" do # placeholders: skipped so they report as pending instead of passing with no assertions
335
+ it "calculates sample standard deviation for single layer groups" do
336
+ skip 'TODO: assert standard deviation values for single layer groups'
337
+ end
338
+
339
+ it "calculates sample standard deviation for double layer groups" do
340
+ skip 'TODO: assert standard deviation values for double layer groups'
341
+ end
342
+
343
+ it "calculates sample standard deviation for triple layer groups" do
344
+ skip 'TODO: assert standard deviation values for triple layer groups'
345
+ end
346
+ end
347
+
348
+ context "#max" do # placeholders: skipped so they report as pending instead of passing with no assertions
349
+ it "calculates max value for single layer groups" do
350
+ skip 'TODO: assert max values for single layer groups'
351
+ end
352
+
353
+ it "calculates max value for double layer groups" do
354
+ skip 'TODO: assert max values for double layer groups'
355
+ end
356
+
357
+ it "calculates max value for triple layer groups" do
358
+ skip 'TODO: assert max values for triple layer groups'
359
+ end
360
+ end
361
+
362
+ context "#min" do # placeholders: skipped so they report as pending instead of passing with no assertions
363
+ it "calculates min value for single layer groups" do
364
+ skip 'TODO: assert min values for single layer groups'
365
+ end
366
+
367
+ it "calculates min value for double layer groups" do
368
+ skip 'TODO: assert min values for double layer groups'
369
+ end
370
+
371
+ it "calculates min value for triple layer groups" do
372
+ skip 'TODO: assert min values for triple layer groups'
373
+ end
374
+ end
375
+
376
+ context "#median" do # placeholders: skipped so they report as pending instead of passing with no assertions
377
+ it "calculates median for single layer groups" do
378
+ skip 'TODO: assert median values for single layer groups'
379
+ end
380
+
381
+ it "calculates median for double layer groups" do
382
+ skip 'TODO: assert median values for double layer groups'
383
+ end
384
+
385
+ it "calculates median for triple layer groups" do
386
+ skip 'TODO: assert median values for triple layer groups'
387
+ end
388
+ end
389
+
390
+ context "#head" do
391
+ it "returns first n rows of each single layer group" do
392
+ expect(@sl_group.head(2)).to eq(DaruLite::DataFrame.new({
393
+ a: ['bar', 'bar','foo','foo'],
394
+ b: ['one', 'three','one', 'two'],
395
+ c: [2, 1, 1, 3],
396
+ d: [22, 44, 11, 33]
397
+ }, index: [1,3,0,2]))
398
+ end
399
+
400
+ it "returns first n rows of each double layer group" do
401
+ expect(@dl_group.head(2)).to eq(DaruLite::DataFrame.new({
402
+ a: ['bar','bar','bar','foo','foo','foo','foo','foo'],
403
+ b: ['one','three','two','one','one','three','two','two'],
404
+ c: [2,1,6,1,3,8,3,3],
405
+ d: [22,44,66,11,77,88,33,55]
406
+ }, index: [1,3,5,0,6,7,2,4]))
407
+ end
408
+
409
+ it "returns first n rows of each triple layer group" do
410
+ expect(@tl_group.head(1)).to eq(DaruLite::DataFrame.new({
411
+ a: ['bar','bar','bar','foo','foo','foo','foo'],
412
+ b: ['one','three','two','one','one','three','two'],
413
+ c: [2,1,6,1,3,8,3],
414
+ d: [22,44,66,11,77,88,33]
415
+ }, index: [1,3,5,0,6,7,2]))
416
+ end
417
+ end
418
+
419
+ context "#tail" do
420
+ it "returns last n rows of each single layer group" do
421
+ expect(@sl_group.tail(1)).to eq(DaruLite::DataFrame.new({
422
+ a: ['bar','foo'],
423
+ b: ['two', 'three'],
424
+ c: [6,8],
425
+ d: [66,88]
426
+ }, index: [5,7]))
427
+ end
428
+
429
+ it "returns last n rows of each double layer group" do
430
+ expect(@dl_group.tail(2)).to eq(DaruLite::DataFrame.new({
431
+ a: ['bar','bar','bar','foo','foo','foo','foo','foo'],
432
+ b: ['one','three','two','one','one','three','two','two'],
433
+ c: [2,1,6,1,3,8,3,3],
434
+ d: [22,44,66,11,77,88,33,55]
435
+ }, index: [1,3,5,0,6,7,2,4]))
436
+ end
437
+
438
+ it "returns last n rows of each triple layer group" do
439
+ expect(@tl_group.tail(1)).to eq(DaruLite::DataFrame.new({
440
+ a: ['bar','bar','bar','foo','foo','foo','foo'],
441
+ b: ['one','three','two','one','one','three','two'],
442
+ c: [2,1,6,1,3,8,3],
443
+ d: [22,44,66,11,77,88,55]
444
+ }, index: [1,3,5,0,6,7,4]))
445
+ end
446
+ end
447
+
448
+ context "#[]" do
449
+ pending
450
+ end
451
+
452
+ context "#reduce" do # folds each group's rows into one value, starting from the given seed
453
+ it "returns a vector that concatenates strings in a group" do
454
+ string_concat = ->(result, row) { result + row[:b] } # returns the new accumulator; no assignment to the block-local needed
455
+ expect(@sl_group.reduce('', &string_concat)).to eq(DaruLite::Vector.new(['onethreetwo', 'onetwotwoonethree'], index: @sl_index))
456
+ end
457
+
458
+ it "works with multi-indexes" do
459
+ string_concat = ->(result, row) { result + row[:b] } # same fold as above, applied to the two-key grouping
460
+ expect(@dl_group.reduce('', &string_concat)).to eq \
461
+ DaruLite::Vector.new(['one', 'three', 'two', 'oneone', 'three', 'twotwo'], index: @dl_multi_index)
462
+ end
463
+ end
464
+
465
+ context 'groups by first vector if no vector mentioned' do
466
+ subject { @df.group_by }
467
+
468
+ it { is_expected.to be_a DaruLite::Core::GroupBy }
469
+ its(:groups) { is_expected.to eq @sl_group.groups }
470
+ its(:size) { is_expected.to eq @sl_group.size }
471
+ end
472
+
473
+ context 'group and sum with numeric indices' do
474
+ let(:df) { DaruLite::DataFrame.new({ g: ['a','a','a'], num: [1,2,3]}, index: [2,12,23]) }
475
+
476
+ subject { df.group_by([:g]).sum }
477
+
478
+ it { is_expected.to eq DaruLite::DataFrame.new({num: [6]}, index: ['a']) }
479
+ end
480
+
481
+ context 'when dataframe tuples contain nils in mismatching positions' do
482
+
483
+ let(:df){
484
+ DaruLite::DataFrame.new(
485
+ {
486
+ 'string1' => ["Color", "Color", "Color", "Color", nil, "Color", "Color", " Black and White"],
487
+ 'string2' => ["Test", "test2", nil, "test3", nil, "test", "test3", "test5"],
488
+ 'num' => [1, nil, 3, 4, 5, 6, 7, nil]
489
+ }
490
+ )
491
+ }
492
+
493
+ it 'groups by without errors' do
494
+ expect { df.group_by(df.vectors.map(&:to_s)) }.not_to raise_error # unqualified on purpose: a negated class-specific matcher would pass on any other exception
495
+ end
496
+ end
497
+
498
+ context '#aggregate' do
499
+ let(:dataframe) { DaruLite::DataFrame.new({
500
+ employee: %w[John Jane Mark John Jane Mark],
501
+ month: %w[June June June July July July],
502
+ salary: [1000, 500, 700, 1200, 600, 600]})
503
+ }
504
+ context 'group and aggregate sum for particular single vector' do
505
+ subject { dataframe.group_by([:employee]).aggregate(salary: :sum) }
506
+
507
+ it { is_expected.to eq DaruLite::DataFrame.new({
508
+ salary: [1100, 2200, 1300]},
509
+ index: ['Jane', 'John', 'Mark'])
510
+ }
511
+ end
512
+
513
+ context 'group and aggregate sum and lambda function for vectors' do
514
+ subject { dataframe.group_by([:employee]).aggregate(
515
+ salary: :sum,
516
+ month: ->(vec) { vec.to_a.join('/') }) }
517
+
518
+ it { is_expected.to eq DaruLite::DataFrame.new({
519
+ salary: [1100, 2200, 1300],
520
+ month: ['June/July', 'June/July', 'June/July']},
521
+ index: ['Jane', 'John', 'Mark'],
522
+ order: [:salary, :month])
523
+ }
524
+ end
525
+
526
+ context 'group and aggregate sum and lambda functions on dataframe' do
527
+ subject { dataframe.group_by([:employee]).aggregate(
528
+ salary: :sum,
529
+ month: ->(vec) { vec.to_a.join('/') },
530
+ mean_salary: ->(df) { df.salary.mean },
531
+ periods: ->(df) { df.size }
532
+ )}
533
+
534
+ it { is_expected.to eq DaruLite::DataFrame.new({
535
+ salary: [1100, 2200, 1300],
536
+ month: ['June/July', 'June/July', 'June/July'],
537
+ mean_salary: [550.0, 1100.0, 650.0],
538
+ periods: [2, 2, 2]},
539
+ index: ['Jane', 'John', 'Mark'], order: [:salary, :month,
540
+ :mean_salary, :periods]) }
541
+ end
542
+
543
+ context 'group_by and aggregate on mixed MultiIndex' do
544
+ let(:df) { DaruLite::DataFrame.new(
545
+ name: ['Ram','Krishna','Ram','Krishna','Krishna'],
546
+ visited: [
547
+ 'Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore']
548
+ )
549
+ }
550
+ let(:df_mixed) { DaruLite::DataFrame.new(
551
+ name: ['Krishna','Ram','Krishna','Krishna'],
552
+ visited: [
553
+ 'Delhi', 'Mumbai', 'Raipur', 'Banglore']
554
+ )
555
+ }
556
+ it 'group_by' do
557
+ expect(df.group_by(:name).df).to eq(
558
+ DaruLite::DataFrame.new({
559
+ visited: ['Delhi', 'Raipur', 'Banglore', 'Hyderabad', 'Mumbai']},
560
+ index: DaruLite::MultiIndex.from_tuples(
561
+ [['Krishna', 1], ['Krishna', 3], ['Krishna', 4],
562
+ ['Ram', 0], ['Ram', 2]]
563
+ )
564
+ )
565
+ )
566
+ end
567
+
568
+ it 'group_by and aggregate' do
569
+ expect(
570
+ df.group_by(:name).aggregate(
571
+ visited: -> (vec){vec.to_a.join(',')})).to eq(
572
+ DaruLite::DataFrame.new({
573
+ visited: ['Delhi,Raipur,Banglore', 'Hyderabad,Mumbai']},
574
+ index: ['Krishna', 'Ram']
575
+ )
576
+ )
577
+ end
578
+
579
+ it 'group_by and aggregate when anyone index is not multiple times' do
580
+ expect(
581
+ df_mixed.group_by(:name).aggregate(
582
+ visited: -> (vec){vec.to_a.join(',')})).to eq(
583
+ DaruLite::DataFrame.new({
584
+ visited: ['Delhi,Raipur,Banglore', 'Mumbai']},
585
+ index: ['Krishna', 'Ram']
586
+ )
587
+ )
588
+ end
589
+ end
590
+
591
+ let(:spending_df) {
592
+ DaruLite::DataFrame.rows([
593
+ [2010, 'dev', 50, 1],
594
+ [2010, 'dev', 150, 1],
595
+ [2010, 'dev', 200, 1],
596
+ [2011, 'dev', 50, 1],
597
+ [2012, 'dev', 150, 1],
598
+
599
+ [2011, 'office', 300, 1],
600
+
601
+ [2010, 'market', 50, 1],
602
+ [2011, 'market', 500, 1],
603
+ [2012, 'market', 500, 1],
604
+ [2012, 'market', 300, 1],
605
+
606
+ [2012, 'R&D', 10, 1],],
607
+ order: [:year, :category, :spending, :nb_spending])
608
+ }
609
+ let(:multi_index_year_category) {
610
+ DaruLite::MultiIndex.from_tuples([
611
+ [2010, "dev"], [2010, "market"],
612
+ [2011, "dev"], [2011, "market"], [2011, "office"],
613
+ [2012, "R&D"], [2012, "dev"], [2012, "market"]])
614
+ }
615
+
616
+ context 'group_by and aggregate on multiple elements' do
617
+ it 'does aggregate' do
618
+ expect(spending_df.group_by([:year, :category]).aggregate(spending: :sum)).to eq(
619
+ DaruLite::DataFrame.new({spending: [400, 50, 50, 500, 300, 10, 150, 800]}, index: multi_index_year_category))
620
+ end
621
+
622
+ it 'works as older methods' do
623
+ older_way = spending_df.group_by([:year, :category]).sum
624
+
625
+ newer_way = spending_df.group_by([:year, :category]).aggregate(spending: :sum, nb_spending: :sum)
626
+ expect(newer_way).to eq(older_way)
627
+
628
+ contrived_way = spending_df.group_by([:year, :category]).aggregate(spending: :sum, nb_spending_lambda: ->(df) { df[:nb_spending].sum })
629
+ contrived_way.rename_vectors(nb_spending_lambda: :nb_spending)
630
+ expect(contrived_way).to eq(older_way)
631
+ end
632
+
633
+ context 'can aggregate on MultiIndex' do
634
+ let(:multi_indexed_aggregated_df) { spending_df.group_by([:year, :category]).aggregate(spending: :sum) }
635
+ let(:index_year) { DaruLite::Index.new([2010, 2011, 2012]) }
636
+ let(:index_category) { DaruLite::Index.new(["dev", "market", "office", "R&D"]) }
637
+
638
+ it 'aggregates by default on the last layer of MultiIndex' do
639
+ expect(multi_indexed_aggregated_df.aggregate(spending: :sum)).to eq(
640
+ DaruLite::DataFrame.new({spending: [450, 850, 960]}, index: index_year))
641
+ end
642
+
643
+ it 'can aggregate on the first layer of MultiIndex' do
644
+ expect(multi_indexed_aggregated_df.aggregate({spending: :sum},0)).to eq(
645
+ DaruLite::DataFrame.new({spending: [600, 1350, 300, 10]}, index: index_category))
646
+ end
647
+
648
+ it 'does coercion: when one layer is remaining, MultiIndex is coerced in Index that does not aggregate anymore' do
649
+ df_with_simple_index = multi_indexed_aggregated_df.aggregate(spending: :sum)
650
+ expect(df_with_simple_index.aggregate(spending: :sum)).to eq(df_with_simple_index)
651
+ end
652
+ end
653
+ end
654
+ end
655
+ end