daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,655 @@
|
|
1
|
+
describe DaruLite::Core::GroupBy do
|
2
|
+
before do
  # Shared fixture: eight rows, two string key columns (:a, :b) and two
  # numeric columns (:c, :d).
  columns = {
    a: %w[foo bar foo bar foo bar foo foo],
    b: %w[one one two three two two one three],
    c: [1, 2, 3, 1, 3, 6, 3, 8],
    d: [11, 22, 33, 44, 55, 66, 77, 88]
  }
  @df = DaruLite::DataFrame.new(columns, order: %i[a b c d])

  # Groupings keyed on one, two and three columns respectively.
  @sl_group = @df.group_by(:a)
  @dl_group = @df.group_by(%i[a b])
  @tl_group = @df.group_by(%i[a b c])

  # Indexes that the examples below expect grouped results to carry.
  @sl_index = DaruLite::Index.new(%w[bar foo])
  @dl_multi_index = DaruLite::MultiIndex.from_tuples(
    [
      %w[bar one],
      %w[bar three],
      %w[bar two],
      %w[foo one],
      %w[foo three],
      %w[foo two]
    ]
  )
  @tl_multi_index = DaruLite::MultiIndex.from_tuples(
    [
      ['bar', 'one', 2],
      ['bar', 'three', 1],
      ['bar', 'two', 6],
      ['foo', 'one', 1],
      ['foo', 'one', 3],
      ['foo', 'three', 8],
      ['foo', 'two', 3]
    ]
  )
end
|
34
|
+
|
35
|
+
context 'with nil values' do
  # Adds a column whose rows 1, 3 and 4 are nil to the shared fixture.
  before do
    values_with_gaps = [11, nil, 33, nil, nil, 66, 77, 88]
    @df[:w_nils] = DaruLite::Vector.new(values_with_gaps)
  end

  it 'groups by nil values' do
    nil_rows = @df.group_by(:w_nils).groups[[nil]]

    expect(nil_rows).to eq([1, 3, 4])
  end

  it "uses a multi-index when nils are part of the grouping keys" do
    grouping = @df.group_by(:a, :w_nils)

    # multi_indexed_grouping? is invoked through send, as in the original example.
    expect(grouping.send(:multi_indexed_grouping?)).to be true
  end
end
|
48
|
+
|
49
|
+
context "#initialize" do
  # Six salary rows: three employees over two months.
  let(:df_emp) do
    DaruLite::DataFrame.new(
      employee: %w[John Jane Mark John Jane Mark],
      month: %w[June June June July July July],
      salary: [1000, 500, 700, 1200, 600, 600]
    )
  end

  # GroupBy#df results for one, two and three grouping columns, each paired
  # with the MultiIndex the examples expect (group keys, then a row number).
  let(:employee_grp) { df_emp.group_by(:employee).df }
  let(:mi_single) do
    DaruLite::MultiIndex.from_tuples(
      [
        ['Jane', 1], ['Jane', 4], ['John', 0],
        ['John', 3], ['Mark', 2], ['Mark', 5]
      ]
    )
  end

  let(:emp_month_grp) { df_emp.group_by(%i[employee month]).df }
  let(:mi_double) do
    DaruLite::MultiIndex.from_tuples(
      [
        ['Jane', 'July', 4], ['Jane', 'June', 1], ['John', 'July', 3],
        ['John', 'June', 0], ['Mark', 'July', 5], ['Mark', 'June', 2]
      ]
    )
  end

  let(:emp_month_salary_grp) do
    df_emp.group_by(%i[employee month salary]).df
  end
  let(:mi_triple) do
    DaruLite::MultiIndex.from_tuples(
      [
        ['Jane', 'July', 600, 4], ['Jane', 'June', 500, 1],
        ['John', 'July', 1200, 3], ['John', 'June', 1000, 0],
        ['Mark', 'July', 600, 5], ['Mark', 'June', 700, 2]
      ]
    )
  end

  it "groups by a single tuple" do
    expected_groups = {
      ['bar'] => [1, 3, 5],
      ['foo'] => [0, 2, 4, 6, 7]
    }

    expect(@sl_group.groups).to eq(expected_groups)
  end

  it "returns dataframe with MultiIndex, groups by single layer hierarchy" do
    remaining_columns = {
      month: %w[June July June July June July],
      salary: [500, 600, 1000, 1200, 700, 600]
    }
    expected = DaruLite::DataFrame.new(remaining_columns, index: mi_single)

    expect(employee_grp).to eq(expected)
  end

  it "returns dataframe with MultiIndex, groups by double layer hierarchy" do
    remaining_columns = { salary: [600, 500, 1200, 1000, 600, 700] }
    expected = DaruLite::DataFrame.new(remaining_columns, index: mi_double)

    expect(emp_month_grp).to eq(expected)
  end

  it "returns dataframe with MultiIndex, groups by triple layer hierarchy" do
    # Every column is a grouping key here, so no data columns remain.
    expected = DaruLite::DataFrame.new({}, index: mi_triple)

    expect(emp_month_salary_grp).to eq(expected)
  end

  it "groups by a double layer hierarchy" do
    expected_groups = {
      %w[foo one] => [0, 6],
      %w[bar one] => [1],
      %w[foo two] => [2, 4],
      %w[bar three] => [3],
      %w[bar two] => [5],
      %w[foo three] => [7]
    }

    expect(@dl_group.groups).to eq(expected_groups)
  end

  it "groups by a triple layer hierarchy" do
    expected_groups = {
      ['bar', 'one', 2] => [1],
      ['bar', 'three', 1] => [3],
      ['bar', 'two', 6] => [5],
      ['foo', 'one', 1] => [0],
      ['foo', 'one', 3] => [6],
      ['foo', 'three', 8] => [7],
      ['foo', 'two', 3] => [2, 4]
    }

    expect(@tl_group.groups).to eq(expected_groups)
  end

  it "raises error if a non-existent vector is passed as args" do
    # NOTE(review): a bare raise_error accepts any exception; pin the
    # expected error class once it has been confirmed against the library.
    expect { @df.group_by([:a, :ted]) }.to raise_error
  end
end
|
132
|
+
|
133
|
+
context "#size" do
  it "returns a vector containing the size of each group" do
    expected_sizes = DaruLite::Vector.new([1, 1, 1, 2, 1, 2], index: @dl_multi_index)

    expect(@dl_group.size).to eq(expected_sizes)
  end

  it "returns an empty vector if given an empty dataframe" do
    empty_frame = DaruLite::DataFrame.new({ a: [], b: [] })
    sizes = empty_frame.group_by(:a).size

    expect(sizes).to eq(DaruLite::Vector.new([]))
  end
end
|
143
|
+
|
144
|
+
context "#get_group" do
  it "returns the whole sub-group for single layer grouping" do
    bar_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar],
        b: %w[one three two],
        c: [2, 1, 6],
        d: [22, 44, 66]
      },
      index: [1, 3, 5]
    )

    expect(@sl_group.get_group(['bar'])).to eq(bar_rows)
  end

  it "returns the whole sub-group for double layer grouping" do
    bar_one_rows = DaruLite::DataFrame.new(
      { a: ['bar'], b: ['one'], c: [2], d: [22] },
      index: [1]
    )

    expect(@dl_group.get_group(%w[bar one])).to eq(bar_one_rows)
  end

  it "returns the whole sub-group for triple layer grouping" do
    foo_two_rows = DaruLite::DataFrame.new(
      {
        a: %w[foo foo],
        b: %w[two two],
        c: [3, 3],
        d: [33, 55]
      },
      index: [2, 4]
    )

    expect(@tl_group.get_group(['foo', 'two', 3])).to eq(foo_two_rows)
  end

  it "raises error for incomplete specification" do
    # NOTE(review): bare raise_error matches any exception; confirm the
    # intended error class and pin it.
    expect { @tl_group.get_group(['foo']) }.to raise_error
  end

  it "raises error for over specification" do
    # NOTE(review): bare raise_error matches any exception; confirm the
    # intended error class and pin it.
    expect { @sl_group.get_group(%w[bar one]) }.to raise_error
  end
end
|
187
|
+
|
188
|
+
context '#each_group' do
  it 'enumerates groups' do
    # Collect every frame yielded to the block.
    yielded = []
    @dl_group.each_group { |frame| yielded.push(frame) }

    first_expected = DaruLite::DataFrame.new(
      { a: ['bar'], b: ['one'], c: [2], d: [22] },
      index: [1]
    )

    expect(yielded.count).to eq 6
    expect(yielded).to all be_a(DaruLite::DataFrame)
    expect(yielded.first).to eq(first_expected)
  end
end
|
203
|
+
|
204
|
+
context '#each_group without block' do
  it 'enumerates groups' do
    # Called without a block, each_group returns an object the example enumerates.
    groups_enum = @dl_group.each_group

    last_expected = DaruLite::DataFrame.new(
      {
        a: %w[foo foo],
        b: %w[two two],
        c: [3, 3],
        d: [33, 55]
      },
      index: [2, 4]
    )

    expect(groups_enum.count).to eq 6
    expect(groups_enum).to all be_a(DaruLite::DataFrame)
    expect(groups_enum.to_a.last).to eq(last_expected)
  end
end
|
219
|
+
|
220
|
+
context '#first' do
  it 'gets the first row from each group' do
    leading_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo],
        b: %w[one three two one three two],
        c: [2, 1, 6, 1, 8, 3],
        d: [22, 44, 66, 11, 88, 33]
      },
      index: [1, 3, 5, 0, 7, 2]
    )

    expect(@dl_group.first).to eq(leading_rows)
  end
end
|
230
|
+
|
231
|
+
context '#last' do
  it 'gets the last row from each group' do
    trailing_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo],
        b: %w[one three two one three two],
        c: [2, 1, 6, 3, 8, 3],
        d: [22, 44, 66, 77, 88, 55]
      },
      index: [1, 3, 5, 6, 7, 4]
    )

    expect(@dl_group.last).to eq(trailing_rows)
  end
end
|
241
|
+
|
242
|
+
context "#mean" do
  it "computes mean of the numeric columns of a single layer group" do
    expected_means = DaruLite::DataFrame.new(
      { c: [3.0, 3.6], d: [44.0, 52.8] },
      index: @sl_index
    )

    expect(@sl_group.mean).to eq(expected_means)
  end

  it "computes mean of the numeric columns of a double layer group" do
    expected_means = DaruLite::DataFrame.new(
      {
        c: [2, 1, 6, 2, 8, 3],
        d: [22, 44, 66, 44, 88, 44]
      },
      index: @dl_multi_index
    )

    expect(@dl_group.mean).to eq(expected_means)
  end

  it "computes mean of the numeric columns of a triple layer group" do
    # :c is a grouping key at this depth, so only :d is expected.
    expected_means = DaruLite::DataFrame.new(
      { d: [22, 44, 66, 11, 77, 88, 44] },
      index: @tl_multi_index
    )

    expect(@tl_group.mean).to eq(expected_means)
  end
end
|
265
|
+
|
266
|
+
context "#sum" do
  it "calculates the sum of the numeric columns of a single layer group" do
    expected_totals = DaruLite::DataFrame.new(
      { c: [9, 18], d: [132, 264] },
      index: @sl_index
    )

    expect(@sl_group.sum).to eq(expected_totals)
  end

  it "calculates the sum of the numeric columns of a double layer group" do
    expected_totals = DaruLite::DataFrame.new(
      {
        c: [2, 1, 6, 4, 8, 6],
        d: [22, 44, 66, 88, 88, 88]
      },
      index: @dl_multi_index
    )

    expect(@dl_group.sum).to eq(expected_totals)
  end

  it "calculates the sum of the numeric columns of a triple layer group" do
    # :c is a grouping key at this depth, so only :d is expected.
    expected_totals = DaruLite::DataFrame.new(
      { d: [22, 44, 66, 11, 77, 88, 88] },
      index: @tl_multi_index
    )

    expect(@tl_group.sum).to eq(expected_totals)
  end
end
|
288
|
+
|
289
|
+
# Smoke check for the remaining numeric aggregations: each one must return a
# result indexed by the group keys at every grouping depth. The method name is
# interpolated into the description so that the four generated examples are
# distinguishable in the report (previously all were named "works somehow").
%i[median std max min].each do |numeric_method|
  it "works somehow (##{numeric_method})" do
    expect(@sl_group.send(numeric_method).index).to eq @sl_index
    expect(@dl_group.send(numeric_method).index).to eq @dl_multi_index
    expect(@tl_group.send(numeric_method).index).to eq @tl_multi_index
  end
end
|
296
|
+
|
297
|
+
context "#product" do
  # TODO: write expectations for each grouping depth.
  # The examples are declared without a block so RSpec reports them as pending
  # instead of letting an empty body pass vacuously.
  it "calculates product for single layer groups"

  it "calculates product for double layer groups"

  it "calculates product for triple layer groups"
end
|
310
|
+
|
311
|
+
context "#count" do
  it "counts the number of elements in a single layer group" do
    expected_counts = DaruLite::DataFrame.new(
      { b: [3, 5], c: [3, 5], d: [3, 5] },
      index: @sl_index
    )

    expect(@sl_group.count).to eq(expected_counts)
  end

  it "counts the number of elements in a double layer group" do
    expected_counts = DaruLite::DataFrame.new(
      {
        c: [1, 1, 1, 2, 1, 2],
        d: [1, 1, 1, 2, 1, 2]
      },
      index: @dl_multi_index
    )

    expect(@dl_group.count).to eq(expected_counts)
  end

  it "counts the number of elements in a triple layer group" do
    expected_counts = DaruLite::DataFrame.new(
      { d: [1, 1, 1, 1, 1, 1, 2] },
      index: @tl_multi_index
    )

    expect(@tl_group.count).to eq(expected_counts)
  end
end
|
333
|
+
|
334
|
+
context "#std" do
  # TODO: write expectations for each grouping depth.
  # The examples are declared without a block so RSpec reports them as pending
  # instead of letting an empty body pass vacuously.
  it "calculates sample standard deviation for single layer groups"

  it "calculates sample standard deviation for double layer groups"

  it "calculates sample standard deviation for triple layer groups"
end
|
347
|
+
|
348
|
+
context "#max" do
  # TODO: write expectations for each grouping depth.
  # The examples are declared without a block so RSpec reports them as pending
  # instead of letting an empty body pass vacuously.
  it "calculates max value for single layer groups"

  it "calculates max value for double layer groups"

  it "calculates max value for triple layer groups"
end
|
361
|
+
|
362
|
+
context "#min" do
  # TODO: write expectations for each grouping depth.
  # The examples are declared without a block so RSpec reports them as pending
  # instead of letting an empty body pass vacuously.
  it "calculates min value for single layer groups"

  it "calculates min value for double layer groups"

  it "calculates min value for triple layer groups"
end
|
375
|
+
|
376
|
+
context "#median" do
  # TODO: write expectations for each grouping depth.
  # The examples are declared without a block so RSpec reports them as pending
  # instead of letting an empty body pass vacuously.
  it "calculates median for single layer groups"

  it "calculates median for double layer groups"

  it "calculates median for triple layer groups"
end
|
389
|
+
|
390
|
+
context "#head" do
  it "returns first n rows of each single layer group" do
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar foo foo],
        b: %w[one three one two],
        c: [2, 1, 1, 3],
        d: [22, 44, 11, 33]
      },
      index: [1, 3, 0, 2]
    )

    expect(@sl_group.head(2)).to eq(expected_rows)
  end

  it "returns first n rows of each double layer group" do
    # No two-column group has more than two rows, so all eight rows are expected.
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo foo foo],
        b: %w[one three two one one three two two],
        c: [2, 1, 6, 1, 3, 8, 3, 3],
        d: [22, 44, 66, 11, 77, 88, 33, 55]
      },
      index: [1, 3, 5, 0, 6, 7, 2, 4]
    )

    expect(@dl_group.head(2)).to eq(expected_rows)
  end

  it "returns first n rows of each triple layer group" do
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo foo],
        b: %w[one three two one one three two],
        c: [2, 1, 6, 1, 3, 8, 3],
        d: [22, 44, 66, 11, 77, 88, 33]
      },
      index: [1, 3, 5, 0, 6, 7, 2]
    )

    expect(@tl_group.head(1)).to eq(expected_rows)
  end
end
|
418
|
+
|
419
|
+
context "#tail" do
  it "returns last n rows of each single layer group" do
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar foo],
        b: %w[two three],
        c: [6, 8],
        d: [66, 88]
      },
      index: [5, 7]
    )

    expect(@sl_group.tail(1)).to eq(expected_rows)
  end

  it "returns last n rows of each double layer group" do
    # No two-column group has more than two rows, so all eight rows are expected.
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo foo foo],
        b: %w[one three two one one three two two],
        c: [2, 1, 6, 1, 3, 8, 3, 3],
        d: [22, 44, 66, 11, 77, 88, 33, 55]
      },
      index: [1, 3, 5, 0, 6, 7, 2, 4]
    )

    expect(@dl_group.tail(2)).to eq(expected_rows)
  end

  it "returns last n rows of each triple layer group" do
    expected_rows = DaruLite::DataFrame.new(
      {
        a: %w[bar bar bar foo foo foo foo],
        b: %w[one three two one one three two],
        c: [2, 1, 6, 1, 3, 8, 3],
        d: [22, 44, 66, 11, 77, 88, 55]
      },
      index: [1, 3, 5, 0, 6, 7, 4]
    )

    expect(@tl_group.tail(1)).to eq(expected_rows)
  end
end
|
447
|
+
|
448
|
+
context "#[]" do
  # No examples exist for #[] yet. The placeholder is given a description so
  # the pending entry is identifiable in the report rather than anonymous.
  pending "has no examples yet"
end
|
451
|
+
|
452
|
+
context "#reduce" do
  it "returns a vector that concatenates strings in a group" do
    # Appends each row's :b value to the running string.
    append_b = ->(memo, row) { memo + row[:b] }
    expected = DaruLite::Vector.new(
      %w[onethreetwo onetwotwoonethree],
      index: @sl_index
    )

    expect(@sl_group.reduce('', &append_b)).to eq(expected)
  end

  it "works with multi-indexes" do
    append_b = ->(memo, row) { memo + row[:b] }
    expected = DaruLite::Vector.new(
      %w[one three two oneone three twotwo],
      index: @dl_multi_index
    )

    expect(@dl_group.reduce('', &append_b)).to eq(expected)
  end
end
|
464
|
+
|
465
|
+
context 'groups by first vector if no vector mentioned' do
  # group_by is called with no arguments; the examples compare the result
  # with the explicit grouping on :a built in the shared fixture.
  subject do
    @df.group_by
  end

  it { is_expected.to be_a(DaruLite::Core::GroupBy) }

  its(:groups) { is_expected.to eq(@sl_group.groups) }

  its(:size) { is_expected.to eq(@sl_group.size) }
end
|
472
|
+
|
473
|
+
context 'group and sum with numeric indices' do
  # A frame whose row index holds non-contiguous integers (2, 12, 23).
  let(:df) do
    DaruLite::DataFrame.new(
      { g: %w[a a a], num: [1, 2, 3] },
      index: [2, 12, 23]
    )
  end

  subject { df.group_by([:g]).sum }

  it { is_expected.to eq(DaruLite::DataFrame.new({ num: [6] }, index: ['a'])) }
end
|
480
|
+
|
481
|
+
context 'when dataframe tuples contain nils in mismatching positions' do
  # Three columns whose nil entries fall in different rows of each column.
  let(:df) do
    DaruLite::DataFrame.new(
      {
        'string1' => ["Color", "Color", "Color", "Color", nil, "Color", "Color", " Black and White"],
        'string2' => ["Test", "test2", nil, "test3", nil, "test", "test3", "test5"],
        'num' => [1, nil, 3, 4, 5, 6, 7, nil]
      }
    )
  end

  it 'groups by without errors' do
    # A negative raise_error with a specific class passes when any *other*
    # exception is raised, so it cannot prove that grouping succeeded.
    # Assert that nothing is raised at all.
    expect { df.group_by(df.vectors.map(&:to_s)) }.not_to raise_error
  end
end
|
497
|
+
|
498
|
+
context '#aggregate' do
|
499
|
+
# Six salary rows: three employees over two months.
let(:dataframe) do
  salary_rows = {
    employee: %w[John Jane Mark John Jane Mark],
    month: %w[June June June July July July],
    salary: [1000, 500, 700, 1200, 600, 600]
  }
  DaruLite::DataFrame.new(salary_rows)
end
|
504
|
+
context 'group and aggregate sum for particular single vector' do
  subject do
    dataframe.group_by([:employee]).aggregate(salary: :sum)
  end

  it do
    totals_per_employee = DaruLite::DataFrame.new(
      { salary: [1100, 2200, 1300] },
      index: %w[Jane John Mark]
    )

    is_expected.to eq totals_per_employee
  end
end
|
512
|
+
|
513
|
+
context 'group and aggregate sum and lambda function for vectors' do
  subject do
    join_months = ->(vec) { vec.to_a.join('/') }

    dataframe.group_by([:employee]).aggregate(salary: :sum, month: join_months)
  end

  it do
    expected = DaruLite::DataFrame.new(
      {
        salary: [1100, 2200, 1300],
        month: ['June/July', 'June/July', 'June/July']
      },
      index: %w[Jane John Mark],
      order: %i[salary month]
    )

    is_expected.to eq expected
  end
end
|
525
|
+
|
526
|
+
context 'group and aggregate sum and lambda functions on dataframe' do
  # :salary and :month name existing columns; :mean_salary and :periods do not
  # exist in the source frame and their lambdas read from the argument they
  # are given.
  subject do
    dataframe.group_by([:employee]).aggregate(
      salary: :sum,
      month: ->(vec) { vec.to_a.join('/') },
      mean_salary: ->(df) { df.salary.mean },
      periods: ->(df) { df.size }
    )
  end

  it do
    expected = DaruLite::DataFrame.new(
      {
        salary: [1100, 2200, 1300],
        month: ['June/July', 'June/July', 'June/July'],
        mean_salary: [550.0, 1100.0, 650.0],
        periods: [2, 2, 2]
      },
      index: %w[Jane John Mark],
      order: %i[salary month mean_salary periods]
    )

    is_expected.to eq expected
  end
end
|
542
|
+
|
543
|
+
context 'group_by and aggregate on mixed MultiIndex' do
  # Both names occur more than once.
  let(:df) do
    DaruLite::DataFrame.new(
      name: %w[Ram Krishna Ram Krishna Krishna],
      visited: %w[Hyderabad Delhi Mumbai Raipur Banglore]
    )
  end

  # 'Ram' occurs exactly once here.
  let(:df_mixed) do
    DaruLite::DataFrame.new(
      name: %w[Krishna Ram Krishna Krishna],
      visited: %w[Delhi Mumbai Raipur Banglore]
    )
  end

  it 'group_by' do
    expected_index = DaruLite::MultiIndex.from_tuples(
      [
        ['Krishna', 1], ['Krishna', 3], ['Krishna', 4],
        ['Ram', 0], ['Ram', 2]
      ]
    )
    expected = DaruLite::DataFrame.new(
      { visited: %w[Delhi Raipur Banglore Hyderabad Mumbai] },
      index: expected_index
    )

    expect(df.group_by(:name).df).to eq(expected)
  end

  it 'group_by and aggregate' do
    join_visits = ->(vec) { vec.to_a.join(',') }
    expected = DaruLite::DataFrame.new(
      { visited: ['Delhi,Raipur,Banglore', 'Hyderabad,Mumbai'] },
      index: %w[Krishna Ram]
    )

    aggregated = df.group_by(:name).aggregate(visited: join_visits)

    expect(aggregated).to eq(expected)
  end

  it 'group_by and aggregate when anyone index is not multiple times' do
    join_visits = ->(vec) { vec.to_a.join(',') }
    expected = DaruLite::DataFrame.new(
      { visited: ['Delhi,Raipur,Banglore', 'Mumbai'] },
      index: %w[Krishna Ram]
    )

    aggregated = df_mixed.group_by(:name).aggregate(visited: join_visits)

    expect(aggregated).to eq(expected)
  end
end
|
590
|
+
|
591
|
+
# Spending records, one row per entry: year, category, amount and a constant
# counter column of 1.
let(:spending_df) do
  records = [
    [2010, 'dev', 50, 1],
    [2010, 'dev', 150, 1],
    [2010, 'dev', 200, 1],
    [2011, 'dev', 50, 1],
    [2012, 'dev', 150, 1],

    [2011, 'office', 300, 1],

    [2010, 'market', 50, 1],
    [2011, 'market', 500, 1],
    [2012, 'market', 500, 1],
    [2012, 'market', 300, 1],

    [2012, 'R&D', 10, 1]
  ]

  DaruLite::DataFrame.rows(records, order: %i[year category spending nb_spending])
end
|
609
|
+
# (year, category) pairs expected as the index after grouping spending_df
# by both columns.
let(:multi_index_year_category) do
  year_category_pairs = [
    [2010, "dev"], [2010, "market"],
    [2011, "dev"], [2011, "market"], [2011, "office"],
    [2012, "R&D"], [2012, "dev"], [2012, "market"]
  ]

  DaruLite::MultiIndex.from_tuples(year_category_pairs)
end
|
615
|
+
|
616
|
+
context 'group_by and aggregate on multiple elements' do
  it 'does aggregate' do
    expected = DaruLite::DataFrame.new(
      { spending: [400, 50, 50, 500, 300, 10, 150, 800] },
      index: multi_index_year_category
    )

    aggregated = spending_df.group_by(%i[year category]).aggregate(spending: :sum)

    expect(aggregated).to eq(expected)
  end

  it 'works as older methods' do
    older_way = spending_df.group_by(%i[year category]).sum

    newer_way = spending_df
                .group_by(%i[year category])
                .aggregate(spending: :sum, nb_spending: :sum)
    expect(newer_way).to eq(older_way)

    sum_nb_spending = ->(df) { df[:nb_spending].sum }
    contrived_way = spending_df
                    .group_by(%i[year category])
                    .aggregate(spending: :sum, nb_spending_lambda: sum_nb_spending)
    # The return value of rename_vectors is discarded; the comparison below is
    # made against contrived_way itself, exactly as in the original example.
    contrived_way.rename_vectors(nb_spending_lambda: :nb_spending)
    expect(contrived_way).to eq(older_way)
  end

  context 'can aggregate on MultiIndex' do
    let(:multi_indexed_aggregated_df) do
      spending_df.group_by(%i[year category]).aggregate(spending: :sum)
    end
    let(:index_year) { DaruLite::Index.new([2010, 2011, 2012]) }
    let(:index_category) { DaruLite::Index.new(%w[dev market office R&D]) }

    it 'aggregates by default on the last layer of MultiIndex' do
      per_year = DaruLite::DataFrame.new({ spending: [450, 850, 960] }, index: index_year)

      expect(multi_indexed_aggregated_df.aggregate(spending: :sum)).to eq(per_year)
    end

    it 'can aggregate on the first layer of MultiIndex' do
      per_category = DaruLite::DataFrame.new(
        { spending: [600, 1350, 300, 10] },
        index: index_category
      )

      expect(multi_indexed_aggregated_df.aggregate({ spending: :sum }, 0)).to eq(per_category)
    end

    it 'does coercion: when one layer is remaining, MultiIndex is coerced in Index that does not aggregate anymore' do
      df_with_simple_index = multi_indexed_aggregated_df.aggregate(spending: :sum)
      aggregated_again = df_with_simple_index.aggregate(spending: :sum)

      expect(aggregated_again).to eq(df_with_simple_index)
    end
  end
end
|
654
|
+
end
|
655
|
+
end
|