daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,655 @@
|
|
|
1
|
+
describe DaruLite::Core::GroupBy do
|
|
2
|
+
before do
|
|
3
|
+
@df = DaruLite::DataFrame.new({
|
|
4
|
+
a: %w{foo bar foo bar foo bar foo foo},
|
|
5
|
+
b: %w{one one two three two two one three},
|
|
6
|
+
c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
7
|
+
d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
8
|
+
}, order: [:a, :b, :c, :d])
|
|
9
|
+
|
|
10
|
+
@sl_group = @df.group_by(:a)
|
|
11
|
+
@dl_group = @df.group_by([:a, :b])
|
|
12
|
+
@tl_group = @df.group_by([:a,:b,:c])
|
|
13
|
+
|
|
14
|
+
@sl_index = DaruLite::Index.new(['bar', 'foo'])
|
|
15
|
+
@dl_multi_index = DaruLite::MultiIndex.from_tuples([
|
|
16
|
+
['bar', 'one'],
|
|
17
|
+
['bar', 'three'],
|
|
18
|
+
['bar', 'two'],
|
|
19
|
+
['foo', 'one'],
|
|
20
|
+
['foo', 'three'],
|
|
21
|
+
['foo', 'two']
|
|
22
|
+
])
|
|
23
|
+
@tl_multi_index = DaruLite::MultiIndex.from_tuples([
|
|
24
|
+
['bar', 'one' , 2],
|
|
25
|
+
['bar', 'three', 1],
|
|
26
|
+
['bar', 'two' , 6],
|
|
27
|
+
['foo', 'one' , 1],
|
|
28
|
+
['foo', 'one' , 3],
|
|
29
|
+
['foo', 'three', 8],
|
|
30
|
+
['foo', 'two' , 3]
|
|
31
|
+
])
|
|
32
|
+
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
context 'with nil values' do
|
|
36
|
+
before do
|
|
37
|
+
@df[:w_nils] = DaruLite::Vector.new([11 ,nil ,33 ,nil ,nil ,66 ,77 ,88])
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
it 'groups by nil values' do
|
|
41
|
+
expect(@df.group_by(:w_nils).groups[[nil]]).to eq([1,3,4])
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "uses a multi-index when nils are part of the grouping keys" do
|
|
45
|
+
expect(@df.group_by(:a, :w_nils).send(:multi_indexed_grouping?)).to be true
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
context "#initialize" do
|
|
50
|
+
let(:df_emp) { DaruLite::DataFrame.new(
|
|
51
|
+
employee: %w[John Jane Mark John Jane Mark],
|
|
52
|
+
month: %w[June June June July July July],
|
|
53
|
+
salary: [1000, 500, 700, 1200, 600, 600]
|
|
54
|
+
) }
|
|
55
|
+
let(:employee_grp) { df_emp.group_by(:employee).df }
|
|
56
|
+
let(:mi_single) { DaruLite::MultiIndex.from_tuples([
|
|
57
|
+
['Jane', 1], ['Jane', 4], ['John', 0],
|
|
58
|
+
['John', 3], ['Mark', 2], ['Mark', 5]
|
|
59
|
+
]
|
|
60
|
+
)}
|
|
61
|
+
|
|
62
|
+
let(:emp_month_grp) { df_emp.group_by([:employee, :month]).df }
|
|
63
|
+
let(:mi_double) { DaruLite::MultiIndex.from_tuples([
|
|
64
|
+
['Jane', 'July', 4], ['Jane', 'June', 1], ['John', 'July', 3],
|
|
65
|
+
['John', 'June', 0], ['Mark', 'July', 5], ['Mark', 'June', 2]
|
|
66
|
+
]
|
|
67
|
+
)}
|
|
68
|
+
|
|
69
|
+
let(:emp_month_salary_grp) {
|
|
70
|
+
df_emp.group_by([:employee, :month, :salary]).df }
|
|
71
|
+
let(:mi_triple) { DaruLite::MultiIndex.from_tuples([
|
|
72
|
+
['Jane', 'July', 600, 4], ['Jane', 'June', 500, 1],
|
|
73
|
+
['John', 'July', 1200, 3], ['John', 'June', 1000, 0],
|
|
74
|
+
['Mark', 'July', 600, 5], ['Mark', 'June', 700, 2]
|
|
75
|
+
]
|
|
76
|
+
)}
|
|
77
|
+
|
|
78
|
+
it "groups by a single tuple" do
|
|
79
|
+
expect(@sl_group.groups).to eq({
|
|
80
|
+
['bar'] => [1,3,5],
|
|
81
|
+
['foo'] => [0,2,4,6,7]
|
|
82
|
+
})
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "returns dataframe with MultiIndex, groups by single layer hierarchy" do
|
|
86
|
+
expect(employee_grp).to eq(DaruLite::DataFrame.new({
|
|
87
|
+
month: ["June", "July", "June", "July", "June", "July"],
|
|
88
|
+
salary: [500, 600, 1000, 1200, 700, 600]
|
|
89
|
+
}, index: mi_single))
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it "returns dataframe with MultiIndex, groups by double layer hierarchy" do
|
|
93
|
+
expect(emp_month_grp).to eq(DaruLite::DataFrame.new({
|
|
94
|
+
salary: [600, 500, 1200, 1000, 600, 700]
|
|
95
|
+
}, index: mi_double))
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
it "returns dataframe with MultiIndex, groups by triple layer hierarchy" do
|
|
99
|
+
expect(emp_month_salary_grp).to eq(DaruLite::DataFrame.new({
|
|
100
|
+
}, index: mi_triple))
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it "groups by a double layer hierarchy" do
|
|
104
|
+
expect(@dl_group.groups).to eq({
|
|
105
|
+
['foo', 'one'] => [0,6],
|
|
106
|
+
['bar', 'one'] => [1],
|
|
107
|
+
['foo', 'two'] => [2,4],
|
|
108
|
+
['bar', 'three'] => [3],
|
|
109
|
+
['bar', 'two'] => [5],
|
|
110
|
+
['foo', 'three'] => [7]
|
|
111
|
+
})
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
it "groups by a triple layer hierarchy" do
|
|
115
|
+
expect(@tl_group.groups).to eq({
|
|
116
|
+
['bar', 'one' , 2] => [1],
|
|
117
|
+
['bar', 'three', 1] => [3],
|
|
118
|
+
['bar', 'two' , 6] => [5],
|
|
119
|
+
['foo', 'one' , 1] => [0],
|
|
120
|
+
['foo', 'one' , 3] => [6],
|
|
121
|
+
['foo', 'three', 8] => [7],
|
|
122
|
+
['foo', 'two' , 3] => [2,4]
|
|
123
|
+
})
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
it "raises error if a non-existent vector is passed as args" do
|
|
127
|
+
expect {
|
|
128
|
+
@df.group_by([:a, :ted])
|
|
129
|
+
}.to raise_error
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
context "#size" do
|
|
134
|
+
it "returns a vector containing the size of each group" do
|
|
135
|
+
expect(@dl_group.size).to eq(DaruLite::Vector.new([1,1,1,2,1,2], index: @dl_multi_index))
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it "returns an empty vector if given an empty dataframe" do
|
|
139
|
+
df = DaruLite::DataFrame.new({ a: [], b: [] })
|
|
140
|
+
expect(df.group_by(:a).size).to eq(DaruLite::Vector.new([]))
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
context "#get_group" do
|
|
145
|
+
it "returns the whole sub-group for single layer grouping" do
|
|
146
|
+
expect(@sl_group.get_group(['bar'])).to eq(DaruLite::DataFrame.new({
|
|
147
|
+
a: ['bar', 'bar', 'bar'],
|
|
148
|
+
b: ['one', 'three', 'two'],
|
|
149
|
+
c: [2,1,6],
|
|
150
|
+
d: [22,44,66]
|
|
151
|
+
}, index: [1,3,5]
|
|
152
|
+
))
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
it "returns the whole sub-group for double layer grouping" do
|
|
156
|
+
expect(@dl_group.get_group(['bar', 'one'])).to eq(DaruLite::DataFrame.new({
|
|
157
|
+
a: ['bar'],
|
|
158
|
+
b: ['one'],
|
|
159
|
+
c: [2],
|
|
160
|
+
d: [22]
|
|
161
|
+
}, index: [1]
|
|
162
|
+
))
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
it "returns the whole sub-group for triple layer grouping" do
|
|
166
|
+
expect(@tl_group.get_group(['foo','two',3])).to eq(DaruLite::DataFrame.new({
|
|
167
|
+
a: ['foo', 'foo'],
|
|
168
|
+
b: ['two', 'two'],
|
|
169
|
+
c: [3,3],
|
|
170
|
+
d: [33,55]
|
|
171
|
+
}, index: [2,4]
|
|
172
|
+
))
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
it "raises error for incomplete specification" do
|
|
176
|
+
expect {
|
|
177
|
+
@tl_group.get_group(['foo'])
|
|
178
|
+
}.to raise_error
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it "raises error for over specification" do
|
|
182
|
+
expect {
|
|
183
|
+
@sl_group.get_group(['bar', 'one'])
|
|
184
|
+
}.to raise_error
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
context '#each_group' do
|
|
189
|
+
it 'enumerates groups' do
|
|
190
|
+
ret = []
|
|
191
|
+
@dl_group.each_group { |g| ret << g }
|
|
192
|
+
expect(ret.count).to eq 6
|
|
193
|
+
expect(ret).to all be_a(DaruLite::DataFrame)
|
|
194
|
+
expect(ret.first).to eq(DaruLite::DataFrame.new({
|
|
195
|
+
a: ['bar'],
|
|
196
|
+
b: ['one'],
|
|
197
|
+
c: [2],
|
|
198
|
+
d: [22]
|
|
199
|
+
}, index: [1]
|
|
200
|
+
))
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
context '#each_group without block' do
|
|
205
|
+
it 'enumerates groups' do
|
|
206
|
+
enum = @dl_group.each_group
|
|
207
|
+
|
|
208
|
+
expect(enum.count).to eq 6
|
|
209
|
+
expect(enum).to all be_a(DaruLite::DataFrame)
|
|
210
|
+
expect(enum.to_a.last).to eq(DaruLite::DataFrame.new({
|
|
211
|
+
a: ['foo', 'foo'],
|
|
212
|
+
b: ['two', 'two'],
|
|
213
|
+
c: [3, 3],
|
|
214
|
+
d: [33, 55]
|
|
215
|
+
}, index: [2, 4]
|
|
216
|
+
))
|
|
217
|
+
end
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
context '#first' do
|
|
221
|
+
it 'gets the first row from each group' do
|
|
222
|
+
expect(@dl_group.first).to eq(DaruLite::DataFrame.new({
|
|
223
|
+
a: %w{bar bar bar foo foo foo },
|
|
224
|
+
b: %w{one three two one three two },
|
|
225
|
+
c: [2 ,1 ,6 ,1 ,8 ,3 ],
|
|
226
|
+
d: [22 ,44 ,66 ,11 ,88 ,33 ]
|
|
227
|
+
}, index: [1,3,5,0,7,2]))
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
context '#last' do
|
|
232
|
+
it 'gets the last row from each group' do
|
|
233
|
+
expect(@dl_group.last).to eq(DaruLite::DataFrame.new({
|
|
234
|
+
a: %w{bar bar bar foo foo foo },
|
|
235
|
+
b: %w{one three two one three two },
|
|
236
|
+
c: [2 ,1 ,6 ,3 ,8 ,3 ],
|
|
237
|
+
d: [22 ,44 ,66 ,77 ,88 ,55 ]
|
|
238
|
+
}, index: [1,3,5,6,7,4]))
|
|
239
|
+
end
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
context "#mean" do
|
|
243
|
+
it "computes mean of the numeric columns of a single layer group" do
|
|
244
|
+
expect(@sl_group.mean).to eq(DaruLite::DataFrame.new({
|
|
245
|
+
:c => [3.0, 3.6],
|
|
246
|
+
:d => [44.0, 52.8]
|
|
247
|
+
}, index: @sl_index
|
|
248
|
+
))
|
|
249
|
+
end
|
|
250
|
+
|
|
251
|
+
it "computes mean of the numeric columns of a double layer group" do
|
|
252
|
+
expect(@dl_group.mean).to eq(DaruLite::DataFrame.new({
|
|
253
|
+
c: [2,1,6,2,8,3],
|
|
254
|
+
d: [22,44,66,44,88,44]
|
|
255
|
+
}, index: @dl_multi_index))
|
|
256
|
+
end
|
|
257
|
+
|
|
258
|
+
it "computes mean of the numeric columns of a triple layer group" do
|
|
259
|
+
expect(@tl_group.mean).to eq(DaruLite::DataFrame.new({
|
|
260
|
+
d: [22,44,66,11,77,88,44]
|
|
261
|
+
}, index: @tl_multi_index
|
|
262
|
+
))
|
|
263
|
+
end
|
|
264
|
+
end
|
|
265
|
+
|
|
266
|
+
context "#sum" do
|
|
267
|
+
it "calculates the sum of the numeric columns of a single layer group" do
|
|
268
|
+
expect(@sl_group.sum).to eq(DaruLite::DataFrame.new({
|
|
269
|
+
c: [9, 18],
|
|
270
|
+
d: [132, 264]
|
|
271
|
+
}, index: @sl_index
|
|
272
|
+
))
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
it "calculates the sum of the numeric columns of a double layer group" do
|
|
276
|
+
expect(@dl_group.sum).to eq(DaruLite::DataFrame.new({
|
|
277
|
+
c: [2,1,6,4,8,6],
|
|
278
|
+
d: [22,44,66,88,88,88]
|
|
279
|
+
}, index: @dl_multi_index))
|
|
280
|
+
end
|
|
281
|
+
|
|
282
|
+
it "calculates the sum of the numeric columns of a triple layer group" do
|
|
283
|
+
expect(@tl_group.sum).to eq(DaruLite::DataFrame.new({
|
|
284
|
+
d: [22,44,66,11,77,88,88]
|
|
285
|
+
}, index: @tl_multi_index))
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
[:median, :std, :max, :min].each do |numeric_method|
|
|
290
|
+
it "works somehow" do
|
|
291
|
+
expect(@sl_group.send(numeric_method).index).to eq @sl_index
|
|
292
|
+
expect(@dl_group.send(numeric_method).index).to eq @dl_multi_index
|
|
293
|
+
expect(@tl_group.send(numeric_method).index).to eq @tl_multi_index
|
|
294
|
+
end
|
|
295
|
+
end
|
|
296
|
+
|
|
297
|
+
context "#product" do
|
|
298
|
+
it "calculates product for single layer groups" do
|
|
299
|
+
# TODO
|
|
300
|
+
end
|
|
301
|
+
|
|
302
|
+
it "calculates product for double layer groups" do
|
|
303
|
+
# TODO
|
|
304
|
+
end
|
|
305
|
+
|
|
306
|
+
it "calculates product for triple layer groups" do
|
|
307
|
+
# TODO
|
|
308
|
+
end
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
context "#count" do
|
|
312
|
+
it "counts the number of elements in a single layer group" do
|
|
313
|
+
expect(@sl_group.count).to eq(DaruLite::DataFrame.new({
|
|
314
|
+
b: [3,5],
|
|
315
|
+
c: [3,5],
|
|
316
|
+
d: [3,5]
|
|
317
|
+
}, index: @sl_index))
|
|
318
|
+
end
|
|
319
|
+
|
|
320
|
+
it "counts the number of elements in a double layer group" do
|
|
321
|
+
expect(@dl_group.count).to eq(DaruLite::DataFrame.new({
|
|
322
|
+
c: [1,1,1,2,1,2],
|
|
323
|
+
d: [1,1,1,2,1,2]
|
|
324
|
+
}, index: @dl_multi_index))
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
it "counts the number of elements in a triple layer group" do
|
|
328
|
+
expect(@tl_group.count).to eq(DaruLite::DataFrame.new({
|
|
329
|
+
d: [1,1,1,1,1,1,2]
|
|
330
|
+
}, index: @tl_multi_index))
|
|
331
|
+
end
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
context "#std" do
|
|
335
|
+
it "calculates sample standard deviation for single layer groups" do
|
|
336
|
+
# TODO
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
it "calculates sample standard deviation for double layer groups" do
|
|
340
|
+
# TODO
|
|
341
|
+
end
|
|
342
|
+
|
|
343
|
+
it "calculates sample standard deviation for triple layer groups" do
|
|
344
|
+
# TODO
|
|
345
|
+
end
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
context "#max" do
|
|
349
|
+
it "calculates max value for single layer groups" do
|
|
350
|
+
# TODO
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
it "calculates max value for double layer groups" do
|
|
354
|
+
# TODO
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
it "calculates max value for triple layer groups" do
|
|
358
|
+
# TODO
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
|
|
362
|
+
context "#min" do
|
|
363
|
+
it "calculates min value for single layer groups" do
|
|
364
|
+
# TODO
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
it "calculates min value for double layer groups" do
|
|
368
|
+
# TODO
|
|
369
|
+
end
|
|
370
|
+
|
|
371
|
+
it "calculates min value for triple layer groups" do
|
|
372
|
+
# TODO
|
|
373
|
+
end
|
|
374
|
+
end
|
|
375
|
+
|
|
376
|
+
context "#median" do
|
|
377
|
+
it "calculates median for single layer groups" do
|
|
378
|
+
# TODO
|
|
379
|
+
end
|
|
380
|
+
|
|
381
|
+
it "calculates median for double layer groups" do
|
|
382
|
+
# TODO
|
|
383
|
+
end
|
|
384
|
+
|
|
385
|
+
it "calculates median for triple layer groups" do
|
|
386
|
+
# TODO
|
|
387
|
+
end
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
context "#head" do
|
|
391
|
+
it "returns first n rows of each single layer group" do
|
|
392
|
+
expect(@sl_group.head(2)).to eq(DaruLite::DataFrame.new({
|
|
393
|
+
a: ['bar', 'bar','foo','foo'],
|
|
394
|
+
b: ['one', 'three','one', 'two'],
|
|
395
|
+
c: [2, 1, 1, 3],
|
|
396
|
+
d: [22, 44, 11, 33]
|
|
397
|
+
}, index: [1,3,0,2]))
|
|
398
|
+
end
|
|
399
|
+
|
|
400
|
+
it "returns first n rows of each double layer group" do
|
|
401
|
+
expect(@dl_group.head(2)).to eq(DaruLite::DataFrame.new({
|
|
402
|
+
a: ['bar','bar','bar','foo','foo','foo','foo','foo'],
|
|
403
|
+
b: ['one','three','two','one','one','three','two','two'],
|
|
404
|
+
c: [2,1,6,1,3,8,3,3],
|
|
405
|
+
d: [22,44,66,11,77,88,33,55]
|
|
406
|
+
}, index: [1,3,5,0,6,7,2,4]))
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
it "returns first n rows of each triple layer group" do
|
|
410
|
+
expect(@tl_group.head(1)).to eq(DaruLite::DataFrame.new({
|
|
411
|
+
a: ['bar','bar','bar','foo','foo','foo','foo'],
|
|
412
|
+
b: ['one','three','two','one','one','three','two'],
|
|
413
|
+
c: [2,1,6,1,3,8,3],
|
|
414
|
+
d: [22,44,66,11,77,88,33]
|
|
415
|
+
}, index: [1,3,5,0,6,7,2]))
|
|
416
|
+
end
|
|
417
|
+
end
|
|
418
|
+
|
|
419
|
+
context "#tail" do
|
|
420
|
+
it "returns last n rows of each single layer group" do
|
|
421
|
+
expect(@sl_group.tail(1)).to eq(DaruLite::DataFrame.new({
|
|
422
|
+
a: ['bar','foo'],
|
|
423
|
+
b: ['two', 'three'],
|
|
424
|
+
c: [6,8],
|
|
425
|
+
d: [66,88]
|
|
426
|
+
}, index: [5,7]))
|
|
427
|
+
end
|
|
428
|
+
|
|
429
|
+
it "returns last n rows of each double layer group" do
|
|
430
|
+
expect(@dl_group.tail(2)).to eq(DaruLite::DataFrame.new({
|
|
431
|
+
a: ['bar','bar','bar','foo','foo','foo','foo','foo'],
|
|
432
|
+
b: ['one','three','two','one','one','three','two','two'],
|
|
433
|
+
c: [2,1,6,1,3,8,3,3],
|
|
434
|
+
d: [22,44,66,11,77,88,33,55]
|
|
435
|
+
}, index: [1,3,5,0,6,7,2,4]))
|
|
436
|
+
end
|
|
437
|
+
|
|
438
|
+
it "returns last n rows of each triple layer group" do
|
|
439
|
+
expect(@tl_group.tail(1)).to eq(DaruLite::DataFrame.new({
|
|
440
|
+
a: ['bar','bar','bar','foo','foo','foo','foo'],
|
|
441
|
+
b: ['one','three','two','one','one','three','two'],
|
|
442
|
+
c: [2,1,6,1,3,8,3],
|
|
443
|
+
d: [22,44,66,11,77,88,55]
|
|
444
|
+
}, index: [1,3,5,0,6,7,4]))
|
|
445
|
+
end
|
|
446
|
+
end
|
|
447
|
+
|
|
448
|
+
context "#[]" do
|
|
449
|
+
pending
|
|
450
|
+
end
|
|
451
|
+
|
|
452
|
+
context "#reduce" do
|
|
453
|
+
it "returns a vector that concatenates strings in a group" do
|
|
454
|
+
string_concat = ->(result, row) { result + row[:b] } # appends the row's :b value to the accumulator
|
|
455
|
+
expect(@sl_group.reduce('', &string_concat)).to eq(DaruLite::Vector.new(['onethreetwo', 'onetwotwoonethree'], index: @sl_index))
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
it "works with multi-indexes" do
|
|
459
|
+
string_concat = ->(result, row) { result + row[:b] } # appends the row's :b value to the accumulator
|
|
460
|
+
expect(@dl_group.reduce('', &string_concat)).to eq \
|
|
461
|
+
DaruLite::Vector.new(['one', 'three', 'two', 'oneone', 'three', 'twotwo'], index: @dl_multi_index)
|
|
462
|
+
end
|
|
463
|
+
end
|
|
464
|
+
|
|
465
|
+
context 'groups by first vector if no vector mentioned' do
|
|
466
|
+
subject { @df.group_by }
|
|
467
|
+
|
|
468
|
+
it { is_expected.to be_a DaruLite::Core::GroupBy }
|
|
469
|
+
its(:groups) { is_expected.to eq @sl_group.groups }
|
|
470
|
+
its(:size) { is_expected.to eq @sl_group.size }
|
|
471
|
+
end
|
|
472
|
+
|
|
473
|
+
context 'group and sum with numeric indices' do
|
|
474
|
+
let(:df) { DaruLite::DataFrame.new({ g: ['a','a','a'], num: [1,2,3]}, index: [2,12,23]) }
|
|
475
|
+
|
|
476
|
+
subject { df.group_by([:g]).sum }
|
|
477
|
+
|
|
478
|
+
it { is_expected.to eq DaruLite::DataFrame.new({num: [6]}, index: ['a']) }
|
|
479
|
+
end
|
|
480
|
+
|
|
481
|
+
context 'when dataframe tuples contain nils in mismatching positions' do
|
|
482
|
+
|
|
483
|
+
let(:df){
|
|
484
|
+
DaruLite::DataFrame.new(
|
|
485
|
+
{
|
|
486
|
+
'string1' => ["Color", "Color", "Color", "Color", nil, "Color", "Color", " Black and White"],
|
|
487
|
+
'string2' => ["Test", "test2", nil, "test3", nil, "test", "test3", "test5"],
|
|
488
|
+
'num' => [1, nil, 3, 4, 5, 6, 7, nil]
|
|
489
|
+
}
|
|
490
|
+
)
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
it 'groups by without errors' do
|
|
494
|
+
expect { df.group_by(df.vectors.map(&:to_s)) }.not_to raise_error # any exception, not only ArgumentError, fails the example
|
|
495
|
+
end
|
|
496
|
+
end
|
|
497
|
+
|
|
498
|
+
context '#aggregate' do
|
|
499
|
+
let(:dataframe) { DaruLite::DataFrame.new({
|
|
500
|
+
employee: %w[John Jane Mark John Jane Mark],
|
|
501
|
+
month: %w[June June June July July July],
|
|
502
|
+
salary: [1000, 500, 700, 1200, 600, 600]})
|
|
503
|
+
}
|
|
504
|
+
context 'group and aggregate sum for particular single vector' do
|
|
505
|
+
subject { dataframe.group_by([:employee]).aggregate(salary: :sum) }
|
|
506
|
+
|
|
507
|
+
it { is_expected.to eq DaruLite::DataFrame.new({
|
|
508
|
+
salary: [1100, 2200, 1300]},
|
|
509
|
+
index: ['Jane', 'John', 'Mark'])
|
|
510
|
+
}
|
|
511
|
+
end
|
|
512
|
+
|
|
513
|
+
context 'group and aggregate sum and lambda function for vectors' do
|
|
514
|
+
subject { dataframe.group_by([:employee]).aggregate(
|
|
515
|
+
salary: :sum,
|
|
516
|
+
month: ->(vec) { vec.to_a.join('/') }) }
|
|
517
|
+
|
|
518
|
+
it { is_expected.to eq DaruLite::DataFrame.new({
|
|
519
|
+
salary: [1100, 2200, 1300],
|
|
520
|
+
month: ['June/July', 'June/July', 'June/July']},
|
|
521
|
+
index: ['Jane', 'John', 'Mark'],
|
|
522
|
+
order: [:salary, :month])
|
|
523
|
+
}
|
|
524
|
+
end
|
|
525
|
+
|
|
526
|
+
context 'group and aggregate sum and lambda functions on dataframe' do
|
|
527
|
+
subject { dataframe.group_by([:employee]).aggregate(
|
|
528
|
+
salary: :sum,
|
|
529
|
+
month: ->(vec) { vec.to_a.join('/') },
|
|
530
|
+
mean_salary: ->(df) { df.salary.mean },
|
|
531
|
+
periods: ->(df) { df.size }
|
|
532
|
+
)}
|
|
533
|
+
|
|
534
|
+
it { is_expected.to eq DaruLite::DataFrame.new({
|
|
535
|
+
salary: [1100, 2200, 1300],
|
|
536
|
+
month: ['June/July', 'June/July', 'June/July'],
|
|
537
|
+
mean_salary: [550.0, 1100.0, 650.0],
|
|
538
|
+
periods: [2, 2, 2]},
|
|
539
|
+
index: ['Jane', 'John', 'Mark'], order: [:salary, :month,
|
|
540
|
+
:mean_salary, :periods]) }
|
|
541
|
+
end
|
|
542
|
+
|
|
543
|
+
context 'group_by and aggregate on mixed MultiIndex' do
|
|
544
|
+
let(:df) { DaruLite::DataFrame.new(
|
|
545
|
+
name: ['Ram','Krishna','Ram','Krishna','Krishna'],
|
|
546
|
+
visited: [
|
|
547
|
+
'Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore']
|
|
548
|
+
)
|
|
549
|
+
}
|
|
550
|
+
let(:df_mixed) { DaruLite::DataFrame.new(
|
|
551
|
+
name: ['Krishna','Ram','Krishna','Krishna'],
|
|
552
|
+
visited: [
|
|
553
|
+
'Delhi', 'Mumbai', 'Raipur', 'Banglore']
|
|
554
|
+
)
|
|
555
|
+
}
|
|
556
|
+
it 'group_by' do
|
|
557
|
+
expect(df.group_by(:name).df).to eq(
|
|
558
|
+
DaruLite::DataFrame.new({
|
|
559
|
+
visited: ['Delhi', 'Raipur', 'Banglore', 'Hyderabad', 'Mumbai']},
|
|
560
|
+
index: DaruLite::MultiIndex.from_tuples(
|
|
561
|
+
[['Krishna', 1], ['Krishna', 3], ['Krishna', 4],
|
|
562
|
+
['Ram', 0], ['Ram', 2]]
|
|
563
|
+
)
|
|
564
|
+
)
|
|
565
|
+
)
|
|
566
|
+
end
|
|
567
|
+
|
|
568
|
+
it 'group_by and aggregate' do
|
|
569
|
+
expect(
|
|
570
|
+
df.group_by(:name).aggregate(
|
|
571
|
+
visited: -> (vec){vec.to_a.join(',')})).to eq(
|
|
572
|
+
DaruLite::DataFrame.new({
|
|
573
|
+
visited: ['Delhi,Raipur,Banglore', 'Hyderabad,Mumbai']},
|
|
574
|
+
index: ['Krishna', 'Ram']
|
|
575
|
+
)
|
|
576
|
+
)
|
|
577
|
+
end
|
|
578
|
+
|
|
579
|
+
it 'group_by and aggregate when anyone index is not multiple times' do
|
|
580
|
+
expect(
|
|
581
|
+
df_mixed.group_by(:name).aggregate(
|
|
582
|
+
visited: -> (vec){vec.to_a.join(',')})).to eq(
|
|
583
|
+
DaruLite::DataFrame.new({
|
|
584
|
+
visited: ['Delhi,Raipur,Banglore', 'Mumbai']},
|
|
585
|
+
index: ['Krishna', 'Ram']
|
|
586
|
+
)
|
|
587
|
+
)
|
|
588
|
+
end
|
|
589
|
+
end
|
|
590
|
+
|
|
591
|
+
let(:spending_df) {
|
|
592
|
+
DaruLite::DataFrame.rows([
|
|
593
|
+
[2010, 'dev', 50, 1],
|
|
594
|
+
[2010, 'dev', 150, 1],
|
|
595
|
+
[2010, 'dev', 200, 1],
|
|
596
|
+
[2011, 'dev', 50, 1],
|
|
597
|
+
[2012, 'dev', 150, 1],
|
|
598
|
+
|
|
599
|
+
[2011, 'office', 300, 1],
|
|
600
|
+
|
|
601
|
+
[2010, 'market', 50, 1],
|
|
602
|
+
[2011, 'market', 500, 1],
|
|
603
|
+
[2012, 'market', 500, 1],
|
|
604
|
+
[2012, 'market', 300, 1],
|
|
605
|
+
|
|
606
|
+
[2012, 'R&D', 10, 1],],
|
|
607
|
+
order: [:year, :category, :spending, :nb_spending])
|
|
608
|
+
}
|
|
609
|
+
let(:multi_index_year_category) {
|
|
610
|
+
DaruLite::MultiIndex.from_tuples([
|
|
611
|
+
[2010, "dev"], [2010, "market"],
|
|
612
|
+
[2011, "dev"], [2011, "market"], [2011, "office"],
|
|
613
|
+
[2012, "R&D"], [2012, "dev"], [2012, "market"]])
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
context 'group_by and aggregate on multiple elements' do
|
|
617
|
+
it 'does aggregate' do
|
|
618
|
+
expect(spending_df.group_by([:year, :category]).aggregate(spending: :sum)).to eq(
|
|
619
|
+
DaruLite::DataFrame.new({spending: [400, 50, 50, 500, 300, 10, 150, 800]}, index: multi_index_year_category))
|
|
620
|
+
end
|
|
621
|
+
|
|
622
|
+
it 'works as older methods' do
|
|
623
|
+
older_way = spending_df.group_by([:year, :category]).sum
|
|
624
|
+
|
|
625
|
+
newer_way = spending_df.group_by([:year, :category]).aggregate(spending: :sum, nb_spending: :sum)
|
|
626
|
+
expect(newer_way).to eq(older_way)
|
|
627
|
+
|
|
628
|
+
contrived_way = spending_df.group_by([:year, :category]).aggregate(spending: :sum, nb_spending_lambda: ->(df) { df[:nb_spending].sum })
|
|
629
|
+
contrived_way.rename_vectors(nb_spending_lambda: :nb_spending)
|
|
630
|
+
expect(contrived_way).to eq(older_way)
|
|
631
|
+
end
|
|
632
|
+
|
|
633
|
+
context 'can aggregate on MultiIndex' do
|
|
634
|
+
let(:multi_indexed_aggregated_df) { spending_df.group_by([:year, :category]).aggregate(spending: :sum) }
|
|
635
|
+
let(:index_year) { DaruLite::Index.new([2010, 2011, 2012]) }
|
|
636
|
+
let(:index_category) { DaruLite::Index.new(["dev", "market", "office", "R&D"]) }
|
|
637
|
+
|
|
638
|
+
it 'aggregates by default on the last layer of MultiIndex' do
|
|
639
|
+
expect(multi_indexed_aggregated_df.aggregate(spending: :sum)).to eq(
|
|
640
|
+
DaruLite::DataFrame.new({spending: [450, 850, 960]}, index: index_year))
|
|
641
|
+
end
|
|
642
|
+
|
|
643
|
+
it 'can aggregate on the first layer of MultiIndex' do
|
|
644
|
+
expect(multi_indexed_aggregated_df.aggregate({spending: :sum},0)).to eq(
|
|
645
|
+
DaruLite::DataFrame.new({spending: [600, 1350, 300, 10]}, index: index_category))
|
|
646
|
+
end
|
|
647
|
+
|
|
648
|
+
it 'does coercion: when one layer is remaining, MultiIndex is coerced in Index that does not aggregate anymore' do
|
|
649
|
+
df_with_simple_index = multi_indexed_aggregated_df.aggregate(spending: :sum)
|
|
650
|
+
expect(df_with_simple_index.aggregate(spending: :sum)).to eq(df_with_simple_index)
|
|
651
|
+
end
|
|
652
|
+
end
|
|
653
|
+
end
|
|
654
|
+
end
|
|
655
|
+
end
|