daru_lite 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
@@ -0,0 +1,34 @@
|
|
1
|
+
module DaruLite
  # Display options shared by the inspection/rendering code.
  #
  # Any object that extends this module gains one reader/writer pair per entry
  # of INSPECT_OPTIONS_KEYS and is immediately seeded with the default values.
  module Configuration
    # Names of the configurable options:
    #   :max_rows - labelled "Jupyter" in the defaults below
    #   :spacing  - labelled "Terminal" in the defaults below
    INSPECT_OPTIONS_KEYS = %i[max_rows spacing].freeze

    # Jupyter
    DEFAULT_MAX_ROWS = 30

    # Terminal
    DEFAULT_SPACING = 10

    INSPECT_OPTIONS_KEYS.each { |option| attr_accessor option }

    # Hook invoked by Ruby when an object extends this module; it loads the
    # defaults into the extending object.
    def self.extended(base)
      base.reset_options
    end

    # Yields the receiver so options can be assigned inside a block.
    #
    # @yieldparam config [Object] the object being configured (self)
    # @return whatever the block returns
    def configure
      yield(self)
    end

    # Puts every option back to its DEFAULT_* value.
    def reset_options
      self.max_rows = DEFAULT_MAX_ROWS
      self.spacing = DEFAULT_SPACING
    end
  end

  extend Configuration
end
|
@@ -0,0 +1,403 @@
|
|
1
|
+
module DaruLite
  module Core
    # Groups the rows of a dataframe (the "context") by the values found in
    # one or more of its vectors.
    #
    # Internally the grouping is stored as @groups_by_pos, a hash mapping each
    # group tuple to the row *positions* belonging to it. #groups translates
    # those positions into index *labels* on demand.
    class GroupBy
      class << self
        extend Gem::Deprecate

        # @private
        # Collects, for every distinct index value, the positions at which it
        # occurs.
        #
        # @param indexes_with_positions [#each] yields [index_value, position] pairs
        # @param sort [Boolean] when true the keys are reordered with TUPLE_SORTER
        # @return [Hash] index value => Array of positions
        def group_by_index_to_positions(indexes_with_positions, sort: false)
          index_to_positions = {}

          indexes_with_positions.each do |idx, position|
            (index_to_positions[idx] ||= []) << position
          end

          if sort # TODO: maybe add a more "stable" sorting option?
            sorted_keys = index_to_positions.keys.sort(&DaruLite::Core::GroupBy::TUPLE_SORTER)
            index_to_positions = sorted_keys.to_h { |k| [k, index_to_positions[k]] }
          end

          index_to_positions
        end
        alias get_positions_group_map_on group_by_index_to_positions
        deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10

        # @private
        # Groups the positions of a multi index after removing one of its layers.
        #
        # @param multi_index [DaruLite::MultiIndex]
        # @param level [Integer] layer handed to MultiIndex#remove_layer
        # @raise [RuntimeError] if multi_index is not a DaruLite::MultiIndex
        def get_positions_group_for_aggregation(multi_index, level = -1)
          unless multi_index.is_a?(DaruLite::MultiIndex)
            # Still a RuntimeError (as the former bare `raise`), now with a message.
            raise "expected a DaruLite::MultiIndex, got #{multi_index.class}"
          end

          new_index = multi_index.dup
          new_index.remove_layer(level) # TODO: recheck code of DaruLite::MultiIndex#remove_layer

          group_by_index_to_positions(new_index.each_with_index)
        end

        # @private
        # Builds the group tuple => positions map for the rows of df, using the
        # values of the group_by_keys vectors as the group tuple.
        def get_positions_group_map_for_df(df, group_by_keys, sort: true)
          indexes_with_positions = df[*group_by_keys].to_df.each_row.map(&:to_a).each_with_index

          group_by_index_to_positions(indexes_with_positions, sort: sort)
        end

        # @private
        # Translates every position of a positions map into the label found at
        # that position of index (via index.at).
        def group_map_from_positions_to_indexes(positions_group_map, index)
          positions_group_map.transform_values { |positions| positions.map { |pos| index.at(pos) } }
        end

        # @private
        # Builds the grouped dataframe: rows are reordered group by group and
        # indexed by a multi index made of the group tuple plus the group value.
        #
        # @return [DaruLite::DataFrame, nil] nil when group_map is empty
        def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
          return nil if group_map == {}

          new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
          new_index = DaruLite::MultiIndex.from_tuples(new_index)

          return DaruLite::DataFrame.new({}, index: new_index) if remaining_vectors == []

          new_rows_order = group_map.values.flatten
          new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
          new_df.index = new_index

          new_df
        end
      end

      # The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
      attr_reader :group_vectors, :non_group_vectors

      # lazy accessor/attr_reader for the attribute groups
      # (group tuple => index labels of the rows in that group)
      def groups
        @groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
      end
      alias groups_by_idx groups

      # lazy accessor/attr_reader for the attribute df
      def df
        @df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
      end
      alias grouped_df df

      # Iterate over each group created by group_by. A DataFrame is yielded in
      # block. Returns an Enumerator when no block is given.
      def each_group
        return to_enum(:each_group) unless block_given?

        groups.each_key do |k|
          yield get_group(k)
        end
      end

      # Comparator used to order group tuples: a nil tuple sorts after a
      # non-nil one; otherwise nil elements are dropped, shorter tuples sort
      # first, and equal-length tuples are compared element-wise (tuples that
      # cannot be compared are treated as equal).
      TUPLE_SORTER = lambda do |left, right|
        return -1 unless right
        return 1 unless left

        left = left.compact
        right = right.compact
        return left <=> right || 0 if left.length == right.length

        left.length <=> right.length
      end

      # @param context [DaruLite::DataFrame] the dataframe being grouped
      # @param names [Array] names of the vectors to group by
      def initialize(context, names)
        @group_vectors = names
        @non_group_vectors = context.vectors.to_a - names

        @context = context # TODO: maybe rename in @original_df

        # FIXME: It feels like we don't want to sort here. Ruby's #group_by
        # never sorts:
        #
        #   ['test', 'me', 'please'].group_by(&:size)
        #   # => {4=>["test"], 2=>["me"], 6=>["please"]}
        #
        # - zverok, 2016-09-12
        @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
      end

      # Get a DaruLite::Vector of the size of each group.
      def size
        index = get_grouped_index

        values = @groups_by_pos.values.map(&:size)
        DaruLite::Vector.new(values, index: index, name: :size)
      end

      # Get the first row of each group (same as head(1)).
      def first
        head(1)
      end

      # Get the last row of each group (same as tail(1)).
      def last
        tail(1)
      end

      # Get the first 'n' rows of each group.
      # @param quantity [Integer] (5) The number of rows taken from each group.
      # @example Usage of head
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).head(1)
      #   # =>
      #   # #<DaruLite::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
      #   #            a     b     c     d
      #   #     1    bar   one     2    22
      #   #     3    bar three     1    44
      #   #     5    bar   two     6    66
      #   #     0    foo   one     1    11
      #   #     7    foo three     8    88
      #   #     2    foo   two     3    33
      def head(quantity = 5)
        select_groups_from :first, quantity
      end

      # Get the last 'n' rows of each group.
      # @param quantity [Integer] (5) The number of rows taken from each group.
      # @example Usage of tail
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).tail(1)
      #   # =>
      #   # #<DaruLite::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
      #   #            a     b     c     d
      #   #     1    bar   one     2    22
      #   #     3    bar three     1    44
      #   #     5    bar   two     6    66
      #   #     6    foo   one     3    77
      #   #     7    foo three     8    88
      #   #     4    foo   two     3    55
      def tail(quantity = 5)
        select_groups_from :last, quantity
      end

      # Calculate mean of numeric groups, excluding missing values.
      # @example Usage of mean
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).mean
      #   # =>
      #   # #<DaruLite::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
      #   #                        c          d
      #   # ["bar", "one"]         2         22
      #   # ["bar", "three"]       1         44
      #   # ["bar", "two"]         6         66
      #   # ["foo", "one"]       2.0       44.0
      #   # ["foo", "three"]       8         88
      #   # ["foo", "two"]       3.0       44.0
      def mean
        apply_method :numeric, :mean
      end

      # Calculate the median of numeric groups, excluding missing values.
      def median
        apply_method :numeric, :median
      end

      # Calculate sum of numeric groups, excluding missing values.
      def sum
        apply_method :numeric, :sum
      end

      # Count the rows of each group; every non-group vector of the result
      # holds the group sizes returned by #size.
      # NOTE(review): the sizes come from row positions, so missing values do
      # not appear to be excluded here - confirm against the intended contract.
      # @example Using count
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).count
      #   # =>
      #   # #<DaruLite::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
      #   #                        c          d
      #   # ["bar", "one"]         1          1
      #   # ["bar", "three"]       1          1
      #   # ["bar", "two"]         1          1
      #   # ["foo", "one"]         2          2
      #   # ["foo", "three"]       1          1
      #   # ["foo", "two"]         2          2
      def count
        width = @non_group_vectors.size
        DaruLite::DataFrame.new([size] * width, order: @non_group_vectors)
      end

      # Calculate sample standard deviation of numeric vector groups, excluding
      # missing values.
      def std
        apply_method :numeric, :std
      end

      # Find the max element of each numeric vector group.
      def max
        apply_method :numeric, :max
      end

      # Find the min element of each numeric vector group.
      def min
        apply_method :numeric, :min
      end

      # Returns one of the selected groups as a DataFrame.
      # @param group [Array] The group that is to be selected from those grouped.
      #
      # @example Getting a group
      #
      #   df = DaruLite::DataFrame.new({
      #     a: %w{foo bar foo bar foo bar foo foo},
      #     b: %w{one one two three two two one three},
      #     c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
      #     d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
      #   })
      #   df.group_by([:a, :b]).get_group ['bar','two']
      #   #=>
      #   ##<DaruLite::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
      #   #            a     b     c     d
      #   #     5    bar   two     6    66
      def get_group(group)
        # Rows are looked up by *position* in the transposed data, while the
        # resulting frame is labelled with the matching index *labels*. Using
        # the labels as array offsets only worked for a default 0..n-1 index.
        positions = @groups_by_pos[group]
        indexes   = groups_by_idx[group]
        table     = @context.each_vector.map(&:to_a).transpose
        rows      = positions.map { |pos| table[pos] }

        DaruLite::DataFrame.rows(
          rows, index: indexes, order: @context.vectors
        )
      end

      # Iteratively applies a function to the values in a group and accumulates the result.
      # @param init (nil) The initial value of the accumulator.
      # @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
      #                          is the accumulated result. The second argument is a DataFrame row.
      # @return [DaruLite::Vector] one accumulated value per group
      # @example Usage of reduce
      #   df = DaruLite::DataFrame.new({
      #     a: ['a','b'] * 3,
      #     b: [1,2,3] * 2,
      #     c: 'A'..'F'
      #   })
      #   df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
      #   # =>
      #   # #<DaruLite::Vector:70343147159900 @name = nil @size = 2 >
      #   #     nil
      #   #   a ACE
      #   #   b BDF
      def reduce(init = nil)
        # groups_by_idx already holds index labels, so they are passed to
        # @context.row directly. Translating them a second time through
        # index.to_a treated labels as positions and broke non-default indexes.
        result_hash = groups_by_idx.each_with_object({}) do |(group, labels), h|
          grouped_result = init
          labels.each do |label|
            grouped_result = yield(grouped_result, @context.row[label])
          end

          h[group] = grouped_result
        end

        index = get_grouped_index(result_hash.keys)

        DaruLite::Vector.new(result_hash.values, index: index)
      end

      # Delegates to the inspect output of the grouped dataframe.
      def inspect
        grouped_df.inspect
      end

      # Function to use for aggregating the data.
      # `group_by` is using DaruLite::DataFrame#aggregate
      #
      # @param options [Hash] options for column, you want in resultant dataframe
      #
      # @return [DaruLite::DataFrame]
      #
      # @example
      #
      #   df = DaruLite::DataFrame.new(
      #     name: ['Ram','Krishna','Ram','Krishna','Krishna'],
      #     visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
      #
      #   => #<DaruLite::DataFrame(5x2)>
      #               name   visited
      #          0     Ram Hyderabad
      #          1 Krishna     Delhi
      #          2     Ram    Mumbai
      #          3 Krishna    Raipur
      #          4 Krishna  Banglore
      #
      #   df.group_by(:name)
      #   => #<DaruLite::DataFrame(5x1)>
      #                          visited
      #        Krishna         1     Delhi
      #                        3    Raipur
      #                        4  Banglore
      #            Ram         0 Hyderabad
      #                        2    Mumbai
      #
      #   df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
      #   => #<DaruLite::DataFrame(2x1)>
      #               visited
      #    Krishna Delhi,Raipur,Banglore
      #        Ram Hyderabad,Mumbai
      #
      def aggregate(options = {})
        new_index = get_grouped_index

        @context.aggregate(options) { [@groups_by_pos.values, new_index] }
      end

      private

      # Collects, for every group, the rows picked by calling `method`
      # (:first or :last) with `quantity` on the group's index labels.
      def select_groups_from(method, quantity)
        rows = []
        indexes = []

        groups_by_idx.each_value do |group_indexes|
          group_indexes.public_send(method, quantity).each do |idx|
            rows << @context.row[idx].to_a
            indexes << idx
          end
        end
        indexes.flatten!

        DaruLite::DataFrame.rows(rows, order: @context.vectors, index: indexes)
      end

      # Names of the non-group vectors whose type is :numeric.
      def select_numeric_non_group_vectors
        @non_group_vectors.select { |ngvec| @context[ngvec].type == :numeric }
      end

      # Aggregates every numeric non-group vector with `method`.
      # Only the :numeric method_type is implemented.
      def apply_method(method_type, method)
        raise 'To implement' if method_type != :numeric

        aggregation_options = select_numeric_non_group_vectors.to_h { |k| [k, method] }

        aggregate(aggregation_options)
      end

      # Builds the index of an aggregated result: a MultiIndex when grouping
      # on several vectors, otherwise a plain Index of the flattened tuples.
      def get_grouped_index(index_tuples = nil)
        index_tuples = @groups_by_pos.keys if index_tuples.nil?

        if multi_indexed_grouping?
          DaruLite::MultiIndex.from_tuples(index_tuples)
        else
          DaruLite::Index.new(index_tuples.flatten)
        end
      end

      # True when the first group tuple has more than one element.
      def multi_indexed_grouping?
        first_key = @groups_by_pos.keys.first
        return false unless first_key

        first_key.size > 1
      end
    end
  end
end
|
@@ -0,0 +1,270 @@
|
|
1
|
+
module DaruLite
  module Core
    # Joins two dataframes on a list of shared vectors (`on`).
    #
    # Both frames are converted to arrays of row hashes and sorted by their
    # key values; #join then walks both arrays from the front, repeatedly
    # comparing the first key of each side and consuming rows as it goes.
    class MergeFrame
      # Placeholder substituted for nil key values. It reports itself as nil,
      # never equals anything (so rows with nil keys never match each other)
      # and orders before any non-nil value.
      class NilSorter
        include Comparable

        def nil?
          true
        end

        def ==(_other)
          false
        end

        def <=>(other)
          other.nil? ? 0 : -1
        end
      end

      # quick-fix for issue #171
      #
      # @param left_df [DaruLite::DataFrame]
      # @param right_df [DaruLite::DataFrame]
      # @param opts [Hash] :on (Array of vector names), :how (:inner, :left,
      #   :right or :outer) and optionally :indicator (name of an extra vector
      #   recording :both / :left_only / :right_only)
      # @raise [ArgumentError] for an unknown :how or an :on vector missing
      #   from either dataframe
      def initialize(left_df, right_df, opts = {})
        init_opts(opts)
        validate_on!(left_df, right_df)
        key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }

        @left = df_to_a(left_df)
        @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
        @left_key_values = @left.map(&key_sanitizer)

        @right = df_to_a(right_df)
        @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
        @right_key_values = @right.map(&key_sanitizer)

        @left_keys, @right_keys = merge_keys(left_df, right_df, on)
      end

      # Runs the merge, consuming the internal row arrays.
      #
      # @return [DaruLite::DataFrame] the joined rows
      def join
        res = []

        until left.empty? && right.empty?
          lkey = first_left_key
          rkey = first_right_key

          # row returns nil for rows dropped by the join type.
          merged = row(lkey, rkey)
          res << merged if merged
        end

        DaruLite::DataFrame.new(res, order: dataframe_vector_names)
      end

      private

      attr_reader :on, :indicator,
                  :left, :left_key_values, :keep_left, :left_keys,
                  :right, :right_key_values, :keep_right, :right_keys

      attr_accessor :merge_key

      # Join type => [keep unmatched left rows, keep unmatched right rows]
      LEFT_RIGHT_COMBINATIONS = {
        #        left   right
        inner: [false, false],
        left: [true, false],
        right: [false, true],
        outer: [true, true]
      }.freeze

      # Reads :on, :how and :indicator from the options hash.
      def init_opts(opts)
        @on = opts[:on]
        @keep_left, @keep_right = extract_left_right(opts[:how])
        @indicator = opts[:indicator]
      end

      # Vector order of the result: left-only vectors, join keys, right-only
      # vectors, then the indicator vector when one was requested.
      def dataframe_vector_names
        left_keys.values + on + right_keys.values + Array(indicator)
      end

      # @raise [ArgumentError] when how is not a key of LEFT_RIGHT_COMBINATIONS
      def extract_left_right(how)
        LEFT_RIGHT_COMBINATIONS.fetch(how) do
          raise ArgumentError, "Unrecognized join option: #{how}"
        end
      end

      # Replaces nil key values with NilSorter instances.
      def sanitize_merge_keys(merge_keys)
        merge_keys.map { |v| v.nil? ? NilSorter.new : v }
      end

      # Converts a dataframe into an array of { vector name => value } hashes,
      # one per row.
      def df_to_a(df)
        # FIXME: much faster than "native" DataFrame#to_a. Should not be.
        h = df.to_h
        keys = h.keys
        h.values.map(&:to_a).transpose.map { |r| keys.zip(r).to_h }
      end

      # Builds the renaming maps for the non-key vectors of both frames. A
      # name present in both frames gets a _1 / _2 suffix.
      #
      # @return [Array<Hash>] [left renamings, right renamings]
      def merge_keys(df1, df2, on)
        duplicates =
          (df1.vectors.to_a + df2.vectors.to_a - on)
          .group_by(&:itself)
          .select { |_, g| g.count == 2 }.map(&:first)

        [
          guard_keys(df1.vectors.to_a - on, duplicates, 1),
          guard_keys(df2.vectors.to_a - on, duplicates, 2)
        ]
      end

      def guard_keys(keys, duplicates, num)
        keys.to_h { |v| [v, guard_duplicate(v, duplicates, num)] }
      end

      def guard_duplicate(val, duplicates, num)
        duplicates.include?(val) ? :"#{val}_#{num}" : val
      end

      # Produces the next output row (or nil when the consumed row is dropped)
      # from the current first keys of both sides.
      def row(lkey, rkey)
        # :nocov:
        # It's just an impossibility handler, can't be covered :)
        raise 'Unexpected condition met during merge' if !lkey && !rkey

        # :nocov:
        if lkey == rkey
          self.merge_key = lkey
          add_indicator(merge_matching_rows, :both)
        elsif !rkey || lt(lkey, rkey)
          add_indicator(left_row_missing_right, :left_only)
        else # !lkey || lt(rkey, lkey)
          add_indicator(right_row_missing_left, :right_only)
        end
      end

      # Stores indicator_value in the row when an indicator vector is in use.
      def add_indicator(row, indicator_value)
        return row unless indicator

        row[indicator] = indicator_value
        row
      end

      # Merges the rows at the current merge_key, handling the one-to-one,
      # one-to-many and many-to-many cases.
      def merge_matching_rows
        if one_to_one_merge?
          merge_rows(one_to_one_left_row, one_to_one_right_row)
        elsif one_to_many_merge?
          result = merge_rows(left.first, right.first)
          one_to_many_shift
          result
        else
          result = cartesian_product.shift
          end_cartesian_product if cartesian_product.empty?
          result
        end
      end

      # Advances one side after a one-to-many merge step. Both conditions are
      # evaluated before either side is shifted.
      def one_to_many_shift
        shift_left = first_right_key != next_right_key
        shift_right = first_left_key != next_left_key
        one_to_one_left_row if shift_left
        one_to_one_right_row if shift_right
      end

      def one_to_one_merge?
        merge_key != next_left_key && merge_key != next_right_key
      end

      def one_to_many_merge?
        !(merge_key == next_left_key && merge_key == next_right_key)
      end

      # Consumes and returns the first left row (and drops its key).
      def one_to_one_left_row
        left_key_values.shift
        left.shift
      end

      # Consumes and returns the first right row (and drops its key).
      def one_to_one_right_row
        right_key_values.shift
        right.shift
      end

      # Consumes the first left row; it is only emitted when keep_left is set.
      def left_row_missing_right
        val = one_to_one_left_row
        expand_row(val, left_keys) if keep_left
      end

      # Consumes the first right row; it is only emitted when keep_right is set.
      def right_row_missing_left
        val = one_to_one_right_row
        expand_row(val, right_keys) if keep_right
      end

      def lt(k1, k2)
        (k1 <=> k2) == -1
      end

      # Output row for a matched pair; key values are taken from the left row.
      def merge_rows(lrow, rrow)
        left_keys
          .to_h { |from, to| [to, lrow[from]] }
          .merge(on.to_h { |col| [col, lrow[col]] })
          .merge(indicator ? { indicator => nil } : {})
          .merge(right_keys.to_h { |from, to| [to, rrow[from]] })
      end

      # Output row for an unmatched row of one side.
      def expand_row(row, renamings)
        renamings
          .to_h { |from, to| [to, row[from]] }
          .merge(on.to_h { |col| [col, row[col]] })
          .merge(indicator ? { indicator => nil } : {})
      end

      # nil once the right side is exhausted.
      def first_right_key
        right_key_values.first
      end

      def next_right_key
        right_key_values[1]
      end

      # nil once the left side is exhausted.
      def first_left_key
        left_key_values.first
      end

      def next_left_key
        left_key_values[1]
      end

      def left_rows_at_merge_key
        left.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
      end

      def right_rows_at_merge_key
        right.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
      end

      # Memoized list of every left/right pairing at the current merge_key.
      def cartesian_product
        @cartesian_product ||= left_rows_at_merge_key.product(right_rows_at_merge_key).map do |left_row, right_row|
          merge_rows(left_row, right_row)
        end
      end

      # Drops all rows at the current merge_key from both sides and clears the
      # memoized product.
      def end_cartesian_product
        left_size = left_rows_at_merge_key.size
        left_key_values.shift(left_size)
        left.shift(left_size)

        right_size = right_rows_at_merge_key.size
        right_key_values.shift(right_size)
        right.shift(right_size)
        @cartesian_product = nil
      end

      # @raise [ArgumentError] when a join vector is missing from either frame
      def validate_on!(left_df, right_df)
        on.each do |field|
          next if left_df.has_vector?(field) && right_df.has_vector?(field)

          raise ArgumentError, "Both dataframes expected to have #{field.inspect} field"
        end
      end

      # Element-wise comparison of two key arrays in which nil sorts before
      # any other value; returns the first non-zero comparison, or 0.
      def safe_compare(left_array, right_array)
        left_array.zip(right_array).map do |l, r|
          next 0 if l.nil? && r.nil?
          next 1 if r.nil?
          next -1 if l.nil?

          l <=> r
        end.reject(&:zero?).first || 0
      end
    end

    # Entry point used to join two dataframes.
    module Merge
      class << self
        # @see MergeFrame#initialize for the accepted options
        def join(df1, df2, opts = {})
          MergeFrame.new(df1, df2, opts).join
        end
      end
    end
  end
end
|