daru_lite 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/workflows/ci.yml +33 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/.rubocop.yml +27 -0
- data/.rubocop_todo.yml +137 -0
- data/CONTRIBUTING.md +47 -0
- data/Gemfile +2 -0
- data/History.md +4 -0
- data/LICENSE +24 -0
- data/README.md +218 -0
- data/Rakefile +69 -0
- data/ReleasePolicy.md +20 -0
- data/benchmarks/TradeoffData.csv +65 -0
- data/benchmarks/csv_reading.rb +22 -0
- data/benchmarks/dataframe_creation.rb +39 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +32 -0
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/row_access.rb +41 -0
- data/benchmarks/row_assign.rb +36 -0
- data/benchmarks/sorting.rb +51 -0
- data/benchmarks/statistics.rb +28 -0
- data/benchmarks/vector_access.rb +31 -0
- data/benchmarks/vector_assign.rb +42 -0
- data/benchmarks/where_clause.rb +48 -0
- data/benchmarks/where_vs_filter.rb +28 -0
- data/daru_lite.gemspec +55 -0
- data/images/README.md +5 -0
- data/images/con0.png +0 -0
- data/images/con1.png +0 -0
- data/images/init0.png +0 -0
- data/images/init1.png +0 -0
- data/images/man0.png +0 -0
- data/images/man1.png +0 -0
- data/images/man2.png +0 -0
- data/images/man3.png +0 -0
- data/images/man4.png +0 -0
- data/images/man5.png +0 -0
- data/images/man6.png +0 -0
- data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
- data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
- data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
- data/lib/daru_lite/category.rb +929 -0
- data/lib/daru_lite/configuration.rb +34 -0
- data/lib/daru_lite/core/group_by.rb +403 -0
- data/lib/daru_lite/core/merge.rb +270 -0
- data/lib/daru_lite/core/query.rb +109 -0
- data/lib/daru_lite/dataframe.rb +3080 -0
- data/lib/daru_lite/date_time/index.rb +569 -0
- data/lib/daru_lite/date_time/offsets.rb +397 -0
- data/lib/daru_lite/exceptions.rb +2 -0
- data/lib/daru_lite/extensions/which_dsl.rb +53 -0
- data/lib/daru_lite/formatters/table.rb +52 -0
- data/lib/daru_lite/helpers/array.rb +53 -0
- data/lib/daru_lite/index/categorical_index.rb +201 -0
- data/lib/daru_lite/index/index.rb +374 -0
- data/lib/daru_lite/index/multi_index.rb +374 -0
- data/lib/daru_lite/io/csv/converters.rb +21 -0
- data/lib/daru_lite/io/io.rb +294 -0
- data/lib/daru_lite/io/sql_data_source.rb +97 -0
- data/lib/daru_lite/iruby/helpers.rb +38 -0
- data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
- data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
- data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
- data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
- data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
- data/lib/daru_lite/monkeys.rb +56 -0
- data/lib/daru_lite/vector.rb +1678 -0
- data/lib/daru_lite/version.rb +3 -0
- data/lib/daru_lite.rb +99 -0
- data/profile/_base.rb +23 -0
- data/profile/df_to_a.rb +10 -0
- data/profile/filter.rb +13 -0
- data/profile/joining.rb +13 -0
- data/profile/sorting.rb +12 -0
- data/profile/vector_each_with_index.rb +9 -0
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/array_wrapper_spec.rb +3 -0
- data/spec/category_spec.rb +1741 -0
- data/spec/core/group_by_spec.rb +655 -0
- data/spec/core/merge_spec.rb +179 -0
- data/spec/core/query_spec.rb +347 -0
- data/spec/daru_lite_spec.rb +22 -0
- data/spec/dataframe_spec.rb +4330 -0
- data/spec/date_time/data_spec.rb +197 -0
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +588 -0
- data/spec/date_time/offsets_spec.rb +465 -0
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/bank2.dat +200 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/countries.json +7794 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empties.dat +2 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +100 -0
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/music_data.tsv +2501 -0
- data/spec/fixtures/repeated_fields.csv +7 -0
- data/spec/fixtures/sales-funnel.csv +18 -0
- data/spec/fixtures/scientific_notation.csv +4 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/strings.dat +2 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +137 -0
- data/spec/helpers_spec.rb +8 -0
- data/spec/index/categorical_index_spec.rb +170 -0
- data/spec/index/index_spec.rb +417 -0
- data/spec/index/multi_index_spec.rb +680 -0
- data/spec/io/io_spec.rb +373 -0
- data/spec/io/sql_data_source_spec.rb +56 -0
- data/spec/iruby/dataframe_spec.rb +170 -0
- data/spec/iruby/helpers_spec.rb +49 -0
- data/spec/iruby/multi_index_spec.rb +37 -0
- data/spec/iruby/vector_spec.rb +105 -0
- data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
- data/spec/maths/arithmetic/vector_spec.rb +165 -0
- data/spec/maths/statistics/dataframe_spec.rb +178 -0
- data/spec/maths/statistics/vector_spec.rb +756 -0
- data/spec/monkeys_spec.rb +42 -0
- data/spec/shared/vector_display_spec.rb +213 -0
- data/spec/spec_helper.rb +87 -0
- data/spec/support/database_helper.rb +30 -0
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +2293 -0
- metadata +571 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module DaruLite
  # Display options for DaruLite, exposed as accessors on whatever object
  # extends this module (DaruLite itself does so below).
  module Configuration
    # Names of the configurable options; each one gets a reader and a writer.
    # :max_rows is the "Jupyter" option, :spacing the "Terminal" one.
    INSPECT_OPTIONS_KEYS = %i[max_rows spacing].freeze

    # Jupyter
    DEFAULT_MAX_ROWS = 30

    # Terminal
    DEFAULT_SPACING = 10

    INSPECT_OPTIONS_KEYS.each { |option| attr_accessor option }

    # Load the defaults into any object that extends this module.
    def self.extended(base)
      base.reset_options
    end

    # Yields the configured object so options can be assigned in a block.
    def configure
      yield(self)
    end

    # Restores every option to its default value.
    def reset_options
      self.max_rows = DEFAULT_MAX_ROWS
      self.spacing = DEFAULT_SPACING
    end
  end

  extend Configuration
end
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
module DaruLite
|
|
2
|
+
module Core
|
|
3
|
+
class GroupBy
|
|
4
|
+
class << self
|
|
5
|
+
extend Gem::Deprecate
|
|
6
|
+
|
|
7
|
+
# @private
|
|
8
|
+
# Collects positions under the key they were paired with.
# @param indexes_with_positions [#each] yields (key, position) pairs
# @param sort [Boolean] when true, the keys are ordered with TUPLE_SORTER
# @return [Hash] key => Array of positions, in encounter order
def group_by_index_to_positions(indexes_with_positions, sort: false)
  grouped = indexes_with_positions.each_with_object({}) do |(key, position), acc|
    acc[key] ||= []
    acc[key] << position
  end
  return grouped unless sort # TODO: maybe add a more "stable" sorting option?

  ordered_keys = grouped.keys.sort(&DaruLite::Core::GroupBy::TUPLE_SORTER)
  ordered_keys.to_h { |key| [key, grouped[key]] }
end
|
|
22
|
+
alias get_positions_group_map_on group_by_index_to_positions
|
|
23
|
+
deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10
|
|
24
|
+
|
|
25
|
+
# @private
|
|
26
|
+
# Groups the positions of +multi_index+ by the tuples that remain once the
# layer +level+ has been removed from a copy of it.
# @param multi_index [DaruLite::MultiIndex]
# @param level [Integer] layer handed to MultiIndex#remove_layer
# @raise [RuntimeError] when +multi_index+ is not a DaruLite::MultiIndex
def get_positions_group_for_aggregation(multi_index, level = -1)
  # A bare `raise` here produced "unhandled exception" (or re-raised an
  # unrelated pending exception); say what is actually wrong instead.
  unless multi_index.is_a?(DaruLite::MultiIndex)
    raise "Expected a DaruLite::MultiIndex, got #{multi_index.class}"
  end

  new_index = multi_index.dup
  new_index.remove_layer(level) # TODO: recheck code of DaruLite::MultiIndex#remove_layer

  group_by_index_to_positions(new_index.each_with_index)
end
|
|
34
|
+
|
|
35
|
+
# @private
|
|
36
|
+
# Builds the group map (key tuple => row positions) for +df+, using the
# values of the +group_by_keys+ vectors in each row as that row's key.
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
  key_tuples = df[*group_by_keys].to_df.each_row.map(&:to_a)

  group_by_index_to_positions(key_tuples.each_with_index, sort: sort)
end
|
|
41
|
+
|
|
42
|
+
# @private
|
|
43
|
+
# Translates a group map of positions into a group map holding, for every
# position, whatever +index.at(position)+ returns.
def group_map_from_positions_to_indexes(positions_group_map, index)
  positions_group_map.each_with_object({}) do |(group, positions), by_index|
    by_index[group] = positions.map { |position| index.at(position) }
  end
end
|
|
46
|
+
|
|
47
|
+
# @private
|
|
48
|
+
# Builds the grouped dataframe: the +remaining_vectors+ of +df+, with rows
# reordered group by group and indexed by (group key..., member) tuples.
# @return [DaruLite::DataFrame, nil] nil when +group_map+ is empty
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
  return if group_map == {}

  tuples = group_map.flat_map do |group, members|
    members.map { |member| group + [member] }
  end
  grouped_index = DaruLite::MultiIndex.from_tuples(tuples)

  return DaruLite::DataFrame.new({}, index: grouped_index) if remaining_vectors == []

  rows_order = group_map.values.flatten
  df[*remaining_vectors]
    .to_df
    .get_sub_dataframe(rows_order, by_position: from_position)
    .tap { |grouped| grouped.index = grouped_index }
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
|
|
65
|
+
attr_reader :group_vectors, :non_group_vectors
|
|
66
|
+
|
|
67
|
+
# Memoized group map keyed by group tuple; the values are derived from the
# stored row positions through @context.index.
def groups
  return @groups if @groups

  @groups = GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
end
|
|
71
|
+
alias groups_by_idx groups
|
|
72
|
+
|
|
73
|
+
# Memoized grouped dataframe built from the non-group vectors.
def df
  return @df if @df

  @df = GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
end
|
|
77
|
+
alias grouped_df df
|
|
78
|
+
|
|
79
|
+
# Yields the result of get_group for every group key, in key order.
# Returns an Enumerator when called without a block.
def each_group
  return to_enum(:each_group) unless block_given?

  groups.each_key { |group_key| yield(get_group(group_key)) }
end
|
|
88
|
+
|
|
89
|
+
# Comparator for group-key tuples. A missing right operand sorts the left
# first and vice versa; otherwise nils are dropped from both tuples, tuples
# of equal length compare element-wise (incomparable => 0) and tuples of
# different length compare by length.
TUPLE_SORTER = lambda do |left, right|
  if !right
    -1
  elsif !left
    1
  else
    lhs = left.compact
    rhs = right.compact

    if lhs.length == rhs.length
      (lhs <=> rhs) || 0
    else
      lhs.length <=> rhs.length
    end
  end
end
|
|
99
|
+
|
|
100
|
+
# @param context the dataframe being grouped
# @param names [Array] names of the vectors to group by
def initialize(context, names)
  @context = context # TODO: maybe rename in @original_df
  @group_vectors = names
  @non_group_vectors = context.vectors.to_a - names

  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
  # never sorts:
  #
  #   ['test', 'me', 'please'].group_by(&:size)
  #   # => {4=>["test"], 2=>["me"], 6=>["please"]}
  #
  # - zverok, 2016-09-12
  @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
end
|
|
115
|
+
|
|
116
|
+
# Builds a DaruLite::Vector named :size holding the number of rows in each
# group, indexed by the grouped index.
def size
  grouped_index = get_grouped_index
  rows_per_group = @groups_by_pos.each_value.map(&:size)

  DaruLite::Vector.new(rows_per_group, index: grouped_index, name: :size)
end
|
|
123
|
+
|
|
124
|
+
# Get the first row of every group (same as head(1)).
def first
  select_groups_from(:first, 1)
end
|
|
128
|
+
|
|
129
|
+
# Get the last row of every group (same as tail(1)).
def last
  select_groups_from(:last, 1)
end
|
|
133
|
+
|
|
134
|
+
# Get the first 'n' rows of each group
|
|
135
|
+
# @param quantity [Integer] (5) The number of rows to take from each group.
|
|
136
|
+
# @example Usage of head
|
|
137
|
+
# df = DaruLite::DataFrame.new({
|
|
138
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
139
|
+
# b: %w{one one two three two two one three},
|
|
140
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
141
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
142
|
+
# })
|
|
143
|
+
# df.group_by([:a, :b]).head(1)
|
|
144
|
+
# # =>
|
|
145
|
+
# # #<DaruLite::DataFrame:82745170 @name = d7003f75-5eb9-4967-9303-c08dd9160224 @size = 6>
|
|
146
|
+
# # a b c d
|
|
147
|
+
# # 1 bar one 2 22
|
|
148
|
+
# # 3 bar three 1 44
|
|
149
|
+
# # 5 bar two 6 66
|
|
150
|
+
# # 0 foo one 1 11
|
|
151
|
+
# # 7 foo three 8 88
|
|
152
|
+
# # 2 foo two 3 33
|
|
153
|
+
def head(quantity = 5)
  # Sends :first with +quantity+ to the member list of every group.
  select_groups_from(:first, quantity)
end
|
|
156
|
+
|
|
157
|
+
# Get the last 'n' rows of each group
|
|
158
|
+
# @param quantity [Integer] (5) The number of rows to take from each group.
|
|
159
|
+
# @example Usage of tail
|
|
160
|
+
# df = DaruLite::DataFrame.new({
|
|
161
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
162
|
+
# b: %w{one one two three two two one three},
|
|
163
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
164
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
165
|
+
# })
|
|
166
|
+
# df.group_by([:a, :b]).tail(1)
|
|
167
|
+
# # =>
|
|
168
|
+
# # #<DaruLite::DataFrame:82378270 @name = 0623db46-5425-41bd-a843-99baac3d1d9a @size = 6>
|
|
169
|
+
# # a b c d
|
|
170
|
+
# # 1 bar one 2 22
|
|
171
|
+
# # 3 bar three 1 44
|
|
172
|
+
# # 5 bar two 6 66
|
|
173
|
+
# # 6 foo one 3 77
|
|
174
|
+
# # 7 foo three 8 88
|
|
175
|
+
# # 4 foo two 3 55
|
|
176
|
+
def tail(quantity = 5)
  # Sends :last with +quantity+ to the member list of every group.
  select_groups_from(:last, quantity)
end
|
|
179
|
+
|
|
180
|
+
# Calculate mean of numeric groups, excluding missing values.
|
|
181
|
+
# @example Usage of mean
|
|
182
|
+
# df = DaruLite::DataFrame.new({
|
|
183
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
184
|
+
# b: %w{one one two three two two one three},
|
|
185
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
186
|
+
#   d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
# })
|
|
187
|
+
# df.group_by([:a, :b]).mean
|
|
188
|
+
# # =>
|
|
189
|
+
# # #<DaruLite::DataFrame:81097450 @name = 0c32983f-3e06-451f-a9c9-051cadfe7371 @size = 6>
|
|
190
|
+
# # c d
|
|
191
|
+
# # ["bar", "one"] 2 22
|
|
192
|
+
# # ["bar", "three"] 1 44
|
|
193
|
+
# # ["bar", "two"] 6 66
|
|
194
|
+
# # ["foo", "one"] 2.0 44.0
|
|
195
|
+
# # ["foo", "three"] 8 88
|
|
196
|
+
# # ["foo", "two"] 3.0 44.0
|
|
197
|
+
def mean
  # Aggregates every numeric non-group vector with :mean.
  apply_method(:numeric, :mean)
end
|
|
200
|
+
|
|
201
|
+
# Median of each numeric non-group vector, per group (aggregated with :median).
def median
  apply_method(:numeric, :median)
end
|
|
205
|
+
|
|
206
|
+
# Sum of each numeric non-group vector, per group (aggregated with :sum).
def sum
  apply_method(:numeric, :sum)
end
|
|
210
|
+
|
|
211
|
+
# Count the rows in each group; the same per-group size is reported for every non-group vector.
|
|
212
|
+
# @example Using count
|
|
213
|
+
# df = DaruLite::DataFrame.new({
|
|
214
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
215
|
+
# b: %w{one one two three two two one three},
|
|
216
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
217
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
218
|
+
# })
|
|
219
|
+
# df.group_by([:a, :b]).count
|
|
220
|
+
# # =>
|
|
221
|
+
# # #<DaruLite::DataFrame:76900210 @name = 7b9cf55d-17f8-48c7-b03a-2586c6e5ec5a @size = 6>
|
|
222
|
+
# # c d
|
|
223
|
+
# # ["bar", "one"] 1 1
|
|
224
|
+
# # ["bar", "two"] 1 1
|
|
225
|
+
# # ["bar", "three"] 1 1
|
|
226
|
+
# # ["foo", "one"] 2 2
|
|
227
|
+
# # ["foo", "three"] 1 1
|
|
228
|
+
# # ["foo", "two"] 2 2
|
|
229
|
+
def count
  # One copy of the per-group size vector for each non-group vector.
  column_total = @non_group_vectors.size
  size_columns = Array.new(column_total, size)

  DaruLite::DataFrame.new(size_columns, order: @non_group_vectors)
end
|
|
233
|
+
|
|
234
|
+
# Standard deviation of each numeric non-group vector, per group
# (aggregated with :std).
def std
  apply_method(:numeric, :std)
end
|
|
239
|
+
|
|
240
|
+
# Maximum of each numeric non-group vector, per group (aggregated with :max).
def max
  apply_method(:numeric, :max)
end
|
|
244
|
+
|
|
245
|
+
# Minimum of each numeric non-group vector, per group (aggregated with :min).
def min
  apply_method(:numeric, :min)
end
|
|
249
|
+
|
|
250
|
+
# Returns one of the selected groups as a DataFrame.
|
|
251
|
+
# @param group [Array] The group that is to be selected from those grouped.
|
|
252
|
+
#
|
|
253
|
+
# @example Getting a group
|
|
254
|
+
#
|
|
255
|
+
# df = DaruLite::DataFrame.new({
|
|
256
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
|
257
|
+
# b: %w{one one two three two two one three},
|
|
258
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
|
259
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
|
260
|
+
# })
|
|
261
|
+
# df.group_by([:a, :b]).get_group ['bar','two']
|
|
262
|
+
# #=>
|
|
263
|
+
# ##<DaruLite::DataFrame:83258980 @name = 687ee3f6-8874-4899-97fa-9b31d84fa1d5 @size = 1>
|
|
264
|
+
# # a b c d
|
|
265
|
+
# # 5 bar two 6 66
|
|
266
|
+
def get_group(group)
  # Rows are fetched by POSITION. The previous code subscripted the
  # transposed data with the labels from groups_by_idx, which only works
  # when the index happens to be 0..n-1.
  positions = @groups_by_pos[group]
  labels = groups_by_idx[group]
  columns = @context.each_vector.map(&:to_a)

  # Build only the rows that belong to the group instead of transposing
  # the whole dataframe.
  rows = positions.map { |position| columns.map { |column| column[position] } }

  DaruLite::DataFrame.rows(
    rows, index: labels, order: @context.vectors
  )
end
|
|
276
|
+
|
|
277
|
+
# Iteratively applies a function to the values in a group and accumulates the result.
|
|
278
|
+
# @param init (nil) The initial value of the accumulator.
|
|
279
|
+
# @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
|
|
280
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
|
281
|
+
# @example Usage of reduce
|
|
282
|
+
# df = DaruLite::DataFrame.new({
|
|
283
|
+
# a: ['a','b'] * 3,
|
|
284
|
+
# b: [1,2,3] * 2,
|
|
285
|
+
# c: 'A'..'F'
|
|
286
|
+
# })
|
|
287
|
+
# df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
|
|
288
|
+
# # =>
|
|
289
|
+
# # #<DaruLite::Vector:70343147159900 @name = nil @size = 2 >
|
|
290
|
+
# # nil
|
|
291
|
+
# # a ACE
|
|
292
|
+
# # b BDF
|
|
293
|
+
def reduce(init = nil)
  # groups_by_idx already holds index labels, so they are handed to
  # @context.row[] as-is. The previous code looked each label up again in
  # @context.index.to_a as if it were a position, which breaks for any
  # non-default index and rebuilt the index array for every row.
  result_hash = groups_by_idx.each_with_object({}) do |(group, labels), h|
    h[group] = labels.inject(init) do |accumulated, label|
      yield(accumulated, @context.row[label])
    end
  end

  index = get_grouped_index(result_hash.keys)

  DaruLite::Vector.new(result_hash.values, index: index)
end
|
|
309
|
+
|
|
310
|
+
def inspect
  # Delegates to the grouped dataframe's own representation.
  frame = grouped_df
  frame.inspect
end
|
|
313
|
+
|
|
314
|
+
# Function to use for aggregating the data.
|
|
315
|
+
# `group_by` is using DaruLite::DataFrame#aggregate
|
|
316
|
+
#
|
|
317
|
+
# @param options [Hash] the columns wanted in the resulting dataframe, each mapped to the aggregation to apply to it
|
|
318
|
+
#
|
|
319
|
+
# @return [DaruLite::DataFrame]
|
|
320
|
+
#
|
|
321
|
+
# @example
|
|
322
|
+
#
|
|
323
|
+
# df = DaruLite::DataFrame.new(
|
|
324
|
+
# name: ['Ram','Krishna','Ram','Krishna','Krishna'],
|
|
325
|
+
# visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
|
|
326
|
+
#
|
|
327
|
+
# => #<DaruLite::DataFrame(5x2)>
|
|
328
|
+
# name visited
|
|
329
|
+
# 0 Ram Hyderabad
|
|
330
|
+
# 1 Krishna Delhi
|
|
331
|
+
# 2 Ram Mumbai
|
|
332
|
+
# 3 Krishna Raipur
|
|
333
|
+
# 4 Krishna Banglore
|
|
334
|
+
#
|
|
335
|
+
# df.group_by(:name)
|
|
336
|
+
# => #<DaruLite::DataFrame(5x1)>
|
|
337
|
+
# visited
|
|
338
|
+
# Krishna 1 Delhi
|
|
339
|
+
# 3 Raipur
|
|
340
|
+
# 4 Banglore
|
|
341
|
+
# Ram 0 Hyderabad
|
|
342
|
+
# 2 Mumbai
|
|
343
|
+
#
|
|
344
|
+
# df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
|
|
345
|
+
# => #<DaruLite::DataFrame(2x1)>
|
|
346
|
+
# visited
|
|
347
|
+
# Krishna Delhi,Raipur,Banglore
|
|
348
|
+
# Ram Hyderabad,Mumbai
|
|
349
|
+
#
|
|
350
|
+
def aggregate(options = {})
  grouped_index = get_grouped_index

  # The block hands @context.aggregate the row positions of every group
  # together with the index to use for the result.
  @context.aggregate(options) do
    [@groups_by_pos.values, grouped_index]
  end
end
|
|
355
|
+
|
|
356
|
+
private
|
|
357
|
+
|
|
358
|
+
# Picks entries from every group by sending +method+ (:first / :last) with
# +quantity+ to the group's member list, then builds a dataframe from the
# matching rows of @context.
def select_groups_from(method, quantity)
  chosen = groups_by_idx.each_value.flat_map do |members|
    members.send(method, quantity)
  end
  rows = chosen.map { |member| @context.row[member].to_a }

  DaruLite::DataFrame.rows(rows, order: @context.vectors, index: chosen.flatten)
end
|
|
373
|
+
|
|
374
|
+
# Names of the non-group vectors whose type in @context is :numeric.
def select_numeric_non_group_vectors
  @non_group_vectors.reject do |vector_name|
    @context[vector_name].type != :numeric
  end
end
|
|
377
|
+
|
|
378
|
+
# Aggregates every numeric non-group vector with +method+.
# Only the :numeric +method_type+ is implemented; anything else raises.
def apply_method(method_type, method)
  raise 'To implement' unless method_type == :numeric

  numeric_columns = select_numeric_non_group_vectors

  aggregate(numeric_columns.to_h { |column| [column, method] })
end
|
|
385
|
+
|
|
386
|
+
# Index for per-group results: a MultiIndex when grouping on several
# vectors, otherwise a plain Index over the flattened keys.
# @param index_tuples [Array, nil] defaults to the stored group keys
def get_grouped_index(index_tuples = nil)
  tuples = index_tuples.nil? ? @groups_by_pos.keys : index_tuples
  return DaruLite::MultiIndex.from_tuples(tuples) if multi_indexed_grouping?

  DaruLite::Index.new(tuples.flatten)
end
|
|
395
|
+
|
|
396
|
+
# True when the first group key has more than one element; false when
# there is no (truthy) first key.
def multi_indexed_grouping?
  first_key = @groups_by_pos.keys[0]

  first_key ? first_key.size > 1 : false
end
|
|
401
|
+
end
|
|
402
|
+
end
|
|
403
|
+
end
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
module DaruLite
|
|
2
|
+
module Core
|
|
3
|
+
class MergeFrame
|
|
4
|
+
# Stand-in for a nil merge-key value: it reports itself as nil, is never
# == to anything, compares as 0 against nil-like objects and as -1
# against everything else.
class NilSorter
  include Comparable

  def <=>(other)
    return 0 if other.nil?

    -1
  end

  def ==(_other)
    false
  end

  def nil?
    true
  end
end
|
|
19
|
+
|
|
20
|
+
# quick-fix for issue #171
|
|
21
|
+
# Prepares both dataframes for a merge: each becomes an array of row
# hashes sorted on the :on columns (nils first), alongside a parallel array
# of sanitized key tuples.
def initialize(left_df, right_df, opts = {})
  init_opts(opts)
  validate_on!(left_df, right_df)

  by_merge_key = ->(a, b) { safe_compare(a.values_at(*on), b.values_at(*on)) }
  sanitized_key = ->(row_hash) { sanitize_merge_keys(row_hash.values_at(*on)) }

  @left = df_to_a(left_df).sort(&by_merge_key)
  @left_key_values = @left.map(&sanitized_key)

  @right = df_to_a(right_df).sort(&by_merge_key)
  @right_key_values = @right.map(&sanitized_key)

  @left_keys, @right_keys = merge_keys(left_df, right_df, on)
end
|
|
36
|
+
|
|
37
|
+
# Consumes both row lists in key order and collects every row produced
# by #row (nil results are dropped) into a new dataframe.
def join
  merged_rows = []

  while !left.empty? || !right.empty?
    merged = row(first_left_key, first_right_key)
    merged_rows << merged if merged
  end

  DaruLite::DataFrame.new(merged_rows, order: dataframe_vector_names)
end
|
|
49
|
+
|
|
50
|
+
private
|
|
51
|
+
|
|
52
|
+
attr_reader :on, :indicator,
|
|
53
|
+
:left, :left_key_values, :keep_left, :left_keys,
|
|
54
|
+
:right, :right_key_values, :keep_right, :right_keys
|
|
55
|
+
|
|
56
|
+
attr_accessor :merge_key
|
|
57
|
+
|
|
58
|
+
LEFT_RIGHT_COMBINATIONS = {
|
|
59
|
+
# left right
|
|
60
|
+
inner: [false, false],
|
|
61
|
+
left: [true, false],
|
|
62
|
+
right: [false, true],
|
|
63
|
+
outer: [true, true]
|
|
64
|
+
}.freeze
|
|
65
|
+
|
|
66
|
+
# Reads :on, :how and :indicator from +opts+ into instance state.
def init_opts(opts)
  on_columns, how, indicator_name = opts.values_at(:on, :how, :indicator)

  @on = on_columns
  @keep_left, @keep_right = extract_left_right(how)
  @indicator = indicator_name
end
|
|
71
|
+
|
|
72
|
+
# Column order of the merged dataframe: renamed left columns, the :on
# columns, renamed right columns, then the indicator column (if any).
def dataframe_vector_names
  names = left_keys.values
  names.concat(on, right_keys.values, Array(indicator))
end
|
|
75
|
+
|
|
76
|
+
# Looks up the [keep_left, keep_right] pair for a join type.
# @raise [ArgumentError] for a join type missing from LEFT_RIGHT_COMBINATIONS
def extract_left_right(how)
  combination = LEFT_RIGHT_COMBINATIONS[how]
  return combination if combination

  raise ArgumentError, "Unrecognized join option: #{how}"
end
|
|
80
|
+
|
|
81
|
+
# Replaces every value that answers true to nil? with a fresh NilSorter.
def sanitize_merge_keys(merge_keys)
  merge_keys.map do |value|
    if value.nil?
      NilSorter.new
    else
      value
    end
  end
end
|
|
84
|
+
|
|
85
|
+
# Converts +df+ (anything whose #to_h maps column name => column values)
# into an array with one { column name => value } hash per row.
def df_to_a(df)
  # FIXME: much faster than "native" DataFrame#to_a. Should not be
  columns = df.to_h
  names = columns.keys
  rows = columns.each_value.map(&:to_a).transpose

  rows.map { |values| names.zip(values).to_h }
end
|
|
91
|
+
|
|
92
|
+
# Works out how the non-key columns of each dataframe are named in the
# result: a name that occurs exactly twice across both sides gets a
# _1 / _2 suffix.
# @return [Array(Hash, Hash)] original name => output name, per side
def merge_keys(df1, df2, on)
  left_names = df1.vectors.to_a - on
  right_names = df2.vectors.to_a - on

  duplicates =
    (left_names + right_names)
    .group_by(&:itself)
    .select { |_, occurrences| occurrences.size == 2 }
    .keys

  [
    guard_keys(left_names, duplicates, 1),
    guard_keys(right_names, duplicates, 2)
  ]
end
|
|
103
|
+
|
|
104
|
+
# Maps every key to its output name (suffixed when it is a duplicate).
def guard_keys(keys, duplicates, num)
  keys.each_with_object({}) do |key, renamed|
    renamed[key] = guard_duplicate(key, duplicates, num)
  end
end
|
|
107
|
+
|
|
108
|
+
# Returns +val+ unchanged unless it is listed in +duplicates+, in which
# case the symbol "<val>_<num>" is returned.
def guard_duplicate(val, duplicates, num)
  return val unless duplicates.include?(val)

  "#{val}_#{num}".to_sym
end
|
|
111
|
+
|
|
112
|
+
# Produces the next output row for the current heads of both sides:
# equal keys are merged, otherwise the side with the smaller key (or the
# only side left) is emitted on its own. May return nil.
def row(lkey, rkey)
  # :nocov:
  # It's just an impossibility handler, can't be covered :)
  raise 'Unexpected condition met during merge' unless lkey || rkey

  # :nocov:
  if lkey == rkey
    self.merge_key = lkey
    return add_indicator(merge_matching_rows, :both)
  end

  # Left goes first when the right side is exhausted or holds a larger key.
  return add_indicator(left_row_missing_right, :left_only) if !rkey || lt(lkey, rkey)

  # Remaining case: !lkey || lt(rkey, lkey)
  add_indicator(right_row_missing_left, :right_only)
end
|
|
127
|
+
|
|
128
|
+
# Stamps +row+ with the origin marker (:both / :left_only / :right_only)
# under the indicator column, when an indicator was requested.
# @param row [Hash, nil] nil when the row was dropped by the join type
# @return [Hash, nil] the same row object, or nil if +row+ was nil
def add_indicator(row, indicator_value)
  # A dropped row (e.g. an unmatched row in an inner join) arrives as nil;
  # pass it through instead of failing on nil#[]=.
  return row unless indicator && row

  row[indicator] = indicator_value
  row
end
|
|
134
|
+
|
|
135
|
+
# Emits one merged row for the current merge key. Three cases: the key
# occurs once on each side, it repeats on one side only, or it repeats
# on both sides (rows then come from the memoized cartesian product).
def merge_matching_rows
  return merge_rows(one_to_one_left_row, one_to_one_right_row) if one_to_one_merge?

  if one_to_many_merge?
    merge_rows(left.first, right.first).tap { one_to_many_shift }
  else
    cartesian_product.shift.tap do
      end_cartesian_product if cartesian_product.empty?
    end
  end
end
|
|
148
|
+
|
|
149
|
+
# Advances the sides after a one-to-many match. Both decisions are taken
# before anything is shifted: the left side moves on when the right key
# changes next, the right side when the left key changes next.
def one_to_many_shift
  right_key_changes = first_right_key != next_right_key
  left_key_changes = first_left_key != next_left_key

  one_to_one_left_row if right_key_changes
  one_to_one_right_row if left_key_changes
end
|
|
155
|
+
|
|
156
|
+
# True when the merge key is repeated by neither side's next key.
def one_to_one_merge?
  !(merge_key == next_left_key || merge_key == next_right_key)
end
|
|
159
|
+
|
|
160
|
+
# True unless the merge key is repeated by the next key on BOTH sides.
def one_to_many_merge?
  merge_key != next_left_key || merge_key != next_right_key
end
|
|
163
|
+
|
|
164
|
+
# Removes and returns the first left row.
def one_to_one_left_row
  left_key_values.shift # drop its key too, so keys and rows stay aligned
  left.shift
end
|
|
168
|
+
|
|
169
|
+
# Removes and returns the first right row.
def one_to_one_right_row
  right_key_values.shift # drop its key too, so keys and rows stay aligned
  right.shift
end
|
|
173
|
+
|
|
174
|
+
# Consumes the first left row; returns it expanded to the output columns
# when unmatched left rows are kept, nil otherwise.
def left_row_missing_right
  unmatched = one_to_one_left_row
  return unless keep_left

  expand_row(unmatched, left_keys)
end
|
|
178
|
+
|
|
179
|
+
# Consumes the first right row; returns it expanded to the output columns
# when unmatched right rows are kept, nil otherwise.
def right_row_missing_left
  unmatched = one_to_one_right_row
  return unless keep_right

  expand_row(unmatched, right_keys)
end
|
|
183
|
+
|
|
184
|
+
# True only when k1 <=> k2 is -1 (so false for equal, greater and
# incomparable values).
def lt(k1, k2)
  comparison = k1 <=> k2
  comparison == -1
end
|
|
187
|
+
|
|
188
|
+
# Combines a left and a right row into one output row. Keys are inserted
# in this order: renamed left columns, :on columns (values from the left
# row), the indicator placeholder (nil), renamed right columns.
def merge_rows(lrow, rrow)
  merged = {}
  left_keys.each { |from, to| merged[to] = lrow[from] }
  on.each { |col| merged[col] = lrow[col] }
  merged[indicator] = nil if indicator
  right_keys.each { |from, to| merged[to] = rrow[from] }
  merged
end
|
|
195
|
+
|
|
196
|
+
# Turns a one-sided row into an output row: renamed columns first, then
# the :on columns, then the indicator placeholder (nil) when requested.
def expand_row(row, renamings)
  expanded = {}
  renamings.each { |from, to| expanded[to] = row[from] }
  on.each { |col| expanded[col] = row[col] }
  expanded[indicator] = nil if indicator
  expanded
end
|
|
202
|
+
|
|
203
|
+
# Key of the first remaining right row, or nil when none are left.
def first_right_key
  right_key_values.first unless right_key_values.empty?
end
|
|
206
|
+
|
|
207
|
+
# Key of the second remaining right row, or nil when there is none.
def next_right_key
  right_key_values.at(1)
end
|
|
210
|
+
|
|
211
|
+
# Key of the first remaining left row, or nil when none are left.
def first_left_key
  left_key_values.first unless left_key_values.empty?
end
|
|
214
|
+
|
|
215
|
+
# Key of the second remaining left row, or nil when there is none.
def next_left_key
  left_key_values.at(1)
end
|
|
218
|
+
|
|
219
|
+
# Leading run of left rows whose sanitized :on values equal the merge key.
def left_rows_at_merge_key
  left.take_while do |candidate|
    candidate_key = sanitize_merge_keys(candidate.values_at(*on))
    candidate_key == merge_key
  end
end
|
|
222
|
+
|
|
223
|
+
# Leading run of right rows whose sanitized :on values equal the merge key.
def right_rows_at_merge_key
  right.take_while do |candidate|
    candidate_key = sanitize_merge_keys(candidate.values_at(*on))
    candidate_key == merge_key
  end
end
|
|
226
|
+
|
|
227
|
+
# Memoized list of merged rows for every (left, right) pairing of the
# rows sharing the current merge key; cleared by end_cartesian_product.
def cartesian_product
  return @cartesian_product if @cartesian_product

  pairings = left_rows_at_merge_key.product(right_rows_at_merge_key)
  @cartesian_product = pairings.map { |left_row, right_row| merge_rows(left_row, right_row) }
end
|
|
232
|
+
|
|
233
|
+
# Drops, from both sides, the rows (and their keys) that took part in the
# finished cartesian product, then clears the memoized product.
def end_cartesian_product
  left_run = left_rows_at_merge_key.size
  right_run = right_rows_at_merge_key.size

  [left_key_values, left].each { |list| list.shift(left_run) }
  [right_key_values, right].each { |list| list.shift(right_run) }

  @cartesian_product = nil
end
|
|
243
|
+
|
|
244
|
+
# Ensures every :on column exists in both dataframes.
# @raise [ArgumentError] naming the first column missing from either side
def validate_on!(left_df, right_df)
  @on.each do |field|
    next if left_df.has_vector?(field) && right_df.has_vector?(field)

    raise ArgumentError, "Both dataframes expected to have #{field.inspect} field"
  end
end
|
|
250
|
+
|
|
251
|
+
# Lexicographic, nil-tolerant comparison of two key tuples: nil sorts
# before any other value and two nils are equal.
# @return [Integer] -1, 0 or 1, taken from the first pair that differs
# @raise [ArgumentError] when the deciding pair cannot be compared
def safe_compare(left_array, right_array)
  left_array.zip(right_array).each do |l, r|
    next if l.nil? && r.nil?
    return 1 if r.nil?
    return -1 if l.nil?

    order = l <=> r
    # Previously a nil result blew up later as NoMethodError on nil.zero?.
    raise ArgumentError, "comparison of #{l.class} with #{r.class} failed" if order.nil?

    # Stop at the first difference; later pairs cannot change the outcome.
    return order unless order.zero?
  end

  0
end
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
# Entry point for dataframe joins.
module Merge
  # Joins +df1+ and +df2+ by building a MergeFrame with +opts+ and
  # returning the result of its #join.
  def self.join(df1, df2, opts = {})
    frame = MergeFrame.new(df1, df2, opts)
    frame.join
  end
end
|
|
269
|
+
end
|
|
270
|
+
end
|