carray-dataframe 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ #############################################################
2
+ #
3
+ # GROUPING
4
+ #
5
+ #############################################################
6
+
7
class CADataFrame

  # Builds a grouping object over this data frame.
  #
  # names - one or more grouping specifications; they are handed to the
  #         group class unchanged.
  #
  # Returns a CADataFrameGroup when exactly one specification is given and
  # a CADataFrameGroupMulti (receiving every specification) otherwise.
  def group_by(*names)
    return CADataFrameGroupMulti.new(self, *names) unless names.size == 1

    CADataFrameGroup.new(self, names.first)
  end
end
17
+
18
# Groups the rows of a data frame by the values of a single column.
#
# For every key the constructor stores `@column.eq(key).where` in @addrs
# (presumably the addresses of the rows whose column value equals the key --
# confirm against CArray). All other methods index the data frame, or one of
# its columns, with those stored values.
class CADataFrameGroup

  include Enumerable

  # dataframe - the data frame to group; must respond to #col.
  # name      - the grouping column name, or a one-entry Hash
  #             { name => keys } that supplies the group keys explicitly
  #             instead of deriving them from the column.
  def initialize (dataframe, name)
    @dataframe = dataframe
    case name
    when Hash
      # Only the first entry of the Hash is used.
      name, list = name.first
      @column = @dataframe.col(name)
      @keys = list.to_ca
    else
      @column = @dataframe.col(name)
      @keys = @column.uniq.sort
    end
    if @column.is_a?(CATimeIndex)
      @keys = CATimeIndex.from_index_array(@keys, @column.timestep)
    end
    @name = name.to_s
    @addrs = {}
    @keys.each do |k|
      @addrs[k] = @column.eq(k).where
    end
  end

  # Evaluates the block against each group via `@dataframe[...].execute`
  # and assembles the returned hashes into a new data frame.
  #
  # Each hash returned by #execute contributes one value per key to the
  # column named by that key; the grouping column itself holds the keys.
  #
  # Returns a CADataFrame.
  def table (&block)
    hashpool = []
    @keys.each do |k|
      hashpool << @dataframe[@addrs[k]].execute(&block)
    end
    columns = { @name => @keys }
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    return CADataFrame.new(columns)
  end

  # Aggregates every non-grouping column once per group.
  #
  # label   - name (String or Symbol) of the method sent to each group's
  #           slice of a column; used only when no block is given.
  # columns - optional collection of column names; when given, columns not
  #           included in it are skipped.
  # block   - optional; receives (column_name, slice) and returns the
  #           aggregated value.
  #
  # A cell whose aggregation raises a StandardError is left as UNDEF, so a
  # method that does not apply to one column does not abort the others.
  #
  # Returns a CADataFrame holding the keys plus one column per aggregate.
  # Raises ArgumentError when neither a label nor a block is supplied.
  def calculate (label = nil, columns: nil, &block)
    # Without this guard the failure would be swallowed by the per-cell
    # rescue below and the caller would silently get an all-UNDEF frame.
    unless label || block
      raise ArgumentError, "calculate requires a method label or a block"
    end
    new_columns = { @name => @keys }
    @dataframe.each_column do |name, clmn|
      next if name == @name
      next if columns && !columns.include?(name)
      new_columns[name] = CArray.object(@keys.size) { UNDEF }
      @keys.each_with_index do |k, i|
        begin
          if block
            new_columns[name][i] = yield(name, clmn[@addrs[k]])
          else
            # to_sym accepts both String and Symbol labels.
            new_columns[name][i] = clmn[@addrs[k]].send(label.to_sym)
          end
        rescue
          # Deliberate best effort: keep UNDEF for this cell.
        end
      end
    end
    return CADataFrame.new(new_columns)
  end

  # Returns the rows of the group identified by group_value, or
  # `@dataframe.vacant_copy` when there is no such group.
  #
  # For a CATimeIndex grouping column a String group_value is first
  # converted with `timestep.index_at`.
  def [] (group_value)
    if @column.is_a?(CATimeIndex) and group_value.is_a?(String)
      group_value = @column.timestep.index_at(group_value)
    end
    map = @addrs[group_value]
    return map ? @dataframe[map] : @dataframe.vacant_copy
  end

  # Yields the rows of each group, in key order.
  def each
    @addrs.each do |key, map|
      yield @dataframe[map]
    end
  end

  # Yields (rows, key) for each group. Note that the second value is the
  # group key, not a running index; for a CATimeIndex grouping column it
  # is `timestep.time_at(key)`.
  def each_with_index
    if @column.is_a?(CATimeIndex)
      ts = @column.timestep
      @addrs.each do |key, map|
        yield @dataframe[map], ts.time_at(key)
      end
    else
      @addrs.each do |key, map|
        yield @dataframe[map], key
      end
    end
  end
end
108
# Groups the rows of a data frame by the combined values of several
# columns.
#
# A group is identified by a key list (one value per grouping column, in
# the order the columns were given). @addrs maps every combination of the
# per-column keys to `flag.where`, where flag is the element-wise AND of
# the per-column `eq` results (presumably the addresses of the matching
# rows -- confirm against CArray).
class CADataFrameGroupMulti

  # dataframe - the data frame to group; must respond to #col.
  # names     - grouping specifications: a column name, or a one-entry Hash
  #             { name => keys } supplying that column's keys explicitly.
  def initialize (dataframe, *names)
    @rank = names.size
    @dataframe = dataframe
    @names = []
    @column = []
    @keys = []
    names.each_with_index do |name, i|
      case name
      when Hash
        # Only the first entry of the Hash is used.
        name, list = name.first
        @column[i] = @dataframe.col(name)
        @keys[i] = list.to_ca
      else
        @column[i] = @dataframe.col(name)
        @keys[i] = @column[i].to_ca.uniq.sort
      end
      @names[i] = name
    end
    @addrs = {}
    each_with_keys do |list|
      flag = @column[0].eq(list[0])
      (1...@rank).each do |i|
        flag &= @column[i].eq(list[i])
      end
      @addrs[list] = flag.where
    end
  end

  # Iterates over every combination of the per-column keys (the cartesian
  # product, first column varying slowest). Each combination is an Array.
  #
  # Returns an Enumerator when called without a block.
  def each_with_keys (&block)
    @keys[0].to_a.product(*@keys[1..-1].map(&:to_a)).each(&block)
  end

  # Evaluates the block against each group via `@dataframe[...].execute`
  # and assembles the returned hashes into a new data frame that also has
  # one column per grouping name holding the key values.
  #
  # Returns a CADataFrame.
  def table (&block)
    key_lists = each_with_keys.to_a
    hashpool = key_lists.map do |list|
      @dataframe[@addrs[list]].execute(&block)
    end
    columns = key_columns(key_lists)
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    return CADataFrame.new(columns)
  end

  # Aggregates every non-grouping column once per key combination.
  #
  # label - name (String or Symbol) of the method sent to each group's
  #         slice of a column; used only when no block is given.
  # block - optional; receives (column_name, slice) and returns the
  #         aggregated value.
  #
  # A cell whose aggregation raises a StandardError is left as UNDEF.
  #
  # Returns a CADataFrame with one column per grouping name plus one
  # column per aggregate.
  # Raises ArgumentError when neither a label nor a block is supplied.
  def calculate (label = nil, &block)
    unless label || block
      raise ArgumentError, "calculate requires a method label or a block"
    end
    key_lists = each_with_keys.to_a
    grouping = @names.map(&:to_s)
    new_columns = key_columns(key_lists)
    @dataframe.each_column do |name, clmn|
      # The grouping columns are already present as key columns.
      next if grouping.include?(name.to_s)
      new_columns[name] = CArray.object(key_lists.size) { UNDEF }
      key_lists.each_with_index do |list, i|
        begin
          if block
            new_columns[name][i] = yield(name, clmn[@addrs[list]])
          else
            new_columns[name][i] = clmn[@addrs[list]].send(label.to_sym)
          end
        rescue
          # Deliberate best effort: keep UNDEF for this cell.
        end
      end
    end
    return CADataFrame.new(new_columns)
  end

  # Returns the rows of the group identified by the key list group_value,
  # or `@dataframe.vacant_copy` when there is no such group.
  def [] (group_value)
    map = @addrs[group_value]
    return map ? @dataframe[map] : @dataframe.vacant_copy
  end

  # Yields (key_list, rows) for every key combination.
  def each
    each_with_keys do |key|
      yield key, @dataframe[@addrs[key]]
    end
  end

  private

  # Builds { grouping name => Array of that column's key per combination }.
  def key_columns (key_lists)
    columns = {}
    @names.each_with_index do |name, i|
      columns[name] = key_lists.map { |list| list[i] }
    end
    columns
  end
end
@@ -0,0 +1,62 @@
1
class CADataFrame

  # Accessor that reads and writes cells of a data frame through its
  # @column_data, addressed by a row index and a column selector.
  class ILocAccessor

    def initialize(dataframe)
      @dataframe = dataframe
    end

    # argv[0] - row index applied to every selected column.
    # argv[1] - column selector, resolved by the data frame's
    #           select_columns.
    #
    # Returns a new CADataFrame made of the indexed columns; its index is
    # the indexed @row_index when the source frame has one, else nil.
    def [](*argv)
      @dataframe.instance_eval do
        rows = argv.first
        picked = {}
        select_columns(argv[1]).each do |name|
          picked[name] = @column_data[name][rows]
        end
        new_index = @row_index ? @row_index[rows] : nil
        CADataFrame.new(picked, index: new_index)
      end
    end

    # Assigns into the addressed cells. The last argument is the value:
    #
    #   Hash             {"a" => [1,2,3], ...}       per-column values
    #   Array of Hash    [{"a"=>1,"b"=>11}, ...]     one record per row
    #   Array of Array   [[1,11],[2,12],...]         one row per entry
    #   anything else                                stored in every column
    #
    # Hash keys are compared after conversion with to_s.
    def []=(*argv)
      value = argv.pop
      @dataframe.instance_eval do
        rows = argv.first
        targets = select_columns(argv[1])
        if value.is_a?(Hash)
          by_name = value.map { |k, v| [k.to_s, v] }.to_h
          targets.each { |name| @column_data[name][rows] = by_name[name] }
        elsif value.is_a?(Array) && value.first.is_a?(Hash)
          gathered = {}
          targets.each { |name| gathered[name] = [] }
          value.each do |record|
            record = record.map { |k, v| [k.to_s, v] }.to_h
            targets.each { |name| gathered[name] << record[name] }
          end
          targets.each { |name| @column_data[name][rows] = gathered[name] }
        elsif value.is_a?(Array)
          # Rows become columns so that entry k belongs to selected column k.
          by_column = value.transpose
          targets.each_with_index do |name, k|
            @column_data[name][rows] = by_column[k]
          end
        else
          targets.each_with_index do |name, _k|
            @column_data[name][rows] = value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ ######################################
2
+ #
3
+ # IO methods
4
+ #
5
+ ######################################
6
+ require "spreadsheet"
7
class CArray

  # Writes this array to an Excel file through the spreadsheet library,
  # pushing `self[i, nil]` into worksheet row i for each i below dim0.
  #
  # filename - destination passed to Workbook#write.
  # block    - optional; called with the worksheet before the file is
  #            written.
  #
  # Raises RuntimeError when the rank is 3 or more.
  def save_excel(filename, &block)
    raise "too large rank (>2) to write excel file" if rank >= 3

    book = Spreadsheet::Workbook.new
    page = book.create_worksheet
    dim0.times { |i| page.row(i).push(*self[i, nil]) }
    block.call(page) if block
    book.write(filename)
  end

  # Reads one worksheet of an Excel file.
  #
  # filename - file opened with Spreadsheet.open.
  # sheet    - worksheet selector passed to Workbook#worksheet (default 0).
  #
  # Returns the worksheet rows converted with to_ca.
  def self.load_excel(filename, sheet = 0)
    rows = Spreadsheet.open(filename).worksheet(sheet).map(&:to_a)
    rows.to_ca
  end
end
30
class CADataFrame

  # Loads a data frame through CArray.load_sqlite3; all arguments are
  # forwarded unchanged.
  #
  # Returns the data frame after `mask name, nil` has been applied to
  # every column, or nil when the loader or the conversion yields nothing.
  def self.load_sqlite3 (*args)
    ca = CArray.load_sqlite3(*args)
    mask_nil_values(ca && ca.to_dataframe)
  end

  # Loads a data frame through CArray.load_csv.
  #
  # file, sep, rs, encoding and the block are forwarded to
  # CArray.load_csv; index is forwarded to to_dataframe.
  #
  # Returns the data frame after `mask name, nil` has been applied to
  # every column, or nil when the loader or the conversion yields nothing.
  def self.load_csv (file, sep: ",", rs: $/, encoding: nil, index: nil, &block)
    ca = CArray.load_csv(file, sep: sep, rs: rs, encoding: encoding, &block)
    mask_nil_values(ca && ca.to_dataframe(index: index))
  end

  # Builds a data frame through CArray.from_csv.
  #
  # file, sep, rs and the block are forwarded to CArray.from_csv; index is
  # forwarded to to_dataframe.
  #
  # Returns the data frame after `mask name, nil` has been applied to
  # every column, or nil when the loader or the conversion yields nothing.
  def self.from_csv (file, sep: ",", rs: $/, index: nil, &block)
    ca = CArray.from_csv(file, sep: sep, rs: rs, &block)
    mask_nil_values(ca && ca.to_dataframe(index: index))
  end

  # Shared tail of the loaders: runs `mask name, nil` for each column
  # inside the frame's #arrange block and returns what #arrange returns.
  # Returns nil when df is nil or false.
  def self.mask_nil_values (df)
    return nil unless df
    df.arrange {
      column_names.each do |name|
        mask name, nil
      end
    }
  end
  private_class_method :mask_nil_values

  # Converts the frame with to_ca and forwards all arguments to that
  # object's to_sqlite3.
  def to_sqlite3 (*args)
    self.to_ca.to_sqlite3(*args)
  end

  # Stores the frame in an in-memory SQLite database under tablename.
  #
  # When any column name contains '.', ' ' or '-', a copy of the frame is
  # stored in which each of those characters is replaced by '_'.
  #
  # Returns whatever to_sqlite3 returns.
  def to_sql (tablename)
    awkward = /[\. \-]/
    unless @column_names.any? { |s| s =~ awkward }
      return to_sqlite3(database: ":memory:", table: tablename)
    end
    columns = {}
    each_column_name do |name|
      columns[name.gsub(awkward, '_')] = column(name)
    end
    return CADataFrame.new(columns).to_sqlite3(database: ":memory:", table: tablename)
  end
end
89
module SQLite3
  class Database

    # Loads a data frame from this database.
    #
    # expr - forwarded, together with this database, to
    #        CADataFrame.load_sqlite3.
    #
    # Returns whatever CADataFrame.load_sqlite3 returns.
    def to_df(expr)
      CADataFrame.load_sqlite3(self, expr)
    end
  end
end
@@ -0,0 +1,283 @@
1
+ # Copyright (c) 2014, Sameer Deshmukh
2
+ # All rights reserved.
3
+ #
4
+ # Redistribution and use in source and binary forms, with or without
5
+ # modification, are permitted provided that the following conditions are met:
6
+ #
7
+ # * Redistributions of source code must retain the above copyright notice, this
8
+ # list of conditions and the following disclaimer.
9
+ #
10
+ # * Redistributions in binary form must reproduce the above copyright notice,
11
+ # this list of conditions and the following disclaimer in the documentation
12
+ # and/or other materials provided with the distribution.
13
+ #
14
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
class CADataFrame

  # Joins this data frame (left side) with other_df (right side).
  #
  # other_df - the right-hand data frame.
  # opts     - options Hash handed to CADataFrame::Merge.join unchanged.
  #
  # Returns the result of CADataFrame::Merge.join.
  def join(other_df, opts = {})
    merger = CADataFrame::Merge
    merger.join(self, other_df, opts)
  end
end
30
class CADataFrame
  # Sort-merge join of two data frames.
  #
  # Both inputs are converted to arrays of row hashes, sorted by the join
  # columns, and then consumed from the front: at each step the smaller
  # key is emitted as an unmatched row, or equal keys are merged.
  class MergeFrame
    # Stand-in for nil inside a merge key. It never equals anything (so nil
    # keys never match each other) and orders before every non-nil value.
    class NilSorter
      include Comparable

      def nil?
        true
      end

      def ==(_other)
        false
      end

      def <=>(other)
        other.nil? ? 0 : -1
      end
    end

    # left_df, right_df - objects responding to #to_h, #column_names and
    #                     #has_column?.
    # opts              - :on        column name or Array of names (required)
    #                     :how       :inner, :left, :right or :outer
    #                     :indicator optional name of a column recording
    #                                :both / :left_only / :right_only
    #
    # Raises ArgumentError for a missing :on, an unknown :how, or an :on
    # column absent from either frame.
    def initialize left_df, right_df, opts={} # rubocop:disable Metrics/AbcSize -- quick-fix for issue #171
      init_opts(opts)
      validate_on!(left_df, right_df)
      key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }

      @left = df_to_a(left_df)
      @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
      @left_key_values = @left.map(&key_sanitizer)

      @right = df_to_a(right_df)
      @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
      @right_key_values = @right.map(&key_sanitizer)

      @left_keys, @right_keys = merge_keys(left_df, right_df, on)
    end

    # Runs the merge, consuming the internal row lists.
    #
    # Returns a CADataFrame built from the merged row hashes, with the
    # column order: left columns, join columns, right columns, indicator.
    def join
      res = []
      until left.empty? && right.empty?
        lkey = first_left_key
        rkey = first_right_key
        # row returns nil for an unmatched row the join type does not keep.
        row(lkey, rkey).tap { |r| res << r if r }
      end
      CADataFrame.new(res, order: dataframe_vector_names)
    end

    private

    attr_reader :on, :indicator,
                :left, :left_key_values, :keep_left, :left_keys,
                :right, :right_key_values, :keep_right, :right_keys

    attr_accessor :merge_key

    # Which unmatched rows each join type keeps.
    LEFT_RIGHT_COMBINATIONS = {
      #         left   right
      inner:   [false, false],
      left:    [true,  false],
      right:   [false, true],
      outer:   [true,  true]
    }.freeze

    def init_opts(opts)
      raise ArgumentError, "Join requires the :on option" if opts[:on].nil?
      # Array() lets callers pass a single column name as well as a list.
      @on = Array(opts[:on]).map(&:to_s)
      @keep_left, @keep_right = extract_left_right(opts[:how])
      @indicator = opts[:indicator]
    end

    def dataframe_vector_names
      left_keys.values + on + right_keys.values + Array(indicator)
    end

    def extract_left_right(how)
      LEFT_RIGHT_COMBINATIONS[how] or
        raise ArgumentError, "Unrecognized join option: #{how}"
    end

    # Replaces nil key parts by NilSorter instances.
    def sanitize_merge_keys(merge_keys)
      merge_keys.map { |v| v.nil? ? NilSorter.new : v }
    end

    # Converts a frame to an Array of { column name => value } row hashes.
    def df_to_a df
      # FIXME: much faster than "native" DataFrame#to_a. Should not be
      h = df.to_h
      keys = h.keys
      h.values.map(&:to_a).transpose.map { |r| keys.zip(r).to_h }
    end

    # Returns [left renamings, right renamings]; each maps an original
    # non-key column name to its name in the result.
    def merge_keys(df1, df2, on)
      duplicates =
        (df1.column_names + df2.column_names - on)
        .group_by(&:itself)
        .select { |_, g| g.count == 2 }.map(&:first)

      [
        guard_keys(df1.column_names - on, duplicates, 1),
        guard_keys(df2.column_names - on, duplicates, 2)
      ]
    end

    def guard_keys keys, duplicates, num
      keys.map { |v| [v, guard_duplicate(v, duplicates, num)] }.to_h
    end

    # A non-key column present in both frames gets a side-specific suffix
    # ("_1" for the left frame, "_2" for the right). Without the number
    # both sides would map to the same name and the right value would
    # overwrite the left one in every merged row.
    def guard_duplicate val, duplicates, num
      duplicates.include?(val) ? "#{val}_#{num}" : val
    end

    # Produces the next output row (or nil) and advances the row lists.
    def row(lkey, rkey)
      case
      when !lkey && !rkey
        # :nocov:
        # It's just an impossibility handler, can't be covered :)
        raise 'Unexpected condition met during merge'
        # :nocov:
      when lkey == rkey
        self.merge_key = lkey
        add_indicator(merge_matching_rows, :both)
      when !rkey || lt(lkey, rkey)
        add_indicator(left_row_missing_right, :left_only)
      else # !lkey || lt(rkey, lkey)
        add_indicator(right_row_missing_left, :right_only)
      end
    end

    # Stamps the indicator column. A nil row (an unmatched row dropped by
    # the join type) is passed through untouched instead of being indexed.
    def add_indicator(row, indicator_value)
      return row unless indicator && row
      row[indicator] = indicator_value
      row
    end

    def merge_matching_rows
      if one_to_one_merge?
        merge_rows(one_to_one_left_row, one_to_one_right_row)
      elsif one_to_many_merge?
        result = merge_rows(left.first, right.first)
        one_to_many_shift
        result
      else
        # Many-to-many: emit the cached product one row per call.
        result = cartesian_product.shift
        end_cartesian_product if cartesian_product.empty?
        result
      end
    end

    # Advances the side whose opposite side has no further row with the
    # current key.
    def one_to_many_shift
      shift_left = first_right_key != next_right_key
      shift_right = first_left_key != next_left_key
      one_to_one_left_row if shift_left
      one_to_one_right_row if shift_right
    end

    def one_to_one_merge?
      merge_key != next_left_key && merge_key != next_right_key
    end

    def one_to_many_merge?
      !(merge_key == next_left_key && merge_key == next_right_key)
    end

    # Removes and returns the first left row (and drops its key).
    def one_to_one_left_row
      left_key_values.shift
      left.shift
    end

    # Removes and returns the first right row (and drops its key).
    def one_to_one_right_row
      right_key_values.shift
      right.shift
    end

    def left_row_missing_right
      val = one_to_one_left_row
      expand_row(val, left_keys) if keep_left
    end

    def right_row_missing_left
      val = one_to_one_right_row
      expand_row(val, right_keys) if keep_right
    end

    def lt(k1, k2)
      (k1 <=> k2) == -1
    end

    # Builds an output row from a matching left/right pair; join column
    # values are taken from the left row.
    def merge_rows lrow, rrow
      left_keys
        .map { |from, to| [to, lrow[from]] }.to_h
        .merge(on.map { |col| [col, lrow[col]] }.to_h)
        .merge(indicator ? {indicator => nil} : {})
        .merge(right_keys.map { |from, to| [to, rrow[from]] }.to_h)
    end

    # Builds an output row from an unmatched row of one side.
    def expand_row row, renamings
      renamings
        .map { |from, to| [to, row[from]] }.to_h
        .merge(on.map { |col| [col, row[col]] }.to_h)
        .merge(indicator ? {indicator => nil} : {})
    end

    def first_right_key
      right_key_values.empty? ? nil : right_key_values.first
    end

    def next_right_key
      right_key_values[1]
    end

    def first_left_key
      left_key_values.empty? ? nil : left_key_values.first
    end

    def next_left_key
      left_key_values[1]
    end

    def left_rows_at_merge_key
      left.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
    end

    def right_rows_at_merge_key
      right.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
    end

    # Merged rows for every left/right pairing at the current key; cached
    # until end_cartesian_product resets it.
    def cartesian_product
      @cartesian_product ||= left_rows_at_merge_key.product(right_rows_at_merge_key).map do |left_row, right_row|
        merge_rows(left_row, right_row)
      end
    end

    # Drops all rows at the current key from both sides.
    def end_cartesian_product
      left_size = left_rows_at_merge_key.size
      left_key_values.shift(left_size)
      left.shift(left_size)
      right_size = right_rows_at_merge_key.size
      right_key_values.shift(right_size)
      right.shift(right_size)
      @cartesian_product = nil
    end

    def validate_on!(left_df, right_df)
      @on.each do |name|
        left_df.has_column?(name) && right_df.has_column?(name) or
          raise ArgumentError, "Both dataframes expected to have #{name.inspect} field"
      end
    end

    # Element-wise comparison of two key arrays in which nil orders before
    # any other value. Returns the first non-zero comparison, or 0.
    def safe_compare(left_array, right_array)
      left_array.zip(right_array).map { |l, r|
        next 0 if l.nil? && r.nil?
        next 1 if r.nil?
        next -1 if l.nil?
        l <=> r
      }.reject(&:zero?).first || 0
    end
  end

  module Merge
    class << self

      # Joins df1 (left) with df2 (right); see MergeFrame#initialize for
      # the accepted options.
      def join df1, df2, opts={}
        MergeFrame.new(df1, df2, opts).join
      end
    end
  end
end