carray-dataframe 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/API.txt +1 -7
- data/README.md +3 -1
- data/Rakefile +11 -0
- data/carray-dataframe.gemspec +8 -6
- data/lib/carray-dataframe.rb +13 -0
- data/lib/carray-dataframe/arranger.rb +209 -0
- data/lib/carray-dataframe/cadf_array.rb +106 -0
- data/lib/carray-dataframe/converter.rb +97 -0
- data/lib/carray-dataframe/dataframe.rb +1279 -0
- data/lib/carray-dataframe/group.rb +199 -0
- data/lib/carray-dataframe/iloc_accessor.rb +62 -0
- data/lib/carray-dataframe/io.rb +96 -0
- data/lib/carray-dataframe/join.rb +283 -0
- data/lib/carray-dataframe/loc_accessor.rb +145 -0
- data/lib/carray-dataframe/pivot.rb +54 -0
- data/lib/carray-dataframe/reference.rb +142 -0
- data/lib/carray-dataframe/to_html.rb +102 -0
- metadata +23 -28
- data/examples/R/fit.rb +0 -24
- data/examples/R/iris.rb +0 -9
- data/examples/R/japan_area.rb +0 -30
- data/examples/R/kyaku.rb +0 -22
- data/examples/group_by.rb +0 -78
- data/examples/hist.rb +0 -27
- data/examples/iris.rb +0 -29
- data/examples/map.rb +0 -23
- data/examples/match.rb +0 -21
- data/examples/test.xlsx +0 -0
- data/examples/test1.rb +0 -44
- data/examples/test2.rb +0 -14
- data/examples/test3.db +0 -0
- data/examples/test3.rb +0 -11
- data/examples/test3.xlsx +0 -0
- data/examples/to_excel.rb +0 -27
- data/lib/R.rb +0 -365
- data/lib/carray/autoload/autoload_dataframe_dataframe.rb +0 -26
- data/lib/carray/dataframe/dataframe.rb +0 -1640
@@ -0,0 +1,199 @@
|
|
1
|
+
#############################################################
|
2
|
+
#
|
3
|
+
# GROUPING
|
4
|
+
#
|
5
|
+
#############################################################
|
6
|
+
|
7
|
+
class CADataFrame

  # Groups the rows of this dataframe by one or more columns.
  #
  # Exactly one name produces a CADataFrameGroup; any other number of names
  # is handed to CADataFrameGroupMulti. Names are passed through untouched,
  # so anything those constructors accept (for example a `{name => keys}`
  # Hash) can be given here.
  def group_by(*names)
    return CADataFrameGroupMulti.new(self, *names) unless names.size == 1
    CADataFrameGroup.new(self, names.first)
  end

end
|
17
|
+
|
18
|
+
# Grouping of a dataframe by the values of a single column.
#
# For every key the constructor stores the result of `column.eq(key).where`
# (presumably the addresses of the rows whose value equals that key); all
# other methods use those stored addresses to slice the dataframe or its
# columns.
class CADataFrameGroup

  include Enumerable

  # dataframe - the dataframe to group.
  # name      - a column name, or a one-entry Hash `{name => keys}` that
  #             fixes the list of keys instead of deriving it from the
  #             column's sorted unique values.
  def initialize(dataframe, name)
    @dataframe = dataframe
    if name.is_a?(Hash)
      name, list = name.first
      @column = @dataframe.col(name)
      @keys = list.to_ca
    else
      @column = @dataframe.col(name)
      @keys = @column.uniq.sort
    end
    # Time-indexed columns get their keys wrapped in a CATimeIndex again.
    @keys = CATimeIndex.from_index_array(@keys, @column.timestep) if @column.is_a?(CATimeIndex)
    @name = name.to_s
    @addrs = {}
    @keys.each { |key| @addrs[key] = @column.eq(key).where }
  end

  # Runs `execute(&block)` on each group's sub-dataframe and collects the
  # returned hashes into a new dataframe: one row per key, one column per
  # hash key, plus the key column itself.
  def table(&block)
    results = []
    @keys.each { |key| results << @dataframe[@addrs[key]].execute(&block) }
    columns = { @name => @keys }
    results.each_with_index do |hash, row|
      hash.each { |key, value| (columns[key] ||= [])[row] = value }
    end
    CADataFrame.new(columns)
  end

  # Reduces every non-key column once per group.
  #
  # label   - name of the method sent to each group's column slice when no
  #           block is given.
  # columns - optional list of column names; others are skipped.
  # block   - when given, called with (column name, column slice) instead.
  #
  # A reduction that raises is swallowed on purpose and leaves UNDEF in that
  # cell.
  def calculate(label = nil, columns: nil, &block)
    new_columns = { @name => @keys }
    @dataframe.each_column do |name, clmn|
      next if name == @name || (columns && !columns.include?(name))
      reduced = CArray.object(@keys.size) { UNDEF }
      new_columns[name] = reduced
      @keys.each_with_index do |key, i|
        begin
          slice = clmn[@addrs[key]]
          reduced[i] = block ? yield(name, slice) : slice.send(label.intern)
        rescue
          # best effort: the cell stays UNDEF
        end
      end
    end
    CADataFrame.new(new_columns)
  end

  # Returns the sub-dataframe for one key, or a vacant copy of the dataframe
  # when the key is unknown. On a CATimeIndex column a String key is first
  # converted with the column's timestep.
  def [](group_value)
    if @column.is_a?(CATimeIndex) && group_value.is_a?(String)
      group_value = @column.timestep.index_at(group_value)
    end
    map = @addrs[group_value]
    return @dataframe.vacant_copy unless map
    @dataframe[map]
  end

  # Yields each group's sub-dataframe in key order.
  def each
    @addrs.each_value { |map| yield @dataframe[map] }
  end

  # Yields (sub-dataframe, key) pairs. Note that the second value is the
  # group key, not a running counter; on a CATimeIndex column the key is
  # converted with the timestep's `time_at` first.
  def each_with_index
    if @column.is_a?(CATimeIndex)
      step = @column.timestep
      @addrs.each { |key, map| yield @dataframe[map], step.time_at(key) }
    else
      @addrs.each { |key, map| yield @dataframe[map], key }
    end
  end

end
|
108
|
+
# Grouping of a dataframe by the combined values of several columns.
#
# Groups are identified by a "key list": an Array holding one key per
# grouping column. Every combination of the per-column keys (their
# cartesian product) is a group, even when no row matches it.
class CADataFrameGroupMulti

  # dataframe - the dataframe to group.
  # names     - one entry per grouping column; each is a column name or a
  #             one-entry Hash `{name => keys}` fixing that column's keys
  #             instead of deriving them from its sorted unique values.
  def initialize(dataframe, *names)
    @rank = names.size
    @dataframe = dataframe
    @names = []
    @column = []
    @keys = []
    names.each_with_index do |name, i|
      case name
      when Hash
        name, list = name.first
        @column[i] = @dataframe.col(name)
        @keys[i] = list.to_ca
      else
        @column[i] = @dataframe.col(name)
        @keys[i] = @column[i].to_ca.uniq.sort
      end
      @names[i] = name
    end
    # For each key list, store `where` of the AND of the per-column equality
    # masks (presumably the addresses of the matching rows).
    @addrs = {}
    each_with_keys do |list|
      flag = @column[0].eq(list[0])
      (1...@rank).each do |i|
        flag &= @column[i].eq(list[i])
      end
      @addrs[list] = flag.where
    end
  end

  # Iterates over every key list (cartesian product of the per-column keys).
  # Without a block the underlying enumerator is returned.
  def each_with_keys(&block)
    @keys[0].to_a.product(*@keys[1..-1].map(&:to_a)).each(&block)
  end

  # Runs `execute(&block)` on each group's sub-dataframe and collects the
  # returned hashes into a new dataframe: one row per key list, one column
  # per grouping column plus one per hash key.
  def table(&block)
    combos = each_with_keys.to_a
    hashpool = combos.map { |list| @dataframe[@addrs[list]].execute(&block) }
    columns = {}
    @names.each_with_index do |name, i|
      columns[name] = combos.map { |list| list[i] }
    end
    hashpool.each_with_index do |hash, j|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][j] = value
      end
    end
    CADataFrame.new(columns)
  end

  # Reduces every non-grouping column once per key list.
  #
  # label - name of the method sent to each group's column slice when no
  #         block is given.
  # block - when given, called with (column name, column slice) instead.
  #
  # Returns a dataframe with one row per key list: the grouping columns
  # followed by the reduced columns. A reduction that raises is swallowed on
  # purpose and leaves UNDEF in that cell.
  #
  # Raises ArgumentError when neither a label nor a block is supplied.
  #
  # The previous implementation read an unset `@name` and looked groups up
  # by per-column key arrays rather than key lists, so no group was ever
  # found; it now walks the same key lists as #table.
  def calculate(label = nil, &block)
    if label.nil? && !block
      raise ArgumentError, "calculate requires a method label or a block"
    end
    combos = each_with_keys.to_a
    new_columns = {}
    @names.each_with_index do |name, i|
      new_columns[name] = combos.map { |list| list[i] }
    end
    group_names = @names.map(&:to_s)
    @dataframe.each_column do |name, clmn|
      next if group_names.include?(name.to_s)
      reduced = CArray.object(combos.size) { UNDEF }
      new_columns[name] = reduced
      combos.each_with_index do |list, j|
        begin
          slice = clmn[@addrs[list]]
          reduced[j] = block ? yield(name, slice) : slice.send(label.intern)
        rescue
          # best effort: the cell stays UNDEF
        end
      end
    end
    CADataFrame.new(new_columns)
  end

  # Returns the sub-dataframe for one key list (an Array with one key per
  # grouping column), or a vacant copy of the dataframe when it is unknown.
  def [](group_value)
    map = @addrs[group_value]
    return @dataframe.vacant_copy unless map
    @dataframe[map]
  end

  # Yields (key list, sub-dataframe) for every group. Unlike
  # CADataFrameGroup#each, the key comes first.
  def each
    each_with_keys do |key|
      yield key, @dataframe[@addrs[key]]
    end
  end

end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
class CADataFrame

  # Accessor object behind positional (`iloc`-style) indexing.
  #
  # Both operators take `(index, column_selector)`. They run inside the
  # wrapped dataframe via instance_eval so that they can use its
  # `select_columns` helper and its `@column_data` / `@row_index` state.
  class ILocAccessor

    def initialize(dataframe)
      @dataframe = dataframe
    end

    # Returns a new dataframe made of `column[index]` for every selected
    # column; the row index, when present, is sliced with the same index.
    def [](*argv)
      @dataframe.instance_eval {
        index = argv.first
        picked = {}
        select_columns(argv[1]).each do |name|
          picked[name] = @column_data[name][index]   ### df[...]
        end
        CADataFrame.new(picked, index: @row_index ? @row_index[index] : nil)
      }
    end

    # Assigns into `column[index]` for every selected column. The last
    # argument is the value, interpreted by its shape:
    #
    #   Hash            {"a"=>[1,2,3], ...}                    per-column values
    #   Array of Hash   [{"a"=>1,"b"=>11}, {"a"=>2,"b"=>12}]   one Hash per row
    #   Array of Array  [[1,11],[2,12],...]                    one Array per row
    #   anything else                                          same value everywhere
    #
    # Hash keys are matched after `to_s`.
    def []=(*argv)
      value = argv.pop
      @dataframe.instance_eval {
        index = argv.first
        column_selector = select_columns(argv[1])
        if value.is_a?(Hash)
          by_name = value.each_with_object({}) { |(k, v), h| h[k.to_s] = v }
          column_selector.each do |name|
            @column_data[name][index] = by_name[name]
          end
        elsif value.is_a?(Array)
          if value.first.is_a?(Hash)
            # Pivot the row hashes into one value list per selected column.
            table = {}
            column_selector.each { |name| table[name] = [] }
            value.each do |row|
              by_name = row.each_with_object({}) { |(k, v), h| h[k.to_s] = v }
              column_selector.each { |name| table[name] << by_name[name] }
            end
            column_selector.each do |name|
              @column_data[name][index] = table[name]
            end
          else
            # Rows -> columns; the k-th selected column takes the k-th list.
            by_position = value.transpose
            column_selector.each_with_index do |name, k|
              @column_data[name][index] = by_position[k]
            end
          end
        else
          column_selector.each do |name|
            @column_data[name][index] = value
          end
        end
      }
    end

  end

end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
######################################
|
2
|
+
#
|
3
|
+
# IO methods
|
4
|
+
#
|
5
|
+
######################################
|
6
|
+
require "spreadsheet"
|
7
|
+
class CArray

  # Writes this array to an Excel file through the spreadsheet gem, one
  # worksheet row per index along the first dimension.
  #
  # filename - destination handed to `Workbook#write`.
  # block    - optional; receives the filled worksheet before the file is
  #            written.
  #
  # Raises RuntimeError when the array has rank 3 or more.
  def save_excel(filename, &block)
    raise "too large rank (>2) to write excel file" if rank >= 3
    workbook = Spreadsheet::Workbook.new
    page = workbook.create_worksheet
    dim0.times do |row_no|
      page.row(row_no).push(*self[row_no, nil])
    end
    block.call(page) if block
    workbook.write(filename)
  end

  # Reads one worksheet of an Excel file into a CArray.
  #
  # filename - file opened with `Spreadsheet.open`.
  # sheet    - worksheet selector passed to `worksheet` (default 0).
  def self.load_excel(filename, sheet = 0)
    workbook = Spreadsheet.open(filename)
    workbook.worksheet(sheet).map(&:to_a).to_ca
  end

end
|
30
|
+
class CADataFrame

  # Loads a dataframe from SQLite3. All arguments are forwarded to
  # `CArray.load_sqlite3`. Returns nil when that loader returns nil.
  def self.load_sqlite3(*args)
    mask_nil_values(CArray.load_sqlite3(*args)&.to_dataframe)
  end

  # Loads a dataframe from a CSV file via `CArray.load_csv`.
  #
  # sep, rs, encoding and the block are forwarded to the CArray loader;
  # index is forwarded to `to_dataframe`. Returns nil when the loader
  # returns nil.
  def self.load_csv(file, sep: ",", rs: $/, encoding: nil, index: nil, &block)
    table = CArray.load_csv(file, sep: sep, rs: rs, encoding: encoding, &block)
    mask_nil_values(table&.to_dataframe(index: index))
  end

  # Same as load_csv, but goes through `CArray.from_csv` and takes no
  # encoding option.
  def self.from_csv(file, sep: ",", rs: $/, index: nil, &block)
    table = CArray.from_csv(file, sep: sep, rs: rs, &block)
    mask_nil_values(table&.to_dataframe(index: index))
  end

  # Shared post-processing of the loaders: calls `mask name, nil` for every
  # column inside an `arrange` block (presumably masking nil cells -- confirm
  # against the arranger). A missing dataframe is passed through as nil; the
  # loaders used to call `to_dataframe` before checking for nil, so their
  # nil branch could never be reached.
  def self.mask_nil_values(df)
    return nil unless df
    df.arrange {
      column_names.each do |name|
        mask name, nil
      end
    }
  end
  private_class_method :mask_nil_values

  # Stores the dataframe through `to_ca.to_sqlite3`; arguments are forwarded
  # unchanged.
  def to_sqlite3(*args)
    to_ca.to_sqlite3(*args)
  end

  # Writes the dataframe into table `tablename` of an in-memory SQLite3
  # database. Column names containing '.', ' ' or '-' have those characters
  # replaced by '_' first, via a temporary dataframe.
  def to_sql(tablename)
    unsafe = /[\. \-]/
    return to_sqlite3(database: ":memory:", table: tablename) unless @column_names.any? { |s| s =~ unsafe }
    columns = {}
    each_column_name do |name|
      columns[name.gsub(unsafe, '_')] = column(name)
    end
    CADataFrame.new(columns).to_sqlite3(database: ":memory:", table: tablename)
  end

end
|
89
|
+
module SQLite3

  class Database

    # Loads a dataframe from this database by passing the database and
    # `expr` to CADataFrame.load_sqlite3.
    def to_df(expr)
      CADataFrame.load_sqlite3(self, expr)
    end

  end

end
|
@@ -0,0 +1,283 @@
|
|
1
|
+
# Copyright (c) 2014, Sameer Deshmukh
|
2
|
+
# All rights reserved.
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or without
|
5
|
+
# modification, are permitted provided that the following conditions are met:
|
6
|
+
#
|
7
|
+
# * Redistributions of source code must retain the above copyright notice, this
|
8
|
+
# list of conditions and the following disclaimer.
|
9
|
+
#
|
10
|
+
# * Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
# this list of conditions and the following disclaimer in the documentation
|
12
|
+
# and/or other materials provided with the distribution.
|
13
|
+
#
|
14
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
15
|
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
16
|
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
17
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
18
|
+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
19
|
+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
20
|
+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
21
|
+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
22
|
+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
23
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
24
|
+
class CADataFrame

  # Joins this dataframe (left side) with other_df (right side).
  #
  # opts is handed to CADataFrame::Merge.join unchanged; the merge reads
  # :on (key columns), :how (:inner, :left, :right or :outer) and the
  # optional :indicator column name from it.
  def join(other_df, opts = {})
    Merge.join(self, other_df, opts)
  end

end
|
30
|
+
class CADataFrame

  # Sort-merge join of two dataframes.
  #
  # Both inputs are turned into arrays of row hashes and sorted by the join
  # keys; #join then repeatedly compares the first remaining key of each
  # side and emits a merged row, a one-sided row, or nothing, shifting rows
  # off the front of both arrays as it goes.
  class MergeFrame

    # Stand-in for a nil join key. It still answers `nil?` with true, but
    # never equals anything (not even another NilSorter), so rows with nil
    # keys never match each other.
    class NilSorter
      include Comparable

      def nil?
        true
      end

      def ==(_other)
        false
      end

      def <=>(other)
        other.nil? ? 0 : -1
      end
    end

    # Join type => [keep unmatched left rows, keep unmatched right rows].
    LEFT_RIGHT_COMBINATIONS = {
      #       left   right
      inner: [false, false],
      left:  [true,  false],
      right: [false, true],
      outer: [true,  true]
    }.freeze

    # left_df, right_df - dataframes answering `to_h`, `column_names` and
    #                     `has_column?`.
    # opts              - :on        key column name, or Array of names
    #                                (required)
    #                     :how       :inner, :left, :right or :outer
    #                     :indicator optional name of a column that records
    #                                :both / :left_only / :right_only
    #
    # Raises ArgumentError when :on is missing, :how is not recognised, or a
    # key column is absent from either dataframe.
    def initialize(left_df, right_df, opts = {}) # rubocop:disable Metrics/AbcSize
      init_opts(opts)
      validate_on!(left_df, right_df)
      key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }

      @left = df_to_a(left_df)
      @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
      @left_key_values = @left.map(&key_sanitizer)

      @right = df_to_a(right_df)
      @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
      @right_key_values = @right.map(&key_sanitizer)

      @left_keys, @right_keys = merge_keys(left_df, right_df, on)
    end

    # Performs the merge and returns the resulting CADataFrame. Consumes the
    # internal row arrays, so an instance can be joined only once.
    def join
      res = []
      until left.empty? && right.empty?
        lkey = first_left_key
        rkey = first_right_key
        # row returns nil for an unmatched row the join type discards.
        row(lkey, rkey).tap { |r| res << r if r }
      end
      CADataFrame.new(res, order: dataframe_vector_names)
    end

    private

    attr_reader :on, :indicator,
                :left, :left_key_values, :keep_left, :left_keys,
                :right, :right_key_values, :keep_right, :right_keys

    attr_accessor :merge_key

    # Reads the options. A single key is accepted as well as an Array; a
    # missing :on is rejected here instead of failing later on nil.
    def init_opts(opts)
      raise ArgumentError, "join requires the :on option" if opts[:on].nil?
      @on = Array(opts[:on]).map(&:to_s)
      @keep_left, @keep_right = extract_left_right(opts[:how])
      @indicator = opts[:indicator]
    end

    # Column order of the result: left columns, keys, right columns,
    # indicator (if any).
    def dataframe_vector_names
      left_keys.values + on + right_keys.values + Array(indicator)
    end

    def extract_left_right(how)
      LEFT_RIGHT_COMBINATIONS[how] or
        raise ArgumentError, "Unrecognized join option: #{how}"
    end

    # Replaces nil key components by NilSorter instances.
    def sanitize_merge_keys(merge_keys)
      merge_keys.map { |v| v.nil? ? NilSorter.new : v }
    end

    # Converts a dataframe into an Array of `{column name => value}` hashes.
    def df_to_a(df)
      # FIXME: much faster than "native" DataFrame#to_a. Should not be
      h = df.to_h
      keys = h.keys
      h.values.map(&:to_a).transpose.map { |r| keys.zip(r).to_h }
    end

    # Builds the `{source name => result name}` maps for the non-key columns
    # of both sides. Names present on both sides are disambiguated.
    def merge_keys(df1, df2, on)
      duplicates =
        (df1.column_names + df2.column_names - on)
        .group_by(&:itself)
        .select { |_, g| g.count == 2 }.map(&:first)

      [
        guard_keys(df1.column_names - on, duplicates, 1),
        guard_keys(df2.column_names - on, duplicates, 2)
      ]
    end

    def guard_keys(keys, duplicates, num)
      keys.map { |v| [v, guard_duplicate(v, duplicates, num)] }.to_h
    end

    # Suffixes a duplicated column name with the side number ("x_1" for the
    # left frame, "x_2" for the right). Without the number both sides got
    # the same name and the right value silently replaced the left one.
    def guard_duplicate(val, duplicates, num)
      duplicates.include?(val) ? "#{val}_#{num}" : val
    end

    # Produces the next output row (or nil) for the current front keys and
    # advances whichever side was consumed.
    def row(lkey, rkey)
      case
      when !lkey && !rkey
        # :nocov:
        # It's just an impossibility handler, can't be covered :)
        raise 'Unexpected condition met during merge'
        # :nocov:
      when lkey == rkey
        self.merge_key = lkey
        add_indicator(merge_matching_rows, :both)
      when !rkey || lt(lkey, rkey)
        add_indicator(left_row_missing_right, :left_only)
      else # !lkey || lt(rkey, lkey)
        add_indicator(right_row_missing_left, :right_only)
      end
    end

    # Fills in the indicator column. A nil row (unmatched row dropped by the
    # join type) is passed through untouched; assigning into it used to
    # raise NoMethodError whenever an indicator was requested.
    def add_indicator(row, indicator_value)
      return row unless indicator && row
      row[indicator] = indicator_value
      row
    end

    def merge_matching_rows
      if one_to_one_merge?
        merge_rows(one_to_one_left_row, one_to_one_right_row)
      elsif one_to_many_merge?
        result = merge_rows(left.first, right.first)
        one_to_many_shift
        result
      else
        # Many-to-many: emit the cartesian product one row per call and drop
        # the source rows once it is exhausted.
        result = cartesian_product.shift
        end_cartesian_product if cartesian_product.empty?
        result
      end
    end

    # Advances only the side whose *other* side has no further row with the
    # current key.
    def one_to_many_shift
      shift_left = first_right_key != next_right_key
      shift_right = first_left_key != next_left_key
      one_to_one_left_row if shift_left
      one_to_one_right_row if shift_right
    end

    def one_to_one_merge?
      merge_key != next_left_key && merge_key != next_right_key
    end

    def one_to_many_merge?
      !(merge_key == next_left_key && merge_key == next_right_key)
    end

    # Shifts and returns the first left row (its key entry is dropped too).
    def one_to_one_left_row
      left_key_values.shift
      left.shift
    end

    # Shifts and returns the first right row (its key entry is dropped too).
    def one_to_one_right_row
      right_key_values.shift
      right.shift
    end

    # Consumes an unmatched left row; returns it expanded only when
    # unmatched left rows are kept, otherwise nil.
    def left_row_missing_right
      val = one_to_one_left_row
      expand_row(val, left_keys) if keep_left
    end

    # Consumes an unmatched right row; returns it expanded only when
    # unmatched right rows are kept, otherwise nil.
    def right_row_missing_left
      val = one_to_one_right_row
      expand_row(val, right_keys) if keep_right
    end

    def lt(k1, k2)
      (k1 <=> k2) == -1
    end

    # Builds an output row from a matching pair; key values come from the
    # left row.
    def merge_rows(lrow, rrow)
      left_keys
        .map { |from, to| [to, lrow[from]] }.to_h
        .merge(on.map { |col| [col, lrow[col]] }.to_h)
        .merge(indicator ? {indicator => nil} : {})
        .merge(right_keys.map { |from, to| [to, rrow[from]] }.to_h)
    end

    # Builds an output row from a single-sided row using that side's
    # renaming map.
    def expand_row(row, renamings)
      renamings
        .map { |from, to| [to, row[from]] }.to_h
        .merge(on.map { |col| [col, row[col]] }.to_h)
        .merge(indicator ? {indicator => nil} : {})
    end

    def first_right_key
      right_key_values.empty? ? nil : right_key_values.first
    end

    def next_right_key
      right_key_values[1]
    end

    def first_left_key
      left_key_values.empty? ? nil : left_key_values.first
    end

    def next_left_key
      left_key_values[1]
    end

    def left_rows_at_merge_key
      left.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
    end

    def right_rows_at_merge_key
      right.take_while { |arr| sanitize_merge_keys(arr.values_at(*on)) == merge_key }
    end

    # Memoized list of merged rows for the current many-to-many key.
    def cartesian_product
      @cartesian_product ||= left_rows_at_merge_key.product(right_rows_at_merge_key).map do |left_row, right_row|
        merge_rows(left_row, right_row)
      end
    end

    # Drops all rows carrying the current key from both sides and resets the
    # memoized product.
    def end_cartesian_product
      left_size = left_rows_at_merge_key.size
      left_key_values.shift(left_size)
      left.shift(left_size)
      right_size = right_rows_at_merge_key.size
      right_key_values.shift(right_size)
      right.shift(right_size)
      @cartesian_product = nil
    end

    def validate_on!(left_df, right_df)
      @on.each do |key|
        next if left_df.has_column?(key) && right_df.has_column?(key)
        raise ArgumentError, "Both dataframes expected to have #{key.inspect} field"
      end
    end

    # Element-wise comparison of two key arrays in which nil sorts before
    # any other value; the first non-zero component decides.
    def safe_compare(left_array, right_array)
      left_array.zip(right_array).map { |l, r|
        next 0 if l.nil? && r.nil?
        next 1 if r.nil?
        next -1 if l.nil?
        l <=> r
      }.reject(&:zero?).first || 0
    end
  end

  # Entry point used by CADataFrame#join.
  module Merge
    class << self

      # Joins df1 (left) with df2 (right); see MergeFrame#initialize for
      # the recognised options.
      def join(df1, df2, opts = {})
        MergeFrame.new(df1, df2, opts).join
      end
    end
  end
end
|