red_amber 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +114 -39
- data/CHANGELOG.md +203 -31
- data/Gemfile +5 -2
- data/README.md +62 -29
- data/benchmark/basic.yml +86 -0
- data/benchmark/combine.yml +62 -0
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +39 -0
- data/benchmark/reshape.yml +31 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/benchmark/vector.yml +60 -0
- data/doc/DataFrame.md +335 -53
- data/doc/Vector.md +91 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +167 -51
- data/lib/red_amber/data_frame_combinable.rb +486 -0
- data/lib/red_amber/data_frame_displayable.rb +6 -4
- data/lib/red_amber/data_frame_indexable.rb +2 -2
- data/lib/red_amber/data_frame_loadsave.rb +4 -1
- data/lib/red_amber/data_frame_reshaping.rb +35 -10
- data/lib/red_amber/data_frame_selectable.rb +221 -116
- data/lib/red_amber/data_frame_variable_operation.rb +146 -82
- data/lib/red_amber/group.rb +108 -18
- data/lib/red_amber/helper.rb +53 -43
- data/lib/red_amber/refinements.rb +199 -0
- data/lib/red_amber/vector.rb +56 -46
- data/lib/red_amber/vector_functions.rb +23 -83
- data/lib/red_amber/vector_selectable.rb +116 -69
- data/lib/red_amber/vector_updatable.rb +189 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +3 -0
- data/red_amber.gemspec +4 -3
- metadata +24 -10
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,6 +5,7 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
10
11
|
include DataFrameLoadSave
|
@@ -13,87 +14,135 @@ module RedAmber
|
|
13
14
|
include DataFrameVariableOperation
|
14
15
|
include Helper
|
15
16
|
|
16
|
-
|
17
|
+
using RefineArrowTable
|
18
|
+
using RefineHash
|
19
|
+
|
20
|
+
# Quicker DataFrame construction from an `Arrow::Table`.
|
17
21
|
#
|
18
|
-
# @
|
22
|
+
# @param table [Arrow::Table] A table to have in the DataFrame.
|
23
|
+
# @return [DataFrame] Initialized DataFrame.
|
19
24
|
#
|
20
|
-
#
|
25
|
+
# @note This method will allocate table directly and may be used in the method.
|
26
|
+
# @note `table` must have unique keys.
|
27
|
+
def self.create(table)
|
28
|
+
instance = allocate
|
29
|
+
instance.instance_variable_set(:@table, table)
|
30
|
+
instance
|
31
|
+
end
|
32
|
+
|
33
|
+
# Creates a new DataFrame.
|
21
34
|
#
|
22
35
|
# @overload initialize(table)
|
36
|
+
# Initialize DataFrame by an `Arrow::Table`
|
37
|
+
#
|
38
|
+
# @param table [Arrow::Table]
|
39
|
+
# A table to have in the DataFrame.
|
40
|
+
#
|
41
|
+
# @overload initialize(arrowable)
|
42
|
+
# Initialize DataFrame by an object which responds to `#to_arrow`.
|
43
|
+
#
|
44
|
+
# @param arrowable [#to_arrow]
|
45
|
+
# Any object which responds to `#to_arrow`.
|
46
|
+
# `#to_arrow` must return `Arrow::Table`.
|
47
|
+
#
|
48
|
+
# @note `RedAmber::DataFrame` itself is readable by this.
|
49
|
+
# @note Hash is refined to respond to `#to_arrow` in this class.
|
50
|
+
#
|
51
|
+
# @overload initialize(rover_like)
|
52
|
+
# Initialize DataFrame by a `Rover::DataFrame`-like object which responds to `#to_h`.
|
53
|
+
#
|
54
|
+
# @param rover_like [#to_h]
|
55
|
+
# Any object which responds to `#to_h`.
|
56
|
+
# `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
|
23
57
|
#
|
24
|
-
# @
|
58
|
+
# @note `Rover::DataFrame` is readable by this.
|
25
59
|
#
|
26
|
-
# @overload initialize(
|
60
|
+
# @overload initialize()
|
61
|
+
# Create empty DataFrame
|
27
62
|
#
|
28
|
-
# @
|
63
|
+
# @example DataFrame.new
|
29
64
|
#
|
30
|
-
# @overload initialize(
|
65
|
+
# @overload initialize(empty)
|
66
|
+
# Create empty DataFrame
|
31
67
|
#
|
32
|
-
# @
|
68
|
+
# @param empty [nil, [], {}]
|
69
|
+
#
|
70
|
+
# @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
71
|
+
#
|
72
|
+
# @overload initialize(args)
|
73
|
+
#
|
74
|
+
# @param args [values]
|
75
|
+
# Accepts any arguments which are valid for `Arrow::Table.new(args)`. See
|
76
|
+
# {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb
|
33
77
|
#
|
34
78
|
def initialize(*args)
|
35
|
-
@variables = @keys = @vectors = @types = @data_types = nil
|
36
79
|
case args
|
37
80
|
in nil | [nil] | [] | {} | [[]] | [{}]
|
38
|
-
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
39
|
-
# returns empty DataFrame
|
40
81
|
@table = Arrow::Table.new({}, [])
|
41
|
-
in [
|
82
|
+
in [Arrow::Table => table]
|
83
|
+
@table = table
|
84
|
+
in [arrowable] if arrowable.respond_to?(:to_arrow)
|
42
85
|
table = arrowable.to_arrow
|
43
86
|
unless table.is_a?(Arrow::Table)
|
44
87
|
raise DataFrameTypeError,
|
45
88
|
"to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
|
46
89
|
end
|
47
90
|
@table = table
|
48
|
-
in [
|
49
|
-
@table = table
|
50
|
-
in [DataFrame => dataframe]
|
51
|
-
@table = dataframe.table
|
52
|
-
in [rover_or_hash]
|
91
|
+
in [rover_like] if rover_like.respond_to?(:to_h)
|
53
92
|
begin
|
54
|
-
# Accepts Rover::DataFrame
|
55
|
-
@table = Arrow::Table.new(
|
93
|
+
# Accepts Rover::DataFrame
|
94
|
+
@table = Arrow::Table.new(rover_like.to_h)
|
56
95
|
rescue StandardError
|
57
|
-
raise DataFrameTypeError, "
|
96
|
+
raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
|
58
97
|
end
|
59
98
|
else
|
60
|
-
|
99
|
+
begin
|
100
|
+
@table = Arrow::Table.new(*args)
|
101
|
+
rescue StandardError
|
102
|
+
raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
|
103
|
+
end
|
61
104
|
end
|
62
|
-
name_unnamed_keys
|
63
105
|
|
64
|
-
|
65
|
-
|
106
|
+
name_unnamed_keys
|
107
|
+
check_duplicate_keys(keys)
|
66
108
|
end
|
67
109
|
|
110
|
+
# Returns the table held within.
|
111
|
+
#
|
112
|
+
# @return [Arrow::Table] The table within.
|
113
|
+
#
|
68
114
|
attr_reader :table
|
69
115
|
|
70
|
-
|
71
|
-
@table
|
72
|
-
end
|
116
|
+
alias_method :to_arrow, :table
|
73
117
|
|
74
118
|
# Returns the number of rows.
|
75
119
|
#
|
76
120
|
# @return [Integer] Number of rows.
|
121
|
+
#
|
77
122
|
def size
|
78
123
|
@table.n_rows
|
79
124
|
end
|
80
|
-
alias_method :
|
125
|
+
alias_method :n_records, :size
|
81
126
|
alias_method :n_obs, :size
|
127
|
+
alias_method :n_rows, :size
|
82
128
|
|
83
129
|
# Returns the number of columns.
|
84
130
|
#
|
85
131
|
# @return [Integer] Number of columns.
|
132
|
+
#
|
86
133
|
def n_keys
|
87
134
|
@table.n_columns
|
88
135
|
end
|
89
|
-
alias_method :
|
136
|
+
alias_method :n_variables, :n_keys
|
90
137
|
alias_method :n_vars, :n_keys
|
138
|
+
alias_method :n_cols, :n_keys
|
91
139
|
|
92
140
|
# Returns the numbers of rows and columns.
|
93
141
|
#
|
94
142
|
# @return [Array]
|
95
143
|
# Number of rows and number of columns in an array.
|
96
144
|
# Same as [size, n_keys].
|
145
|
+
#
|
97
146
|
def shape
|
98
147
|
[size, n_keys]
|
99
148
|
end
|
@@ -101,7 +150,8 @@ module RedAmber
|
|
101
150
|
# Returns a Hash of key and Vector pairs in the columns.
|
102
151
|
#
|
103
152
|
# @return [Hash]
|
104
|
-
# key => Vector pairs for each columns.
|
153
|
+
# `key => Vector` pairs for each column.
|
154
|
+
#
|
105
155
|
def variables
|
106
156
|
@variables || @variables = init_instance_vars(:variables)
|
107
157
|
end
|
@@ -111,6 +161,7 @@ module RedAmber
|
|
111
161
|
#
|
112
162
|
# @return [Array]
|
113
163
|
# Keys in an Array.
|
164
|
+
#
|
114
165
|
def keys
|
115
166
|
@keys || @keys = init_instance_vars(:keys)
|
116
167
|
end
|
@@ -122,6 +173,7 @@ module RedAmber
|
|
122
173
|
# @param key [Symbol, String] Key to test.
|
123
174
|
# @return [Boolean]
|
124
175
|
# Returns true if self has key in Symbol.
|
176
|
+
#
|
125
177
|
def key?(key)
|
126
178
|
keys.include?(key.to_sym)
|
127
179
|
end
|
@@ -132,6 +184,7 @@ module RedAmber
|
|
132
184
|
# @param key [Symbol, String] key to know.
|
133
185
|
# @return [Integer]
|
134
186
|
# Index of key in the Array keys.
|
187
|
+
#
|
135
188
|
def key_index(key)
|
136
189
|
keys.find_index(key.to_sym)
|
137
190
|
end
|
@@ -142,14 +195,18 @@ module RedAmber
|
|
142
195
|
#
|
143
196
|
# @return [Array]
|
144
197
|
# Abbreviated Red Arrow data type names.
|
198
|
+
#
|
145
199
|
def types
|
146
|
-
@types || @types = @table.columns.map
|
200
|
+
@types || @types = @table.columns.map do |column|
|
201
|
+
column.data.value_type.nick.to_sym
|
202
|
+
end
|
147
203
|
end
|
148
204
|
|
149
205
|
# Returns an Array of Classes of data type.
|
150
206
|
#
|
151
207
|
# @return [Array]
|
152
208
|
# An Array of Red Arrow data type Classes.
|
209
|
+
#
|
153
210
|
def type_classes
|
154
211
|
@data_types || @data_types = @table.columns.map { |column| column.data_type.class }
|
155
212
|
end
|
@@ -157,50 +214,94 @@ module RedAmber
|
|
157
214
|
# Returns Vectors in an Array.
|
158
215
|
#
|
159
216
|
# @return [Array]
|
160
|
-
# An Array of RedAmber::Vector
|
217
|
+
# An Array of `RedAmber::Vector`s.
|
218
|
+
#
|
161
219
|
def vectors
|
162
220
|
@vectors || @vectors = init_instance_vars(:vectors)
|
163
221
|
end
|
164
222
|
|
165
|
-
# Returns row indices (start...(size+start)) in
|
223
|
+
# Returns row indices (start...(size+start)) in a Vector.
|
166
224
|
#
|
167
225
|
# @param start [Object]
|
168
|
-
# Object which have
|
226
|
+
# Object which has a `#succ` method.
|
227
|
+
#
|
169
228
|
# @return [Vector]
|
170
|
-
#
|
229
|
+
# A Vector of row indices.
|
230
|
+
#
|
171
231
|
# @example
|
172
232
|
# (when self.size == 5)
|
173
|
-
# - indices #=> [0, 1, 2, 3, 4]
|
174
|
-
# - indices(1) #=> [1, 2, 3, 4, 5]
|
175
|
-
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
233
|
+
# - indices #=> Vector[0, 1, 2, 3, 4]
|
234
|
+
# - indices(1) #=> Vector[1, 2, 3, 4, 5]
|
235
|
+
# - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
|
236
|
+
#
|
176
237
|
def indices(start = 0)
|
177
|
-
(start..).take(size)
|
238
|
+
Vector.new((start..).take(size))
|
178
239
|
end
|
179
240
|
alias_method :indexes, :indices
|
180
241
|
|
242
|
+
# Returns column-oriented data in a Hash.
|
243
|
+
#
|
244
|
+
# @return [Hash] A Hash of 'key => column_in_an_array'.
|
245
|
+
#
|
181
246
|
def to_h
|
182
247
|
variables.transform_values(&:to_a)
|
183
248
|
end
|
184
249
|
|
250
|
+
# Returns a row-oriented array without header.
|
251
|
+
#
|
252
|
+
# @return [Array] Row-oriented data without header.
|
253
|
+
#
|
254
|
+
# @note If you need column-oriented array, use `.to_h.to_a`.
|
255
|
+
#
|
185
256
|
def to_a
|
186
|
-
# output an array of row-oriented data without header
|
187
|
-
# if you need column-oriented array, use `.to_h.to_a`
|
188
257
|
@table.raw_records
|
189
258
|
end
|
190
259
|
alias_method :raw_records, :to_a
|
191
260
|
|
261
|
+
# Returns column name and data type in a Hash.
|
262
|
+
#
|
263
|
+
# @return [Hash] Column name and data type.
|
264
|
+
#
|
265
|
+
# @example
|
266
|
+
# RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
|
267
|
+
# # => {:x=>:uint8, :y=>:string}
|
268
|
+
#
|
192
269
|
def schema
|
193
270
|
keys.zip(types).to_h
|
194
271
|
end
|
195
272
|
|
273
|
+
# Compare DataFrames.
|
274
|
+
#
|
275
|
+
# @return [true, false]
|
276
|
+
# True if other is a DataFrame and table is same.
|
277
|
+
# Otherwise return false.
|
278
|
+
#
|
196
279
|
def ==(other)
|
197
280
|
other.is_a?(DataFrame) && @table == other.table
|
198
281
|
end
|
199
282
|
|
283
|
+
# Check if it is an empty DataFrame.
|
284
|
+
#
|
285
|
+
# @return [true, false] True if it has no columns.
|
286
|
+
#
|
200
287
|
def empty?
|
201
288
|
variables.empty?
|
202
289
|
end
|
203
290
|
|
291
|
+
# Enumerate for each row.
|
292
|
+
#
|
293
|
+
# @overload each_row
|
294
|
+
# Returns Enumerator when no block given.
|
295
|
+
#
|
296
|
+
# @return [Enumerator] Enumerator of each row.
|
297
|
+
#
|
298
|
+
# @overload each_row(&block)
|
299
|
+
# Yields with key and row pairs.
|
300
|
+
#
|
301
|
+
# @yield [key_row_pairs] Yields with key and row pairs.
|
302
|
+
# @yieldparam [Hash] Key and row pairs.
|
303
|
+
# @yieldreturn [Integer] Size of the DataFrame.
|
304
|
+
#
|
204
305
|
def each_row
|
205
306
|
return enum_for(:each_row) unless block_given?
|
206
307
|
|
@@ -213,6 +314,10 @@ module RedAmber
|
|
213
314
|
end
|
214
315
|
end
|
215
316
|
|
317
|
+
# Returns self in a `Rover::DataFrame`.
|
318
|
+
#
|
319
|
+
# @return [Rover::DataFrame] A `Rover::DataFrame`.
|
320
|
+
#
|
216
321
|
def to_rover
|
217
322
|
require 'rover'
|
218
323
|
Rover::DataFrame.new(to_h)
|
@@ -225,7 +330,7 @@ module RedAmber
|
|
225
330
|
end
|
226
331
|
|
227
332
|
def method_missing(name, *args, &block)
|
228
|
-
return v(name) if args.empty?
|
333
|
+
return v(name) if args.empty? && key?(name)
|
229
334
|
|
230
335
|
super
|
231
336
|
end
|
@@ -240,20 +345,31 @@ module RedAmber
|
|
240
345
|
|
241
346
|
# initialize @variables, @keys, @vectors and return one of them
|
242
347
|
def init_instance_vars(var)
|
243
|
-
ary =
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
348
|
+
ary =
|
349
|
+
@table.columns
|
350
|
+
.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
|
351
|
+
v = Vector.create(column.data)
|
352
|
+
k = column.name.to_sym
|
353
|
+
v.key = k
|
354
|
+
variables[k] = v
|
355
|
+
keys << k
|
356
|
+
vectors << v
|
357
|
+
end
|
358
|
+
|
251
359
|
@variables, @keys, @vectors = ary
|
252
360
|
ary[%i[variables keys vectors].index(var)]
|
253
361
|
end
|
254
362
|
|
363
|
+
def check_duplicate_keys(array)
|
364
|
+
org = array.dup
|
365
|
+
return unless array.uniq!
|
366
|
+
|
367
|
+
raise DataFrameArgumentError,
|
368
|
+
"duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
|
369
|
+
end
|
370
|
+
|
255
371
|
def name_unnamed_keys
|
256
|
-
return unless @table
|
372
|
+
return unless @table.key?('')
|
257
373
|
|
258
374
|
# We can't use #keys because it causes mismatch of @table and @keys
|
259
375
|
keys = @table.schema.fields.map { |f| f.name.to_sym }
|