red_amber 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +114 -39
- data/CHANGELOG.md +203 -31
- data/Gemfile +5 -2
- data/README.md +62 -29
- data/benchmark/basic.yml +86 -0
- data/benchmark/combine.yml +62 -0
- data/benchmark/dataframe.yml +62 -0
- data/benchmark/drop_nil.yml +15 -3
- data/benchmark/group.yml +39 -0
- data/benchmark/reshape.yml +31 -0
- data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
- data/benchmark/rover/flights.yml +23 -0
- data/benchmark/rover/penguins.yml +23 -0
- data/benchmark/rover/planes.yml +23 -0
- data/benchmark/rover/weather.yml +23 -0
- data/benchmark/vector.yml +60 -0
- data/doc/DataFrame.md +335 -53
- data/doc/Vector.md +91 -0
- data/doc/image/dataframe/join.png +0 -0
- data/doc/image/dataframe/set_and_bind.png +0 -0
- data/doc/image/dataframe_model.png +0 -0
- data/lib/red_amber/data_frame.rb +167 -51
- data/lib/red_amber/data_frame_combinable.rb +486 -0
- data/lib/red_amber/data_frame_displayable.rb +6 -4
- data/lib/red_amber/data_frame_indexable.rb +2 -2
- data/lib/red_amber/data_frame_loadsave.rb +4 -1
- data/lib/red_amber/data_frame_reshaping.rb +35 -10
- data/lib/red_amber/data_frame_selectable.rb +221 -116
- data/lib/red_amber/data_frame_variable_operation.rb +146 -82
- data/lib/red_amber/group.rb +108 -18
- data/lib/red_amber/helper.rb +53 -43
- data/lib/red_amber/refinements.rb +199 -0
- data/lib/red_amber/vector.rb +56 -46
- data/lib/red_amber/vector_functions.rb +23 -83
- data/lib/red_amber/vector_selectable.rb +116 -69
- data/lib/red_amber/vector_updatable.rb +189 -65
- data/lib/red_amber/version.rb +1 -1
- data/lib/red_amber.rb +3 -0
- data/red_amber.gemspec +4 -3
- metadata +24 -10
data/lib/red_amber/data_frame.rb
CHANGED
@@ -5,6 +5,7 @@ module RedAmber
|
|
5
5
|
# Variable @table holds an Arrow::Table object.
|
6
6
|
class DataFrame
|
7
7
|
# mix-in
|
8
|
+
include DataFrameCombinable
|
8
9
|
include DataFrameDisplayable
|
9
10
|
include DataFrameIndexable
|
10
11
|
include DataFrameLoadSave
|
@@ -13,87 +14,135 @@ module RedAmber
|
|
13
14
|
include DataFrameVariableOperation
|
14
15
|
include Helper
|
15
16
|
|
16
|
-
|
17
|
+
using RefineArrowTable
|
18
|
+
using RefineHash
|
19
|
+
|
20
|
+
# Quicker DataFrame construction from a `Arrow::Table`.
|
17
21
|
#
|
18
|
-
# @
|
22
|
+
# @param table [Arrow::Table] A table to have in the DataFrame.
|
23
|
+
# @return [DataFrame] Initialized DataFrame.
|
19
24
|
#
|
20
|
-
#
|
25
|
+
# @note This method will allocate table directly and may be used in the method.
|
26
|
+
# @note `table` must have unique keys.
|
27
|
+
def self.create(table)
|
28
|
+
instance = allocate
|
29
|
+
instance.instance_variable_set(:@table, table)
|
30
|
+
instance
|
31
|
+
end
|
32
|
+
|
33
|
+
# Creates a new DataFrame.
|
21
34
|
#
|
22
35
|
# @overload initialize(table)
|
36
|
+
# Initialize DataFrame by an `Arrow::Table`
|
37
|
+
#
|
38
|
+
# @param table [Arrow::Table]
|
39
|
+
# A table to have in the DataFrame.
|
40
|
+
#
|
41
|
+
# @overload initialize(arrowable)
|
42
|
+
# Initialize DataFrame by an object that responds to `#to_arrow`.
|
43
|
+
#
|
44
|
+
# @param arrowable [#to_arrow]
|
45
|
+
# Any object which responds to `#to_arrow`.
|
46
|
+
# `#to_arrow` must return `Arrow::Table`.
|
47
|
+
#
|
48
|
+
# @note `RedAmber::DataFrame` itself is readable by this.
|
49
|
+
# @note Hash is refined to respond to `#to_arrow` in this class.
|
50
|
+
#
|
51
|
+
# @overload initialize(rover_like)
|
52
|
+
# Initialize DataFrame by a `Rover::DataFrame`-like object that responds to `#to_h`.
|
53
|
+
#
|
54
|
+
# @param rover_like [#to_h]
|
55
|
+
# Any object which responds to `#to_h`.
|
56
|
+
# `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
|
23
57
|
#
|
24
|
-
# @
|
58
|
+
# @note `Rover::DataFrame` is readable by this.
|
25
59
|
#
|
26
|
-
# @overload initialize(
|
60
|
+
# @overload initialize()
|
61
|
+
# Create empty DataFrame
|
27
62
|
#
|
28
|
-
# @
|
63
|
+
# @example DataFrame.new
|
29
64
|
#
|
30
|
-
# @overload initialize(
|
65
|
+
# @overload initialize(empty)
|
66
|
+
# Create empty DataFrame
|
31
67
|
#
|
32
|
-
# @
|
68
|
+
# @param empty [nil, [], {}]
|
69
|
+
#
|
70
|
+
# @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
71
|
+
#
|
72
|
+
# @overload initialize(args)
|
73
|
+
#
|
74
|
+
# @param args [values]
|
75
|
+
# Accepts any arguments which are valid for `Arrow::Table.new(args)`. See
|
76
|
+
# {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb
|
33
77
|
#
|
34
78
|
def initialize(*args)
|
35
|
-
@variables = @keys = @vectors = @types = @data_types = nil
|
36
79
|
case args
|
37
80
|
in nil | [nil] | [] | {} | [[]] | [{}]
|
38
|
-
# DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
|
39
|
-
# returns empty DataFrame
|
40
81
|
@table = Arrow::Table.new({}, [])
|
41
|
-
in [
|
82
|
+
in [Arrow::Table => table]
|
83
|
+
@table = table
|
84
|
+
in [arrowable] if arrowable.respond_to?(:to_arrow)
|
42
85
|
table = arrowable.to_arrow
|
43
86
|
unless table.is_a?(Arrow::Table)
|
44
87
|
raise DataFrameTypeError,
|
45
88
|
"to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
|
46
89
|
end
|
47
90
|
@table = table
|
48
|
-
in [
|
49
|
-
@table = table
|
50
|
-
in [DataFrame => dataframe]
|
51
|
-
@table = dataframe.table
|
52
|
-
in [rover_or_hash]
|
91
|
+
in [rover_like] if rover_like.respond_to?(:to_h)
|
53
92
|
begin
|
54
|
-
# Accepts Rover::DataFrame
|
55
|
-
@table = Arrow::Table.new(
|
93
|
+
# Accepts Rover::DataFrame
|
94
|
+
@table = Arrow::Table.new(rover_like.to_h)
|
56
95
|
rescue StandardError
|
57
|
-
raise DataFrameTypeError, "
|
96
|
+
raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
|
58
97
|
end
|
59
98
|
else
|
60
|
-
|
99
|
+
begin
|
100
|
+
@table = Arrow::Table.new(*args)
|
101
|
+
rescue StandardError
|
102
|
+
raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
|
103
|
+
end
|
61
104
|
end
|
62
|
-
name_unnamed_keys
|
63
105
|
|
64
|
-
|
65
|
-
|
106
|
+
name_unnamed_keys
|
107
|
+
check_duplicate_keys(keys)
|
66
108
|
end
|
67
109
|
|
110
|
+
# Returns the table having within.
|
111
|
+
#
|
112
|
+
# @return [Arrow::Table] The table within.
|
113
|
+
#
|
68
114
|
attr_reader :table
|
69
115
|
|
70
|
-
|
71
|
-
@table
|
72
|
-
end
|
116
|
+
alias_method :to_arrow, :table
|
73
117
|
|
74
118
|
# Returns the number of rows.
|
75
119
|
#
|
76
120
|
# @return [Integer] Number of rows.
|
121
|
+
#
|
77
122
|
def size
|
78
123
|
@table.n_rows
|
79
124
|
end
|
80
|
-
alias_method :
|
125
|
+
alias_method :n_records, :size
|
81
126
|
alias_method :n_obs, :size
|
127
|
+
alias_method :n_rows, :size
|
82
128
|
|
83
129
|
# Returns the number of columns.
|
84
130
|
#
|
85
131
|
# @return [Integer] Number of columns.
|
132
|
+
#
|
86
133
|
def n_keys
|
87
134
|
@table.n_columns
|
88
135
|
end
|
89
|
-
alias_method :
|
136
|
+
alias_method :n_variables, :n_keys
|
90
137
|
alias_method :n_vars, :n_keys
|
138
|
+
alias_method :n_cols, :n_keys
|
91
139
|
|
92
140
|
# Returns the numbers of rows and columns.
|
93
141
|
#
|
94
142
|
# @return [Array]
|
95
143
|
# Number of rows and number of columns in an array.
|
96
144
|
# Same as [size, n_keys].
|
145
|
+
#
|
97
146
|
def shape
|
98
147
|
[size, n_keys]
|
99
148
|
end
|
@@ -101,7 +150,8 @@ module RedAmber
|
|
101
150
|
# Returns a Hash of key and Vector pairs in the columns.
|
102
151
|
#
|
103
152
|
# @return [Hash]
|
104
|
-
# key => Vector pairs for each columns.
|
153
|
+
# `key => Vector` pairs for each column.
|
154
|
+
#
|
105
155
|
def variables
|
106
156
|
@variables || @variables = init_instance_vars(:variables)
|
107
157
|
end
|
@@ -111,6 +161,7 @@ module RedAmber
|
|
111
161
|
#
|
112
162
|
# @return [Array]
|
113
163
|
# Keys in an Array.
|
164
|
+
#
|
114
165
|
def keys
|
115
166
|
@keys || @keys = init_instance_vars(:keys)
|
116
167
|
end
|
@@ -122,6 +173,7 @@ module RedAmber
|
|
122
173
|
# @param key [Symbol, String] Key to test.
|
123
174
|
# @return [Boolean]
|
124
175
|
# Returns true if self has key in Symbol.
|
176
|
+
#
|
125
177
|
def key?(key)
|
126
178
|
keys.include?(key.to_sym)
|
127
179
|
end
|
@@ -132,6 +184,7 @@ module RedAmber
|
|
132
184
|
# @param key [Symbol, String] key to know.
|
133
185
|
# @return [Integer]
|
134
186
|
# Index of key in the Array keys.
|
187
|
+
#
|
135
188
|
def key_index(key)
|
136
189
|
keys.find_index(key.to_sym)
|
137
190
|
end
|
@@ -142,14 +195,18 @@ module RedAmber
|
|
142
195
|
#
|
143
196
|
# @return [Array]
|
144
197
|
# Abbreviated Red Arrow data type names.
|
198
|
+
#
|
145
199
|
def types
|
146
|
-
@types || @types = @table.columns.map
|
200
|
+
@types || @types = @table.columns.map do |column|
|
201
|
+
column.data.value_type.nick.to_sym
|
202
|
+
end
|
147
203
|
end
|
148
204
|
|
149
205
|
# Returns an Array of Classes of data type.
|
150
206
|
#
|
151
207
|
# @return [Array]
|
152
208
|
# An Array of Red Arrow data type Classes.
|
209
|
+
#
|
153
210
|
def type_classes
|
154
211
|
@data_types || @data_types = @table.columns.map { |column| column.data_type.class }
|
155
212
|
end
|
@@ -157,50 +214,94 @@ module RedAmber
|
|
157
214
|
# Returns Vectors in an Array.
|
158
215
|
#
|
159
216
|
# @return [Array]
|
160
|
-
# An Array of RedAmber::Vector
|
217
|
+
# An Array of `RedAmber::Vector`s.
|
218
|
+
#
|
161
219
|
def vectors
|
162
220
|
@vectors || @vectors = init_instance_vars(:vectors)
|
163
221
|
end
|
164
222
|
|
165
|
-
# Returns row indices (start...(size+start)) in
|
223
|
+
# Returns row indices (start...(size+start)) in a Vector.
|
166
224
|
#
|
167
225
|
# @param start [Object]
|
168
|
-
# Object which have
|
226
|
+
# Object which has a `#succ` method.
|
227
|
+
#
|
169
228
|
# @return [Vector]
|
170
|
-
#
|
229
|
+
# A Vector of row indices.
|
230
|
+
#
|
171
231
|
# @example
|
172
232
|
# (when self.size == 5)
|
173
|
-
# - indices #=> [0, 1, 2, 3, 4]
|
174
|
-
# - indices(1) #=> [1, 2, 3, 4, 5]
|
175
|
-
# - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
|
233
|
+
# - indices #=> Vector[0, 1, 2, 3, 4]
|
234
|
+
# - indices(1) #=> Vector[1, 2, 3, 4, 5]
|
235
|
+
# - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
|
236
|
+
#
|
176
237
|
def indices(start = 0)
|
177
|
-
(start..).take(size)
|
238
|
+
Vector.new((start..).take(size))
|
178
239
|
end
|
179
240
|
alias_method :indexes, :indices
|
180
241
|
|
242
|
+
# Returns column-oriented data in a Hash.
|
243
|
+
#
|
244
|
+
# @return [Hash] A Hash of 'key => column_in_an_array'.
|
245
|
+
#
|
181
246
|
def to_h
|
182
247
|
variables.transform_values(&:to_a)
|
183
248
|
end
|
184
249
|
|
250
|
+
# Returns a row-oriented array without header.
|
251
|
+
#
|
252
|
+
# @return [Array] Row-oriented data without header.
|
253
|
+
#
|
254
|
+
# @note If you need column-oriented array, use `.to_h.to_a`.
|
255
|
+
#
|
185
256
|
def to_a
|
186
|
-
# output an array of row-oriented data without header
|
187
|
-
# if you need column-oriented array, use `.to_h.to_a`
|
188
257
|
@table.raw_records
|
189
258
|
end
|
190
259
|
alias_method :raw_records, :to_a
|
191
260
|
|
261
|
+
# Returns column name and data type in a Hash.
|
262
|
+
#
|
263
|
+
# @return [Hash] Column name and data type.
|
264
|
+
#
|
265
|
+
# @example
|
266
|
+
# RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
|
267
|
+
# # => {:x=>:uint8, :y=>:string}
|
268
|
+
#
|
192
269
|
def schema
|
193
270
|
keys.zip(types).to_h
|
194
271
|
end
|
195
272
|
|
273
|
+
# Compare DataFrames.
|
274
|
+
#
|
275
|
+
# @return [true, false]
|
276
|
+
# True if other is a DataFrame and table is same.
|
277
|
+
# Otherwise return false.
|
278
|
+
#
|
196
279
|
def ==(other)
|
197
280
|
other.is_a?(DataFrame) && @table == other.table
|
198
281
|
end
|
199
282
|
|
283
|
+
# Check if it is an empty DataFrame.
|
284
|
+
#
|
285
|
+
# @return [true, false] True if it has no columns.
|
286
|
+
#
|
200
287
|
def empty?
|
201
288
|
variables.empty?
|
202
289
|
end
|
203
290
|
|
291
|
+
# Enumerate for each row.
|
292
|
+
#
|
293
|
+
# @overload each_row
|
294
|
+
# Returns Enumerator when no block given.
|
295
|
+
#
|
296
|
+
# @return [Enumerator] Enumerator of each rows.
|
297
|
+
#
|
298
|
+
# @overload each_row(&block)
|
299
|
+
# Yields with key and row pairs.
|
300
|
+
#
|
301
|
+
# @yield [key_row_pairs] Yields with key and row pairs.
|
302
|
+
# @yieldparam [Hash] Key and row pairs.
|
303
|
+
# @yieldreturn [Integer] Size of the DataFrame.
|
304
|
+
#
|
204
305
|
def each_row
|
205
306
|
return enum_for(:each_row) unless block_given?
|
206
307
|
|
@@ -213,6 +314,10 @@ module RedAmber
|
|
213
314
|
end
|
214
315
|
end
|
215
316
|
|
317
|
+
# Returns self in a `Rover::DataFrame`.
|
318
|
+
#
|
319
|
+
# @return [Rover::DataFrame] A `Rover::DataFrame`.
|
320
|
+
#
|
216
321
|
def to_rover
|
217
322
|
require 'rover'
|
218
323
|
Rover::DataFrame.new(to_h)
|
@@ -225,7 +330,7 @@ module RedAmber
|
|
225
330
|
end
|
226
331
|
|
227
332
|
def method_missing(name, *args, &block)
|
228
|
-
return v(name) if args.empty?
|
333
|
+
return v(name) if args.empty? && key?(name)
|
229
334
|
|
230
335
|
super
|
231
336
|
end
|
@@ -240,20 +345,31 @@ module RedAmber
|
|
240
345
|
|
241
346
|
# initialize @variables, @keys, @vectors and return one of them
|
242
347
|
def init_instance_vars(var)
|
243
|
-
ary =
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
348
|
+
ary =
|
349
|
+
@table.columns
|
350
|
+
.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
|
351
|
+
v = Vector.create(column.data)
|
352
|
+
k = column.name.to_sym
|
353
|
+
v.key = k
|
354
|
+
variables[k] = v
|
355
|
+
keys << k
|
356
|
+
vectors << v
|
357
|
+
end
|
358
|
+
|
251
359
|
@variables, @keys, @vectors = ary
|
252
360
|
ary[%i[variables keys vectors].index(var)]
|
253
361
|
end
|
254
362
|
|
363
|
+
def check_duplicate_keys(array)
|
364
|
+
org = array.dup
|
365
|
+
return unless array.uniq!
|
366
|
+
|
367
|
+
raise DataFrameArgumentError,
|
368
|
+
"duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
|
369
|
+
end
|
370
|
+
|
255
371
|
def name_unnamed_keys
|
256
|
-
return unless @table
|
372
|
+
return unless @table.key?('')
|
257
373
|
|
258
374
|
# We can't use #keys because it causes mismatch of @table and @keys
|
259
375
|
keys = @table.schema.fields.map { |f| f.name.to_sym }
|