red_amber 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +114 -39
  3. data/CHANGELOG.md +203 -31
  4. data/Gemfile +5 -2
  5. data/README.md +62 -29
  6. data/benchmark/basic.yml +86 -0
  7. data/benchmark/combine.yml +62 -0
  8. data/benchmark/dataframe.yml +62 -0
  9. data/benchmark/drop_nil.yml +15 -3
  10. data/benchmark/group.yml +39 -0
  11. data/benchmark/reshape.yml +31 -0
  12. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  13. data/benchmark/rover/flights.yml +23 -0
  14. data/benchmark/rover/penguins.yml +23 -0
  15. data/benchmark/rover/planes.yml +23 -0
  16. data/benchmark/rover/weather.yml +23 -0
  17. data/benchmark/vector.yml +60 -0
  18. data/doc/DataFrame.md +335 -53
  19. data/doc/Vector.md +91 -0
  20. data/doc/image/dataframe/join.png +0 -0
  21. data/doc/image/dataframe/set_and_bind.png +0 -0
  22. data/doc/image/dataframe_model.png +0 -0
  23. data/lib/red_amber/data_frame.rb +167 -51
  24. data/lib/red_amber/data_frame_combinable.rb +486 -0
  25. data/lib/red_amber/data_frame_displayable.rb +6 -4
  26. data/lib/red_amber/data_frame_indexable.rb +2 -2
  27. data/lib/red_amber/data_frame_loadsave.rb +4 -1
  28. data/lib/red_amber/data_frame_reshaping.rb +35 -10
  29. data/lib/red_amber/data_frame_selectable.rb +221 -116
  30. data/lib/red_amber/data_frame_variable_operation.rb +146 -82
  31. data/lib/red_amber/group.rb +108 -18
  32. data/lib/red_amber/helper.rb +53 -43
  33. data/lib/red_amber/refinements.rb +199 -0
  34. data/lib/red_amber/vector.rb +56 -46
  35. data/lib/red_amber/vector_functions.rb +23 -83
  36. data/lib/red_amber/vector_selectable.rb +116 -69
  37. data/lib/red_amber/vector_updatable.rb +189 -65
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +3 -0
  40. data/red_amber.gemspec +4 -3
  41. metadata +24 -10
@@ -5,6 +5,7 @@ module RedAmber
5
5
  # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameCombinable
8
9
  include DataFrameDisplayable
9
10
  include DataFrameIndexable
10
11
  include DataFrameLoadSave
@@ -13,87 +14,135 @@ module RedAmber
13
14
  include DataFrameVariableOperation
14
15
  include Helper
15
16
 
16
- # Creates a new RedAmber::DataFrame.
17
+ using RefineArrowTable
18
+ using RefineHash
19
+
20
+ # Quicker DataFrame construction from an `Arrow::Table`.
17
21
  #
18
- # @overload initialize(hash)
22
+ # @param table [Arrow::Table] A table to have in the DataFrame.
23
+ # @return [DataFrame] Initialized DataFrame.
19
24
  #
20
- # @params hash [Hash]
25
+ # @note This method allocates the instance directly, bypassing `initialize`, and is meant to be used inside other methods.
26
+ # @note `table` must have unique keys.
27
+ def self.create(table)
28
+ instance = allocate
29
+ instance.instance_variable_set(:@table, table)
30
+ instance
31
+ end
32
+
33
+ # Creates a new DataFrame.
21
34
  #
22
35
  # @overload initialize(table)
36
+ # Initialize DataFrame by an `Arrow::Table`
37
+ #
38
+ # @param table [Arrow::Table]
39
+ # A table to have in the DataFrame.
40
+ #
41
+ # @overload initialize(arrowable)
42
+ # Initialize DataFrame by an object that responds to `#to_arrow`.
43
+ #
44
+ # @param arrowable [#to_arrow]
45
+ # Any object which responds to `#to_arrow`.
46
+ # `#to_arrow` must return `Arrow::Table`.
47
+ #
48
+ # @note `RedAmber::DataFrame` itself is readable by this.
49
+ # @note Hash is refined to respond to `#to_arrow` in this class.
50
+ #
51
+ # @overload initialize(rover_like)
52
+ # Initialize DataFrame by a `Rover::DataFrame`-like object that responds to `#to_h`.
53
+ #
54
+ # @param rover_like [#to_h]
55
+ # Any object which responds to `#to_h`.
56
+ # `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
23
57
  #
24
- # @params table [Arrow::Table]
58
+ # @note `Rover::DataFrame` is readable by this.
25
59
  #
26
- # @overload initialize(dataframe)
60
+ # @overload initialize()
61
+ # Create empty DataFrame
27
62
  #
28
- # @params dataframe [RedAmber::DataFrame, Rover::DataFrame]
63
+ # @example DataFrame.new
29
64
  #
30
- # @overload initialize(null)
65
+ # @overload initialize(empty)
66
+ # Create empty DataFrame
31
67
  #
32
- # @params null [NilClass] No arguments.
68
+ # @param empty [nil, [], {}]
69
+ #
70
+ # @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
71
+ #
72
+ # @overload initialize(args)
73
+ #
74
+ # @param args [values]
75
+ # Accepts any arguments that are valid for `Arrow::Table.new(args)`. See
76
+ # {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb}
33
77
  #
34
78
  def initialize(*args)
35
- @variables = @keys = @vectors = @types = @data_types = nil
36
79
  case args
37
80
  in nil | [nil] | [] | {} | [[]] | [{}]
38
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
39
- # returns empty DataFrame
40
81
  @table = Arrow::Table.new({}, [])
41
- in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
82
+ in [Arrow::Table => table]
83
+ @table = table
84
+ in [arrowable] if arrowable.respond_to?(:to_arrow)
42
85
  table = arrowable.to_arrow
43
86
  unless table.is_a?(Arrow::Table)
44
87
  raise DataFrameTypeError,
45
88
  "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
46
89
  end
47
90
  @table = table
48
- in [Arrow::Table => table]
49
- @table = table
50
- in [DataFrame => dataframe]
51
- @table = dataframe.table
52
- in [rover_or_hash]
91
+ in [rover_like] if rover_like.respond_to?(:to_h)
53
92
  begin
54
- # Accepts Rover::DataFrame or Hash
55
- @table = Arrow::Table.new(rover_or_hash.to_h)
93
+ # Accepts Rover::DataFrame
94
+ @table = Arrow::Table.new(rover_like.to_h)
56
95
  rescue StandardError
57
- raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
96
+ raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
58
97
  end
59
98
  else
60
- @table = Arrow::Table.new(*args)
99
+ begin
100
+ @table = Arrow::Table.new(*args)
101
+ rescue StandardError
102
+ raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
103
+ end
61
104
  end
62
- name_unnamed_keys
63
105
 
64
- duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
65
- raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
106
+ name_unnamed_keys
107
+ check_duplicate_keys(keys)
66
108
  end
67
109
 
110
+ # Returns the table held within.
111
+ #
112
+ # @return [Arrow::Table] The table within.
113
+ #
68
114
  attr_reader :table
69
115
 
70
- def to_arrow
71
- @table
72
- end
116
+ alias_method :to_arrow, :table
73
117
 
74
118
  # Returns the number of rows.
75
119
  #
76
120
  # @return [Integer] Number of rows.
121
+ #
77
122
  def size
78
123
  @table.n_rows
79
124
  end
80
- alias_method :n_rows, :size
125
+ alias_method :n_records, :size
81
126
  alias_method :n_obs, :size
127
+ alias_method :n_rows, :size
82
128
 
83
129
  # Returns the number of columns.
84
130
  #
85
131
  # @return [Integer] Number of columns.
132
+ #
86
133
  def n_keys
87
134
  @table.n_columns
88
135
  end
89
- alias_method :n_cols, :n_keys
136
+ alias_method :n_variables, :n_keys
90
137
  alias_method :n_vars, :n_keys
138
+ alias_method :n_cols, :n_keys
91
139
 
92
140
  # Returns the numbers of rows and columns.
93
141
  #
94
142
  # @return [Array]
95
143
  # Number of rows and number of columns in an array.
96
144
  # Same as [size, n_keys].
145
+ #
97
146
  def shape
98
147
  [size, n_keys]
99
148
  end
@@ -101,7 +150,8 @@ module RedAmber
101
150
  # Returns a Hash of key and Vector pairs in the columns.
102
151
  #
103
152
  # @return [Hash]
104
- # key => Vector pairs for each columns.
153
+ # `key => Vector` pairs for each column.
154
+ #
105
155
  def variables
106
156
  @variables || @variables = init_instance_vars(:variables)
107
157
  end
@@ -111,6 +161,7 @@ module RedAmber
111
161
  #
112
162
  # @return [Array]
113
163
  # Keys in an Array.
164
+ #
114
165
  def keys
115
166
  @keys || @keys = init_instance_vars(:keys)
116
167
  end
@@ -122,6 +173,7 @@ module RedAmber
122
173
  # @param key [Symbol, String] Key to test.
123
174
  # @return [Boolean]
124
175
  # Returns true if self has key in Symbol.
176
+ #
125
177
  def key?(key)
126
178
  keys.include?(key.to_sym)
127
179
  end
@@ -132,6 +184,7 @@ module RedAmber
132
184
  # @param key [Symbol, String] key to know.
133
185
  # @return [Integer]
134
186
  # Index of key in the Array keys.
187
+ #
135
188
  def key_index(key)
136
189
  keys.find_index(key.to_sym)
137
190
  end
@@ -142,14 +195,18 @@ module RedAmber
142
195
  #
143
196
  # @return [Array]
144
197
  # Abbreviated Red Arrow data type names.
198
+ #
145
199
  def types
146
- @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
200
+ @types || @types = @table.columns.map do |column|
201
+ column.data.value_type.nick.to_sym
202
+ end
147
203
  end
148
204
 
149
205
  # Returns an Array of Classes of data type.
150
206
  #
151
207
  # @return [Array]
152
208
  # An Array of Red Arrow data type Classes.
209
+ #
153
210
  def type_classes
154
211
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
155
212
  end
@@ -157,50 +214,94 @@ module RedAmber
157
214
  # Returns Vectors in an Array.
158
215
  #
159
216
  # @return [Array]
160
- # An Array of RedAmber::Vector s.
217
+ # An Array of `RedAmber::Vector`s.
218
+ #
161
219
  def vectors
162
220
  @vectors || @vectors = init_instance_vars(:vectors)
163
221
  end
164
222
 
165
- # Returns row indices (start...(size+start)) in an Array.
223
+ # Returns row indices (start...(size+start)) in a Vector.
166
224
  #
167
225
  # @param start [Object]
168
- # Object which have #succ method.
226
+ # Object which has a `#succ` method.
227
+ #
169
228
  # @return [Vector]
170
- # An Array of indices of the row.
229
+ # A Vector of row indices.
230
+ #
171
231
  # @example
172
232
  # (when self.size == 5)
173
- # - indices #=> [0, 1, 2, 3, 4]
174
- # - indices(1) #=> [1, 2, 3, 4, 5]
175
- # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
233
+ # - indices #=> Vector[0, 1, 2, 3, 4]
234
+ # - indices(1) #=> Vector[1, 2, 3, 4, 5]
235
+ # - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
236
+ #
176
237
  def indices(start = 0)
177
- (start..).take(size)
238
+ Vector.new((start..).take(size))
178
239
  end
179
240
  alias_method :indexes, :indices
180
241
 
242
+ # Returns column-oriented data in a Hash.
243
+ #
244
+ # @return [Hash] A Hash of 'key => column_in_an_array'.
245
+ #
181
246
  def to_h
182
247
  variables.transform_values(&:to_a)
183
248
  end
184
249
 
250
+ # Returns a row-oriented array without header.
251
+ #
252
+ # @return [Array] Row-oriented data without header.
253
+ #
254
+ # @note If you need column-oriented array, use `.to_h.to_a`.
255
+ #
185
256
  def to_a
186
- # output an array of row-oriented data without header
187
- # if you need column-oriented array, use `.to_h.to_a`
188
257
  @table.raw_records
189
258
  end
190
259
  alias_method :raw_records, :to_a
191
260
 
261
+ # Returns column name and data type in a Hash.
262
+ #
263
+ # @return [Hash] Column name and data type.
264
+ #
265
+ # @example
266
+ # RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
267
+ # # => {:x=>:uint8, :y=>:string}
268
+ #
192
269
  def schema
193
270
  keys.zip(types).to_h
194
271
  end
195
272
 
273
+ # Compare DataFrames.
274
+ #
275
+ # @return [true, false]
276
+ # True if other is a DataFrame and table is same.
277
+ # Otherwise return false.
278
+ #
196
279
  def ==(other)
197
280
  other.is_a?(DataFrame) && @table == other.table
198
281
  end
199
282
 
283
+ # Check if it is an empty DataFrame.
284
+ #
285
+ # @return [true, false] True if it has no columns.
286
+ #
200
287
  def empty?
201
288
  variables.empty?
202
289
  end
203
290
 
291
+ # Enumerate for each row.
292
+ #
293
+ # @overload each_row
294
+ # Returns Enumerator when no block given.
295
+ #
296
+ # @return [Enumerator] Enumerator of each row.
297
+ #
298
+ # @overload each_row(&block)
299
+ # Yields with key and row pairs.
300
+ #
301
+ # @yield [key_row_pairs] Yields with key and row pairs.
302
+ # @yieldparam [Hash] Key and row pairs.
303
+ # @yieldreturn [Integer] Size of the DataFrame.
304
+ #
204
305
  def each_row
205
306
  return enum_for(:each_row) unless block_given?
206
307
 
@@ -213,6 +314,10 @@ module RedAmber
213
314
  end
214
315
  end
215
316
 
317
+ # Returns self in a `Rover::DataFrame`.
318
+ #
319
+ # @return [Rover::DataFrame] A `Rover::DataFrame`.
320
+ #
216
321
  def to_rover
217
322
  require 'rover'
218
323
  Rover::DataFrame.new(to_h)
@@ -225,7 +330,7 @@ module RedAmber
225
330
  end
226
331
 
227
332
  def method_missing(name, *args, &block)
228
- return v(name) if args.empty?
333
+ return v(name) if args.empty? && key?(name)
229
334
 
230
335
  super
231
336
  end
@@ -240,20 +345,31 @@ module RedAmber
240
345
 
241
346
  # initialize @variables, @keys, @vectors and return one of them
242
347
  def init_instance_vars(var)
243
- ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
244
- v = Vector.new(column.data)
245
- k = column.name.to_sym
246
- v.key = k
247
- variables[k] = v
248
- keys << k
249
- vectors << v
250
- end
348
+ ary =
349
+ @table.columns
350
+ .each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
351
+ v = Vector.create(column.data)
352
+ k = column.name.to_sym
353
+ v.key = k
354
+ variables[k] = v
355
+ keys << k
356
+ vectors << v
357
+ end
358
+
251
359
  @variables, @keys, @vectors = ary
252
360
  ary[%i[variables keys vectors].index(var)]
253
361
  end
254
362
 
363
+ def check_duplicate_keys(array)
364
+ org = array.dup
365
+ return unless array.uniq!
366
+
367
+ raise DataFrameArgumentError,
368
+ "duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
369
+ end
370
+
255
371
  def name_unnamed_keys
256
- return unless @table[:'']
372
+ return unless @table.key?('')
257
373
 
258
374
  # We can't use #keys because it causes mismatch of @table and @keys
259
375
  keys = @table.schema.fields.map { |f| f.name.to_sym }