red_amber 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +114 -39
  3. data/CHANGELOG.md +203 -31
  4. data/Gemfile +5 -2
  5. data/README.md +62 -29
  6. data/benchmark/basic.yml +86 -0
  7. data/benchmark/combine.yml +62 -0
  8. data/benchmark/dataframe.yml +62 -0
  9. data/benchmark/drop_nil.yml +15 -3
  10. data/benchmark/group.yml +39 -0
  11. data/benchmark/reshape.yml +31 -0
  12. data/benchmark/{csv_load_penguins.yml → rover/csv_load_penguins.yml} +3 -3
  13. data/benchmark/rover/flights.yml +23 -0
  14. data/benchmark/rover/penguins.yml +23 -0
  15. data/benchmark/rover/planes.yml +23 -0
  16. data/benchmark/rover/weather.yml +23 -0
  17. data/benchmark/vector.yml +60 -0
  18. data/doc/DataFrame.md +335 -53
  19. data/doc/Vector.md +91 -0
  20. data/doc/image/dataframe/join.png +0 -0
  21. data/doc/image/dataframe/set_and_bind.png +0 -0
  22. data/doc/image/dataframe_model.png +0 -0
  23. data/lib/red_amber/data_frame.rb +167 -51
  24. data/lib/red_amber/data_frame_combinable.rb +486 -0
  25. data/lib/red_amber/data_frame_displayable.rb +6 -4
  26. data/lib/red_amber/data_frame_indexable.rb +2 -2
  27. data/lib/red_amber/data_frame_loadsave.rb +4 -1
  28. data/lib/red_amber/data_frame_reshaping.rb +35 -10
  29. data/lib/red_amber/data_frame_selectable.rb +221 -116
  30. data/lib/red_amber/data_frame_variable_operation.rb +146 -82
  31. data/lib/red_amber/group.rb +108 -18
  32. data/lib/red_amber/helper.rb +53 -43
  33. data/lib/red_amber/refinements.rb +199 -0
  34. data/lib/red_amber/vector.rb +56 -46
  35. data/lib/red_amber/vector_functions.rb +23 -83
  36. data/lib/red_amber/vector_selectable.rb +116 -69
  37. data/lib/red_amber/vector_updatable.rb +189 -65
  38. data/lib/red_amber/version.rb +1 -1
  39. data/lib/red_amber.rb +3 -0
  40. data/red_amber.gemspec +4 -3
  41. metadata +24 -10
@@ -5,6 +5,7 @@ module RedAmber
5
5
  # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
+ include DataFrameCombinable
8
9
  include DataFrameDisplayable
9
10
  include DataFrameIndexable
10
11
  include DataFrameLoadSave
@@ -13,87 +14,135 @@ module RedAmber
13
14
  include DataFrameVariableOperation
14
15
  include Helper
15
16
 
16
- # Creates a new RedAmber::DataFrame.
17
+ using RefineArrowTable
18
+ using RefineHash
19
+
20
+ # Quicker DataFrame construction from an `Arrow::Table`.
17
21
  #
18
- # @overload initialize(hash)
22
+ # @param table [Arrow::Table] A table to have in the DataFrame.
23
+ # @return [DataFrame] Initialized DataFrame.
19
24
  #
20
- # @params hash [Hash]
25
+ # @note This method will allocate table directly and may be used in the method.
26
+ # @note `table` must have unique keys.
27
+ def self.create(table)
28
+ instance = allocate
29
+ instance.instance_variable_set(:@table, table)
30
+ instance
31
+ end
32
+
33
+ # Creates a new DataFrame.
21
34
  #
22
35
  # @overload initialize(table)
36
+ # Initialize DataFrame by an `Arrow::Table`
37
+ #
38
+ # @param table [Arrow::Table]
39
+ # A table to have in the DataFrame.
40
+ #
41
+ # @overload initialize(arrowable)
42
+ # Initialize DataFrame from an object that responds to `#to_arrow`.
43
+ #
44
+ # @param arrowable [#to_arrow]
45
+ # Any object which responds to `#to_arrow`.
46
+ # `#to_arrow` must return `Arrow::Table`.
47
+ #
48
+ # @note `RedAmber::DataFrame` itself is readable by this.
49
+ # @note Hash is refined to respond to `#to_arrow` in this class.
50
+ #
51
+ # @overload initialize(rover_like)
52
+ # Initialize DataFrame from a `Rover::DataFrame`-like object that responds to `#to_h`.
53
+ #
54
+ # @param rover_like [#to_h]
55
+ # Any object which responds to `#to_h`.
56
+ # `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
23
57
  #
24
- # @params table [Arrow::Table]
58
+ # @note `Rover::DataFrame` is readable by this.
25
59
  #
26
- # @overload initialize(dataframe)
60
+ # @overload initialize()
61
+ # Create empty DataFrame
27
62
  #
28
- # @params dataframe [RedAmber::DataFrame, Rover::DataFrame]
63
+ # @example DataFrame.new
29
64
  #
30
- # @overload initialize(null)
65
+ # @overload initialize(empty)
66
+ # Create empty DataFrame
31
67
  #
32
- # @params null [NilClass] No arguments.
68
+ # @param empty [nil, [], {}]
69
+ #
70
+ # @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
71
+ #
72
+ # @overload initialize(args)
73
+ #
74
+ # @param args [values]
75
+ # Accepts any arguments which are valid for `Arrow::Table.new(args)`. See
76
+ # {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb}
33
77
  #
34
78
  def initialize(*args)
35
- @variables = @keys = @vectors = @types = @data_types = nil
36
79
  case args
37
80
  in nil | [nil] | [] | {} | [[]] | [{}]
38
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
39
- # returns empty DataFrame
40
81
  @table = Arrow::Table.new({}, [])
41
- in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
82
+ in [Arrow::Table => table]
83
+ @table = table
84
+ in [arrowable] if arrowable.respond_to?(:to_arrow)
42
85
  table = arrowable.to_arrow
43
86
  unless table.is_a?(Arrow::Table)
44
87
  raise DataFrameTypeError,
45
88
  "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
46
89
  end
47
90
  @table = table
48
- in [Arrow::Table => table]
49
- @table = table
50
- in [DataFrame => dataframe]
51
- @table = dataframe.table
52
- in [rover_or_hash]
91
+ in [rover_like] if rover_like.respond_to?(:to_h)
53
92
  begin
54
- # Accepts Rover::DataFrame or Hash
55
- @table = Arrow::Table.new(rover_or_hash.to_h)
93
+ # Accepts Rover::DataFrame
94
+ @table = Arrow::Table.new(rover_like.to_h)
56
95
  rescue StandardError
57
- raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
96
+ raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
58
97
  end
59
98
  else
60
- @table = Arrow::Table.new(*args)
99
+ begin
100
+ @table = Arrow::Table.new(*args)
101
+ rescue StandardError
102
+ raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
103
+ end
61
104
  end
62
- name_unnamed_keys
63
105
 
64
- duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
65
- raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
106
+ name_unnamed_keys
107
+ check_duplicate_keys(keys)
66
108
  end
67
109
 
110
+ # Returns the table held within.
111
+ #
112
+ # @return [Arrow::Table] The table within.
113
+ #
68
114
  attr_reader :table
69
115
 
70
- def to_arrow
71
- @table
72
- end
116
+ alias_method :to_arrow, :table
73
117
 
74
118
  # Returns the number of rows.
75
119
  #
76
120
  # @return [Integer] Number of rows.
121
+ #
77
122
  def size
78
123
  @table.n_rows
79
124
  end
80
- alias_method :n_rows, :size
125
+ alias_method :n_records, :size
81
126
  alias_method :n_obs, :size
127
+ alias_method :n_rows, :size
82
128
 
83
129
  # Returns the number of columns.
84
130
  #
85
131
  # @return [Integer] Number of columns.
132
+ #
86
133
  def n_keys
87
134
  @table.n_columns
88
135
  end
89
- alias_method :n_cols, :n_keys
136
+ alias_method :n_variables, :n_keys
90
137
  alias_method :n_vars, :n_keys
138
+ alias_method :n_cols, :n_keys
91
139
 
92
140
  # Returns the numbers of rows and columns.
93
141
  #
94
142
  # @return [Array]
95
143
  # Number of rows and number of columns in an array.
96
144
  # Same as [size, n_keys].
145
+ #
97
146
  def shape
98
147
  [size, n_keys]
99
148
  end
@@ -101,7 +150,8 @@ module RedAmber
101
150
  # Returns a Hash of key and Vector pairs in the columns.
102
151
  #
103
152
  # @return [Hash]
104
- # key => Vector pairs for each columns.
153
+ # `key => Vector` pairs for each column.
154
+ #
105
155
  def variables
106
156
  @variables || @variables = init_instance_vars(:variables)
107
157
  end
@@ -111,6 +161,7 @@ module RedAmber
111
161
  #
112
162
  # @return [Array]
113
163
  # Keys in an Array.
164
+ #
114
165
  def keys
115
166
  @keys || @keys = init_instance_vars(:keys)
116
167
  end
@@ -122,6 +173,7 @@ module RedAmber
122
173
  # @param key [Symbol, String] Key to test.
123
174
  # @return [Boolean]
124
175
  # Returns true if self has key in Symbol.
176
+ #
125
177
  def key?(key)
126
178
  keys.include?(key.to_sym)
127
179
  end
@@ -132,6 +184,7 @@ module RedAmber
132
184
  # @param key [Symbol, String] key to know.
133
185
  # @return [Integer]
134
186
  # Index of key in the Array keys.
187
+ #
135
188
  def key_index(key)
136
189
  keys.find_index(key.to_sym)
137
190
  end
@@ -142,14 +195,18 @@ module RedAmber
142
195
  #
143
196
  # @return [Array]
144
197
  # Abbreviated Red Arrow data type names.
198
+ #
145
199
  def types
146
- @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
200
+ @types || @types = @table.columns.map do |column|
201
+ column.data.value_type.nick.to_sym
202
+ end
147
203
  end
148
204
 
149
205
  # Returns an Array of Classes of data type.
150
206
  #
151
207
  # @return [Array]
152
208
  # An Array of Red Arrow data type Classes.
209
+ #
153
210
  def type_classes
154
211
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
155
212
  end
@@ -157,50 +214,94 @@ module RedAmber
157
214
  # Returns Vectors in an Array.
158
215
  #
159
216
  # @return [Array]
160
- # An Array of RedAmber::Vector s.
217
+ # An Array of `RedAmber::Vector`s.
218
+ #
161
219
  def vectors
162
220
  @vectors || @vectors = init_instance_vars(:vectors)
163
221
  end
164
222
 
165
- # Returns row indices (start...(size+start)) in an Array.
223
+ # Returns row indices (start...(size+start)) in a Vector.
166
224
  #
167
225
  # @param start [Object]
168
- # Object which have #succ method.
226
+ # Object which has a `#succ` method.
227
+ #
169
228
  # @return [Vector]
170
- # An Array of indices of the row.
229
+ # A Vector of row indices.
230
+ #
171
231
  # @example
172
232
  # (when self.size == 5)
173
- # - indices #=> [0, 1, 2, 3, 4]
174
- # - indices(1) #=> [1, 2, 3, 4, 5]
175
- # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
233
+ # - indices #=> Vector[0, 1, 2, 3, 4]
234
+ # - indices(1) #=> Vector[1, 2, 3, 4, 5]
235
+ # - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
236
+ #
176
237
  def indices(start = 0)
177
- (start..).take(size)
238
+ Vector.new((start..).take(size))
178
239
  end
179
240
  alias_method :indexes, :indices
180
241
 
242
+ # Returns column-oriented data in a Hash.
243
+ #
244
+ # @return [Hash] A Hash of 'key => column_in_an_array'.
245
+ #
181
246
  def to_h
182
247
  variables.transform_values(&:to_a)
183
248
  end
184
249
 
250
+ # Returns a row-oriented array without header.
251
+ #
252
+ # @return [Array] Row-oriented data without header.
253
+ #
254
+ # @note If you need column-oriented array, use `.to_h.to_a`.
255
+ #
185
256
  def to_a
186
- # output an array of row-oriented data without header
187
- # if you need column-oriented array, use `.to_h.to_a`
188
257
  @table.raw_records
189
258
  end
190
259
  alias_method :raw_records, :to_a
191
260
 
261
+ # Returns column name and data type in a Hash.
262
+ #
263
+ # @return [Hash] Column name and data type.
264
+ #
265
+ # @example
266
+ # RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
267
+ # # => {:x=>:uint8, :y=>:string}
268
+ #
192
269
  def schema
193
270
  keys.zip(types).to_h
194
271
  end
195
272
 
273
+ # Compare DataFrames.
274
+ #
275
+ # @return [true, false]
276
+ # True if other is a DataFrame and table is same.
277
+ # Otherwise return false.
278
+ #
196
279
  def ==(other)
197
280
  other.is_a?(DataFrame) && @table == other.table
198
281
  end
199
282
 
283
+ # Check if it is an empty DataFrame.
284
+ #
285
+ # @return [true, false] True if it has no columns.
286
+ #
200
287
  def empty?
201
288
  variables.empty?
202
289
  end
203
290
 
291
+ # Enumerate for each row.
292
+ #
293
+ # @overload each_row
294
+ # Returns Enumerator when no block given.
295
+ #
296
+ # @return [Enumerator] Enumerator of each rows.
297
+ #
298
+ # @overload each_row(&block)
299
+ # Yields with key and row pairs.
300
+ #
301
+ # @yield [key_row_pairs] Yields with key and row pairs.
302
+ # @yieldparam [Hash] Key and row pairs.
303
+ # @yieldreturn [Integer] Size of the DataFrame.
304
+ #
204
305
  def each_row
205
306
  return enum_for(:each_row) unless block_given?
206
307
 
@@ -213,6 +314,10 @@ module RedAmber
213
314
  end
214
315
  end
215
316
 
317
+ # Returns self in a `Rover::DataFrame`.
318
+ #
319
+ # @return [Rover::DataFrame] A `Rover::DataFrame`.
320
+ #
216
321
  def to_rover
217
322
  require 'rover'
218
323
  Rover::DataFrame.new(to_h)
@@ -225,7 +330,7 @@ module RedAmber
225
330
  end
226
331
 
227
332
  def method_missing(name, *args, &block)
228
- return v(name) if args.empty?
333
+ return v(name) if args.empty? && key?(name)
229
334
 
230
335
  super
231
336
  end
@@ -240,20 +345,31 @@ module RedAmber
240
345
 
241
346
  # initialize @variables, @keys, @vectors and return one of them
242
347
  def init_instance_vars(var)
243
- ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
244
- v = Vector.new(column.data)
245
- k = column.name.to_sym
246
- v.key = k
247
- variables[k] = v
248
- keys << k
249
- vectors << v
250
- end
348
+ ary =
349
+ @table.columns
350
+ .each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
351
+ v = Vector.create(column.data)
352
+ k = column.name.to_sym
353
+ v.key = k
354
+ variables[k] = v
355
+ keys << k
356
+ vectors << v
357
+ end
358
+
251
359
  @variables, @keys, @vectors = ary
252
360
  ary[%i[variables keys vectors].index(var)]
253
361
  end
254
362
 
363
+ def check_duplicate_keys(array)
364
+ org = array.dup
365
+ return unless array.uniq!
366
+
367
+ raise DataFrameArgumentError,
368
+ "duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
369
+ end
370
+
255
371
  def name_unnamed_keys
256
- return unless @table[:'']
372
+ return unless @table.key?('')
257
373
 
258
374
  # We can't use #keys because it causes mismatch of @table and @keys
259
375
  keys = @table.schema.fields.map { |f| f.name.to_sym }