red_amber 0.2.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,60 @@
1
+ loop_count: 10
2
+
3
+ contexts:
4
+ - name: HEAD
5
+ prelude: |
6
+ $LOAD_PATH.unshift(File.expand_path('lib'))
7
+ - name: 0.2.0
8
+ gems:
9
+ red_amber: 0.2.0
10
+
11
+ prelude: |
12
+ require 'red_amber'
13
+ include RedAmber
14
+ require 'datasets-arrow'
15
+
16
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
17
+ flights = RedAmber::DataFrame.new(ds.to_arrow)
18
+ df = flights.slice { flights[:month] <= 6 }
19
+
20
+ tailnum_vector = df[:tailnum]
21
+ distance_vector = df[:distance]
22
+
23
+ strings = tailnum_vector.to_a
24
+ arrow_array = tailnum_vector.data
25
+ integers = df[:dep_delay].to_a
26
+ boolean_vector = df[:air_time].is_nil
27
+ index_vector = Vector.new(0...boolean_vector.size).filter(boolean_vector)
28
+ replacer = index_vector.data.map(&:to_s)
29
+ booleans = boolean_vector.to_a
30
+
31
+ benchmark:
32
+ 'V01: Vector.new from integer Array': |
33
+ Vector.new(integers)
34
+
35
+ 'V02: Vector.new from string Array': |
36
+ Vector.new(strings)
37
+
38
+ 'V03: Vector.new from boolean Vector': |
39
+ Vector.new(boolean_vector)
40
+
41
+ 'V04: Vector#sum': |
42
+ distance_vector.mean
43
+
44
+ 'V05: Vector#*': |
45
+ distance_vector * 1.852
46
+
47
+ 'V06: Vector#[booleans]': |
48
+ tailnum_vector[booleans]
49
+
50
+ 'V07: Vector#[boolean_vector]': |
51
+ tailnum_vector[boolean_vector]
52
+
53
+ 'V08: Vector#[index_vector]': |
54
+ tailnum_vector[index_vector]
55
+
56
+ 'V09: Vector#replace': |
57
+ tailnum_vector.replace(booleans, replacer)
58
+
59
+ 'V10: Vector#replace with broad casting': |
60
+ tailnum_vector.replace(booleans, 'x')
data/doc/DataFrame.md CHANGED
@@ -1302,7 +1302,10 @@ When the option `keep_key: true` used, the column `key` will be preserved.
1302
1302
  - `join_keys` are keys shared by self and other to match with them.
1303
1303
  - If `join_keys` are empty, common keys in self and other are chosen (natural join).
1304
1304
  - If (common keys) > `join_keys`, duplicated keys are renamed by `suffix`.
1305
+ - If you want to match the columns with different names,
1306
+ use Hash for `join_keys` such as `{ left: :KEY1, right: :KEY2 }`.
1305
1307
 
1308
+ These are dataframes to use in the examples of joins.
1306
1309
  ```ruby
1307
1310
  df = DataFrame.new(
1308
1311
  KEY: %w[A B C],
data/doc/Vector.md CHANGED
@@ -513,3 +513,91 @@ vector.shift(fill: Float::NAN)
513
513
  #<RedAmber::Vector(:double, size=5):0x0000000000011d3c>
514
514
  [NaN, 1.0, 2.0, 3.0, 4.0]
515
515
  ```
516
+
517
+ ### `split_to_columns(sep = ' ', limit = 0)`
518
+
519
+ Split string type Vector with any ASCII whitespace as separator.
520
+ Returns an Array of Vectors.
521
+
522
+ ```ruby
523
+ vector = Vector.new(['a b', 'c d', 'e f'])
524
+ vector.split_to_columns
525
+
526
+ #=>
527
+ [#<RedAmber::Vector(:string, size=3):0x00000000000363a8>
528
+ ["a", "c", "e"]
529
+ ,
530
+ #<RedAmber::Vector(:string, size=3):0x00000000000363bc>
531
+ ["b", "d", "f"]
532
+ ]
533
+ ```
534
+ It will be used for column splitting in DataFrame.
535
+
536
+ ```ruby
537
+ df = DataFrame.new(year_month: %w[2022-01 2022-02 2022-03])
538
+ .assign(:year, :month) { year_month.split_to_columns('-') }
539
+ .drop(:year_month)
540
+
541
+ #=>
542
+ #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000f974>
543
+ year month
544
+ <string> <string>
545
+ 0 2022 01
546
+ 1 2022 02
547
+ 2 2022 03
548
+ ```
549
+
550
+ ### `split_to_rows(sep = ' ', limit = 0)`
551
+
552
+ Split string type Vector with any ASCII whitespace as separator.
553
+ Returns a Vector flattened into rows.
554
+
555
+ ```ruby
556
+ vector = Vector.new(['a b', 'c d', 'e f'])
557
+ vector.split_to_rows
558
+
559
+ #=>
560
+ #<RedAmber::Vector(:string, size=6):0x000000000002ccf4>
561
+ ["a", "b", "c", "d", "e", "f"]
562
+ ```
563
+
564
+ ### `merge(other, sep: ' ')`
565
+
566
+ Merge String or other string Vector to self using a separator.
567
+ Self must be a string Vector.
568
+ Returns merged string Vector.
569
+
570
+ ```ruby
571
+ # with vector
572
+ vector = Vector.new(%w[a c e])
573
+ other = Vector.new(%w[b d f])
574
+ vector.merge(other)
575
+
576
+ #=>
577
+ #<RedAmber::Vector(:string, size=3):0x0000000000038b80>
578
+ ["a b", "c d", "e f"]
579
+ ```
580
+
581
+ If other is a String it will be broadcasted.
582
+
583
+ ```ruby
584
+ # with String
585
+ vector = Vector.new(%w[a c e])
586
+ vector.merge('x')
587
+ #=>
588
+ #<RedAmber::Vector(:string, size=3):0x00000000000446b0>
589
+ ["a x", "c x", "e x"]
590
+ ```
591
+
592
+ You can specify separator string by :sep.
593
+
594
+ ```ruby
595
+ # with vector
596
+ vector = Vector.new(%w[a c e])
597
+ other = Vector.new(%w[b d f])
598
+ vector.merge(other, sep: '')
599
+
600
+ #=>
601
+ #<RedAmber::Vector(:string, size=3):0x0000000000038b80>
602
+ ["ab", "cd", "ef"]
603
+ ```
@@ -14,65 +14,111 @@ module RedAmber
14
14
  include DataFrameVariableOperation
15
15
  include Helper
16
16
 
17
- # Creates a new RedAmber::DataFrame.
17
+ using RefineArrowTable
18
+ using RefineHash
19
+
20
+ # Quicker DataFrame construction from an `Arrow::Table`.
18
21
  #
19
- # @overload initialize(hash)
22
+ # @param table [Arrow::Table] A table to have in the DataFrame.
23
+ # @return [DataFrame] Initialized DataFrame.
20
24
  #
21
- # @params hash [Hash]
25
+ # @note This method allocates the instance and sets `@table` directly, bypassing `#initialize`; it may be used inside methods of the library.
26
+ # @note `table` must have unique keys.
27
+ def self.create(table)
28
+ instance = allocate
29
+ instance.instance_variable_set(:@table, table)
30
+ instance
31
+ end
32
+
33
+ # Creates a new DataFrame.
22
34
  #
23
35
  # @overload initialize(table)
36
+ # Initialize DataFrame by an `Arrow::Table`
37
+ #
38
+ # @param table [Arrow::Table]
39
+ # A table to have in the DataFrame.
40
+ #
41
+ # @overload initialize(arrowable)
42
+ # Initialize DataFrame by an object that responds to `#to_arrow`.
43
+ #
44
+ # @param arrowable [#to_arrow]
45
+ # Any object which responds to `#to_arrow`.
46
+ # `#to_arrow` must return `Arrow::Table`.
47
+ #
48
+ # @note `RedAmber::DataFrame` itself is readable by this.
49
+ # @note Hash is refined to respond to `#to_arrow` in this class.
50
+ #
51
+ # @overload initialize(rover_like)
52
+ # Initialize DataFrame by a `Rover::DataFrame`-like object that responds to `#to_h`.
24
53
  #
25
- # @params table [Arrow::Table]
54
+ # @param rover_like [#to_h]
55
+ # Any object which responds to `#to_h`.
56
+ # `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
26
57
  #
27
- # @overload initialize(dataframe)
58
+ # @note `Rover::DataFrame` is readable by this.
28
59
  #
29
- # @params dataframe [RedAmber::DataFrame, Rover::DataFrame]
60
+ # @overload initialize()
61
+ # Create empty DataFrame
30
62
  #
31
- # @overload initialize(null)
63
+ # @example DataFrame.new
32
64
  #
33
- # @params null [NilClass] No arguments.
65
+ # @overload initialize(empty)
66
+ # Create empty DataFrame
67
+ #
68
+ # @param empty [nil, [], {}]
69
+ #
70
+ # @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
71
+ #
72
+ # @overload initialize(args)
73
+ #
74
+ # @param args [values]
75
+ # Accepts any arguments which are valid for `Arrow::Table.new(args)`. See
76
+ # {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb}
34
77
  #
35
78
  def initialize(*args)
36
- @variables = @keys = @vectors = @types = @data_types = nil
37
79
  case args
38
80
  in nil | [nil] | [] | {} | [[]] | [{}]
39
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
40
- # returns empty DataFrame
41
81
  @table = Arrow::Table.new({}, [])
42
- in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
82
+ in [Arrow::Table => table]
83
+ @table = table
84
+ in [arrowable] if arrowable.respond_to?(:to_arrow)
43
85
  table = arrowable.to_arrow
44
86
  unless table.is_a?(Arrow::Table)
45
87
  raise DataFrameTypeError,
46
88
  "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
47
89
  end
48
90
  @table = table
49
- in [Arrow::Table => table]
50
- @table = table
51
- in [rover_or_hash]
91
+ in [rover_like] if rover_like.respond_to?(:to_h)
52
92
  begin
53
- # Accepts Rover::DataFrame or Hash
54
- @table = Arrow::Table.new(rover_or_hash.to_h)
93
+ # Accepts Rover::DataFrame
94
+ @table = Arrow::Table.new(rover_like.to_h)
55
95
  rescue StandardError
56
- raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
96
+ raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
57
97
  end
58
98
  else
59
- @table = Arrow::Table.new(*args)
99
+ begin
100
+ @table = Arrow::Table.new(*args)
101
+ rescue StandardError
102
+ raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
103
+ end
60
104
  end
61
- name_unnamed_keys
62
105
 
63
- duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
64
- raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
106
+ name_unnamed_keys
107
+ check_duplicate_keys(keys)
65
108
  end
66
109
 
110
+ # Returns the table held within.
111
+ #
112
+ # @return [Arrow::Table] The table within.
113
+ #
67
114
  attr_reader :table
68
115
 
69
- def to_arrow
70
- @table
71
- end
116
+ alias_method :to_arrow, :table
72
117
 
73
118
  # Returns the number of rows.
74
119
  #
75
120
  # @return [Integer] Number of rows.
121
+ #
76
122
  def size
77
123
  @table.n_rows
78
124
  end
@@ -83,6 +129,7 @@ module RedAmber
83
129
  # Returns the number of columns.
84
130
  #
85
131
  # @return [Integer] Number of columns.
132
+ #
86
133
  def n_keys
87
134
  @table.n_columns
88
135
  end
@@ -95,6 +142,7 @@ module RedAmber
95
142
  # @return [Array]
96
143
  # Number of rows and number of columns in an array.
97
144
  # Same as [size, n_keys].
145
+ #
98
146
  def shape
99
147
  [size, n_keys]
100
148
  end
@@ -102,7 +150,8 @@ module RedAmber
102
150
  # Returns a Hash of key and Vector pairs in the columns.
103
151
  #
104
152
  # @return [Hash]
105
- # key => Vector pairs for each columns.
153
+ # `key => Vector` pairs for each columns.
154
+ #
106
155
  def variables
107
156
  @variables || @variables = init_instance_vars(:variables)
108
157
  end
@@ -112,6 +161,7 @@ module RedAmber
112
161
  #
113
162
  # @return [Array]
114
163
  # Keys in an Array.
164
+ #
115
165
  def keys
116
166
  @keys || @keys = init_instance_vars(:keys)
117
167
  end
@@ -123,6 +173,7 @@ module RedAmber
123
173
  # @param key [Symbol, String] Key to test.
124
174
  # @return [Boolean]
125
175
  # Returns true if self has key in Symbol.
176
+ #
126
177
  def key?(key)
127
178
  keys.include?(key.to_sym)
128
179
  end
@@ -133,6 +184,7 @@ module RedAmber
133
184
  # @param key [Symbol, String] key to know.
134
185
  # @return [Integer]
135
186
  # Index of key in the Array keys.
187
+ #
136
188
  def key_index(key)
137
189
  keys.find_index(key.to_sym)
138
190
  end
@@ -143,14 +195,18 @@ module RedAmber
143
195
  #
144
196
  # @return [Array]
145
197
  # Abbreviated Red Arrow data type names.
198
+ #
146
199
  def types
147
- @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
200
+ @types || @types = @table.columns.map do |column|
201
+ column.data.value_type.nick.to_sym
202
+ end
148
203
  end
149
204
 
150
205
  # Returns an Array of Classes of data type.
151
206
  #
152
207
  # @return [Array]
153
208
  # An Array of Red Arrow data type Classes.
209
+ #
154
210
  def type_classes
155
211
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
156
212
  end
@@ -158,50 +214,94 @@ module RedAmber
158
214
  # Returns Vectors in an Array.
159
215
  #
160
216
  # @return [Array]
161
- # An Array of RedAmber::Vector s.
217
+ # An Array of `RedAmber::Vector`s.
218
+ #
162
219
  def vectors
163
220
  @vectors || @vectors = init_instance_vars(:vectors)
164
221
  end
165
222
 
166
- # Returns row indices (start...(size+start)) in an Array.
223
+ # Returns row indices (start...(size+start)) in a Vector.
167
224
  #
168
225
  # @param start [Object]
169
- # Object which have #succ method.
226
+ # Object which have `#succ` method.
227
+ #
170
228
  # @return [Vector]
171
- # An Array of indices of the row.
229
+ # A Vector of row indices.
230
+ #
172
231
  # @example
173
232
  # (when self.size == 5)
174
- # - indices #=> [0, 1, 2, 3, 4]
175
- # - indices(1) #=> [1, 2, 3, 4, 5]
176
- # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
233
+ # - indices #=> Vector[0, 1, 2, 3, 4]
234
+ # - indices(1) #=> Vector[1, 2, 3, 4, 5]
235
+ # - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
236
+ #
177
237
  def indices(start = 0)
178
238
  Vector.new((start..).take(size))
179
239
  end
180
240
  alias_method :indexes, :indices
181
241
 
242
+ # Returns column-oriented data in a Hash.
243
+ #
244
+ # @return [Hash] A Hash of 'key => column_in_an_array'.
245
+ #
182
246
  def to_h
183
247
  variables.transform_values(&:to_a)
184
248
  end
185
249
 
250
+ # Returns a row-oriented array without header.
251
+ #
252
+ # @return [Array] Row-oriented data without header.
253
+ #
254
+ # @note If you need column-oriented array, use `.to_h.to_a`.
255
+ #
186
256
  def to_a
187
- # output an array of row-oriented data without header
188
- # if you need column-oriented array, use `.to_h.to_a`
189
257
  @table.raw_records
190
258
  end
191
259
  alias_method :raw_records, :to_a
192
260
 
261
+ # Returns column name and data type in a Hash.
262
+ #
263
+ # @return [Hash] Column name and data type.
264
+ #
265
+ # @example
266
+ # RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
267
+ # # => {:x=>:uint8, :y=>:string}
268
+ #
193
269
  def schema
194
270
  keys.zip(types).to_h
195
271
  end
196
272
 
273
+ # Compare DataFrames.
274
+ #
275
+ # @return [true, false]
276
+ # True if other is a DataFrame and table is same.
277
+ # Otherwise return false.
278
+ #
197
279
  def ==(other)
198
280
  other.is_a?(DataFrame) && @table == other.table
199
281
  end
200
282
 
283
+ # Check if it is an empty DataFrame.
284
+ #
285
+ # @return [true, false] True if it has no columns.
286
+ #
201
287
  def empty?
202
288
  variables.empty?
203
289
  end
204
290
 
291
+ # Enumerate for each row.
292
+ #
293
+ # @overload each_row
294
+ # Returns Enumerator when no block given.
295
+ #
296
+ # @return [Enumerator] Enumerator of each row.
297
+ #
298
+ # @overload each_row(&block)
299
+ # Yields with key and row pairs.
300
+ #
301
+ # @yield [key_row_pairs] Yields with key and row pairs.
302
+ # @yieldparam [Hash] Key and row pairs.
303
+ # @yieldreturn [Integer] Size of the DataFrame.
304
+ #
205
305
  def each_row
206
306
  return enum_for(:each_row) unless block_given?
207
307
 
@@ -214,6 +314,10 @@ module RedAmber
214
314
  end
215
315
  end
216
316
 
317
+ # Returns self in a `Rover::DataFrame`.
318
+ #
319
+ # @return [Rover::DataFrame] A `Rover::DataFrame`.
320
+ #
217
321
  def to_rover
218
322
  require 'rover'
219
323
  Rover::DataFrame.new(to_h)
@@ -226,7 +330,7 @@ module RedAmber
226
330
  end
227
331
 
228
332
  def method_missing(name, *args, &block)
229
- return v(name) if args.empty?
333
+ return v(name) if args.empty? && key?(name)
230
334
 
231
335
  super
232
336
  end
@@ -241,20 +345,31 @@ module RedAmber
241
345
 
242
346
  # initialize @variables, @keys, @vectors and return one of them
243
347
  def init_instance_vars(var)
244
- ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
245
- v = Vector.new(column.data)
246
- k = column.name.to_sym
247
- v.key = k
248
- variables[k] = v
249
- keys << k
250
- vectors << v
251
- end
348
+ ary =
349
+ @table.columns
350
+ .each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
351
+ v = Vector.create(column.data)
352
+ k = column.name.to_sym
353
+ v.key = k
354
+ variables[k] = v
355
+ keys << k
356
+ vectors << v
357
+ end
358
+
252
359
  @variables, @keys, @vectors = ary
253
360
  ary[%i[variables keys vectors].index(var)]
254
361
  end
255
362
 
363
+ def check_duplicate_keys(array)
364
+ org = array.dup
365
+ return unless array.uniq!
366
+
367
+ raise DataFrameArgumentError,
368
+ "duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
369
+ end
370
+
256
371
  def name_unnamed_keys
257
- return unless @table[:'']
372
+ return unless @table.key?('')
258
373
 
259
374
  # We can't use #keys because it causes mismatch of @table and @keys
260
375
  keys = @table.schema.fields.map { |f| f.name.to_sym }