red_amber 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ loop_count: 10
2
+
3
+ contexts:
4
+ - name: HEAD
5
+ prelude: |
6
+ $LOAD_PATH.unshift(File.expand_path('lib'))
7
+ - name: 0.2.0
8
+ gems:
9
+ red_amber: 0.2.0
10
+
11
+ prelude: |
12
+ require 'red_amber'
13
+ include RedAmber
14
+ require 'datasets-arrow'
15
+
16
+ ds = Datasets::Rdatasets.new('nycflights13', 'flights')
17
+ flights = RedAmber::DataFrame.new(ds.to_arrow)
18
+ df = flights.slice { flights[:month] <= 6 }
19
+
20
+ tailnum_vector = df[:tailnum]
21
+ distance_vector = df[:distance]
22
+
23
+ strings = tailnum_vector.to_a
24
+ arrow_array = tailnum_vector.data
25
+ integers = df[:dep_delay].to_a
26
+ boolean_vector = df[:air_time].is_nil
27
+ index_vector = Vector.new(0...boolean_vector.size).filter(boolean_vector)
28
+ replacer = index_vector.data.map(&:to_s)
29
+ booleans = boolean_vector.to_a
30
+
31
+ benchmark:
32
+ 'V01: Vector.new from integer Array': |
33
+ Vector.new(integers)
34
+
35
+ 'V02: Vector.new from string Array': |
36
+ Vector.new(strings)
37
+
38
+ 'V03: Vector.new from boolean Vector': |
39
+ Vector.new(boolean_vector)
40
+
41
+ 'V04: Vector#sum': |
42
+ distance_vector.sum
43
+
44
+ 'V05: Vector#*': |
45
+ distance_vector * 1.852
46
+
47
+ 'V06: Vector#[booleans]': |
48
+ tailnum_vector[booleans]
49
+
50
+ 'V07: Vector#[boolean_vector]': |
51
+ tailnum_vector[boolean_vector]
52
+
53
+ 'V08: Vector#[index_vector]': |
54
+ tailnum_vector[index_vector]
55
+
56
+ 'V09: Vector#replace': |
57
+ tailnum_vector.replace(booleans, replacer)
58
+
59
+ 'V10: Vector#replace with broad casting': |
60
+ tailnum_vector.replace(booleans, 'x')
data/doc/DataFrame.md CHANGED
@@ -1302,7 +1302,10 @@ When the option `keep_key: true` used, the column `key` will be preserved.
1302
1302
  - `join_keys` are keys shared by self and other to match with them.
1303
1303
  - If `join_keys` are empty, common keys in self and other are chosen (natural join).
1304
1304
  - If (common keys) > `join_keys`, duplicated keys are renamed by `suffix`.
1305
+ - If you want to match the columns with different names,
1306
+ use Hash for `join_keys` such as `{ left: :KEY1, right: :KEY2 }`.
1305
1307
 
1308
+ These are dataframes to use in the examples of joins.
1306
1309
  ```ruby
1307
1310
  df = DataFrame.new(
1308
1311
  KEY: %w[A B C],
data/doc/Vector.md CHANGED
@@ -513,3 +513,91 @@ vector.shift(fill: Float::NAN)
513
513
  #<RedAmber::Vector(:double, size=5):0x0000000000011d3c>
514
514
  [NaN, 1.0, 2.0, 3.0, 4.0]
515
515
  ```
516
+
517
+ ### `split_to_columns(sep = ' ', limit = 0)`
518
+
519
+ Split string type Vector with any ASCII whitespace as separator.
520
+ Returns an Array of Vectors.
521
+
522
+ ```ruby
523
+ vector = Vector.new(['a b', 'c d', 'e f'])
524
+ vector.split_to_columns
525
+
526
+ #=>
527
+ [#<RedAmber::Vector(:string, size=3):0x00000000000363a8>
528
+ ["a", "c", "e"]
529
+ ,
530
+ #<RedAmber::Vector(:string, size=3):0x00000000000363bc>
531
+ ["b", "d", "f"]
532
+ ]
533
+ ```
534
+ It will be used for column splitting in DataFrame.
535
+
536
+ ```ruby
537
+ df = DataFrame.new(year_month: %w[2022-01 2022-02 2022-03])
538
+ .assign(:year, :month) { year_month.split_to_columns('-') }
539
+ .drop(:year_month)
540
+
541
+ #=>
542
+ #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000f974>
543
+ year month
544
+ <string> <string>
545
+ 0 2022 01
546
+ 1 2022 02
547
+ 2 2022 03
548
+ ```
549
+
550
+ ### `split_to_rows(sep = ' ', limit = 0)`
551
+
552
+ Split string type Vector with any ASCII whitespace as separator.
553
+ Returns a Vector flattened into rows.
554
+
555
+ ```ruby
556
+ vector = Vector.new(['a b', 'c d', 'e f'])
557
+ vector.split_to_rows
558
+
559
+ #=>
560
+ #<RedAmber::Vector(:string, size=6):0x000000000002ccf4>
561
+ ["a", "b", "c", "d", "e", "f"]
562
+ ```
563
+
564
+ ### `merge(other, sep: ' ')`
565
+
566
+ Merge String or other string Vector to self using a separator.
567
+ Self must be a string Vector.
568
+ Returns merged string Vector.
569
+
570
+ ```ruby
571
+ # with vector
572
+ vector = Vector.new(%w[a c e])
573
+ other = Vector.new(%w[b d f])
574
+ vector.merge(other)
575
+
576
+ #=>
577
+ #<RedAmber::Vector(:string, size=3):0x0000000000038b80>
578
+ ["a b", "c d", "e f"]
579
+ ```
580
+
581
+ If other is a String it will be broadcasted.
582
+
583
+ ```ruby
584
+ # with string
585
+ vector = Vector.new(%w[a c e])
586
+ vector.merge('x')
587
+ #=>
588
+ #<RedAmber::Vector(:string, size=3):0x00000000000446b0>
589
+ ["a x", "c x", "e x"]
590
+ ```
591
+
592
+ You can specify separator string by :sep.
593
+
594
+ ```ruby
595
+ # with vector
596
+ vector = Vector.new(%w[a c e])
597
+ other = Vector.new(%w[b d f])
598
+ vector.merge(other, sep: '')
599
+
600
+ #=>
601
+ #<RedAmber::Vector(:string, size=3):0x0000000000038b80>
602
+ ["ab", "cd", "ef"]
603
+ ```
@@ -14,65 +14,111 @@ module RedAmber
14
14
  include DataFrameVariableOperation
15
15
  include Helper
16
16
 
17
- # Creates a new RedAmber::DataFrame.
17
+ using RefineArrowTable
18
+ using RefineHash
19
+
20
+ # Quicker DataFrame construction from an `Arrow::Table`.
18
21
  #
19
- # @overload initialize(hash)
22
+ # @param table [Arrow::Table] A table to have in the DataFrame.
23
+ # @return [DataFrame] Initialized DataFrame.
20
24
  #
21
- # @params hash [Hash]
25
+ # @note This method allocates the instance and sets `table` directly without running `#initialize`; it is meant for use inside the library's own methods.
26
+ # @note `table` must have unique keys.
27
+ def self.create(table)
28
+ instance = allocate
29
+ instance.instance_variable_set(:@table, table)
30
+ instance
31
+ end
32
+
33
+ # Creates a new DataFrame.
22
34
  #
23
35
  # @overload initialize(table)
36
+ # Initialize DataFrame by an `Arrow::Table`
37
+ #
38
+ # @param table [Arrow::Table]
39
+ # A table to have in the DataFrame.
40
+ #
41
+ # @overload initialize(arrowable)
42
+ # Initialize DataFrame by an object which responds to `#to_arrow`.
43
+ #
44
+ # @param arrowable [#to_arrow]
45
+ # Any object which responds to `#to_arrow`.
46
+ # `#to_arrow` must return `Arrow::Table`.
47
+ #
48
+ # @note `RedAmber::DataFrame` itself is readable by this.
49
+ # @note Hash is refined to respond to `#to_arrow` in this class.
50
+ #
51
+ # @overload initialize(rover_like)
52
+ # Initialize DataFrame by a `Rover::DataFrame`-like object which responds to `#to_h`.
24
53
  #
25
- # @params table [Arrow::Table]
54
+ # @param rover_like [#to_h]
55
+ # Any object which responds to `#to_h`.
56
+ # `#to_h` must return a Hash which is convertible by `Arrow::Table.new`.
26
57
  #
27
- # @overload initialize(dataframe)
58
+ # @note `Rover::DataFrame` is readable by this.
28
59
  #
29
- # @params dataframe [RedAmber::DataFrame, Rover::DataFrame]
60
+ # @overload initialize()
61
+ # Create empty DataFrame
30
62
  #
31
- # @overload initialize(null)
63
+ # @example DataFrame.new
32
64
  #
33
- # @params null [NilClass] No arguments.
65
+ # @overload initialize(empty)
66
+ # Create empty DataFrame
67
+ #
68
+ # @param empty [nil, [], {}]
69
+ #
70
+ # @example DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
71
+ #
72
+ # @overload initialize(args)
73
+ #
74
+ # @param args [values]
75
+ # Accepts any arguments which are valid for `Arrow::Table.new(args)`. See
76
+ # {https://github.com/apache/arrow/blob/master/ruby/red-arrow/lib/arrow/table.rb table.rb in Red Arrow}.
34
77
  #
35
78
  def initialize(*args)
36
- @variables = @keys = @vectors = @types = @data_types = nil
37
79
  case args
38
80
  in nil | [nil] | [] | {} | [[]] | [{}]
39
- # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
40
- # returns empty DataFrame
41
81
  @table = Arrow::Table.new({}, [])
42
- in [->(x) { x.respond_to?(:to_arrow) } => arrowable]
82
+ in [Arrow::Table => table]
83
+ @table = table
84
+ in [arrowable] if arrowable.respond_to?(:to_arrow)
43
85
  table = arrowable.to_arrow
44
86
  unless table.is_a?(Arrow::Table)
45
87
  raise DataFrameTypeError,
46
88
  "to_arrow must return an Arrow::Table but #{table.class}: #{arrowable}"
47
89
  end
48
90
  @table = table
49
- in [Arrow::Table => table]
50
- @table = table
51
- in [rover_or_hash]
91
+ in [rover_like] if rover_like.respond_to?(:to_h)
52
92
  begin
53
- # Accepts Rover::DataFrame or Hash
54
- @table = Arrow::Table.new(rover_or_hash.to_h)
93
+ # Accepts Rover::DataFrame
94
+ @table = Arrow::Table.new(rover_like.to_h)
55
95
  rescue StandardError
56
- raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
96
+ raise DataFrameTypeError, "to_h must return Arrowable object: #{rover_like}"
57
97
  end
58
98
  else
59
- @table = Arrow::Table.new(*args)
99
+ begin
100
+ @table = Arrow::Table.new(*args)
101
+ rescue StandardError
102
+ raise DataFrameTypeError, "invalid argument to create Arrow::Table: #{args}"
103
+ end
60
104
  end
61
- name_unnamed_keys
62
105
 
63
- duplicated_keys = keys.tally.select { |_k, v| v > 1 }.keys
64
- raise DataFrameArgumentError, "duplicate keys: #{duplicated_keys}" unless duplicated_keys.empty?
106
+ name_unnamed_keys
107
+ check_duplicate_keys(keys)
65
108
  end
66
109
 
110
+ # Returns the table having within.
111
+ #
112
+ # @return [Arrow::Table] The table within.
113
+ #
67
114
  attr_reader :table
68
115
 
69
- def to_arrow
70
- @table
71
- end
116
+ alias_method :to_arrow, :table
72
117
 
73
118
  # Returns the number of rows.
74
119
  #
75
120
  # @return [Integer] Number of rows.
121
+ #
76
122
  def size
77
123
  @table.n_rows
78
124
  end
@@ -83,6 +129,7 @@ module RedAmber
83
129
  # Returns the number of columns.
84
130
  #
85
131
  # @return [Integer] Number of columns.
132
+ #
86
133
  def n_keys
87
134
  @table.n_columns
88
135
  end
@@ -95,6 +142,7 @@ module RedAmber
95
142
  # @return [Array]
96
143
  # Number of rows and number of columns in an array.
97
144
  # Same as [size, n_keys].
145
+ #
98
146
  def shape
99
147
  [size, n_keys]
100
148
  end
@@ -102,7 +150,8 @@ module RedAmber
102
150
  # Returns a Hash of key and Vector pairs in the columns.
103
151
  #
104
152
  # @return [Hash]
105
- # key => Vector pairs for each columns.
153
+ # `key => Vector` pairs for each columns.
154
+ #
106
155
  def variables
107
156
  @variables || @variables = init_instance_vars(:variables)
108
157
  end
@@ -112,6 +161,7 @@ module RedAmber
112
161
  #
113
162
  # @return [Array]
114
163
  # Keys in an Array.
164
+ #
115
165
  def keys
116
166
  @keys || @keys = init_instance_vars(:keys)
117
167
  end
@@ -123,6 +173,7 @@ module RedAmber
123
173
  # @param key [Symbol, String] Key to test.
124
174
  # @return [Boolean]
125
175
  # Returns true if self has key in Symbol.
176
+ #
126
177
  def key?(key)
127
178
  keys.include?(key.to_sym)
128
179
  end
@@ -133,6 +184,7 @@ module RedAmber
133
184
  # @param key [Symbol, String] key to know.
134
185
  # @return [Integer]
135
186
  # Index of key in the Array keys.
187
+ #
136
188
  def key_index(key)
137
189
  keys.find_index(key.to_sym)
138
190
  end
@@ -143,14 +195,18 @@ module RedAmber
143
195
  #
144
196
  # @return [Array]
145
197
  # Abbreviated Red Arrow data type names.
198
+ #
146
199
  def types
147
- @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
200
+ @types || @types = @table.columns.map do |column|
201
+ column.data.value_type.nick.to_sym
202
+ end
148
203
  end
149
204
 
150
205
  # Returns an Array of Classes of data type.
151
206
  #
152
207
  # @return [Array]
153
208
  # An Array of Red Arrow data type Classes.
209
+ #
154
210
  def type_classes
155
211
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
156
212
  end
@@ -158,50 +214,94 @@ module RedAmber
158
214
  # Returns Vectors in an Array.
159
215
  #
160
216
  # @return [Array]
161
- # An Array of RedAmber::Vector s.
217
+ # An Array of `RedAmber::Vector`s.
218
+ #
162
219
  def vectors
163
220
  @vectors || @vectors = init_instance_vars(:vectors)
164
221
  end
165
222
 
166
- # Returns row indices (start...(size+start)) in an Array.
223
+ # Returns row indices (start...(size+start)) in a Vector.
167
224
  #
168
225
  # @param start [Object]
169
- # Object which have #succ method.
226
+ # Object which have `#succ` method.
227
+ #
170
228
  # @return [Vector]
171
- # An Array of indices of the row.
229
+ # A Vector of row indices.
230
+ #
172
231
  # @example
173
232
  # (when self.size == 5)
174
- # - indices #=> [0, 1, 2, 3, 4]
175
- # - indices(1) #=> [1, 2, 3, 4, 5]
176
- # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
233
+ # - indices #=> Vector[0, 1, 2, 3, 4]
234
+ # - indices(1) #=> Vector[1, 2, 3, 4, 5]
235
+ # - indices('a') #=> Vector['a', 'b', 'c', 'd', 'e']
236
+ #
177
237
  def indices(start = 0)
178
238
  Vector.new((start..).take(size))
179
239
  end
180
240
  alias_method :indexes, :indices
181
241
 
242
+ # Returns column-oriented data in a Hash.
243
+ #
244
+ # @return [Hash] A Hash of 'key => column_in_an_array'.
245
+ #
182
246
  def to_h
183
247
  variables.transform_values(&:to_a)
184
248
  end
185
249
 
250
+ # Returns a row-oriented array without header.
251
+ #
252
+ # @return [Array] Row-oriented data without header.
253
+ #
254
+ # @note If you need column-oriented array, use `.to_h.to_a`.
255
+ #
186
256
  def to_a
187
- # output an array of row-oriented data without header
188
- # if you need column-oriented array, use `.to_h.to_a`
189
257
  @table.raw_records
190
258
  end
191
259
  alias_method :raw_records, :to_a
192
260
 
261
+ # Returns column name and data type in a Hash.
262
+ #
263
+ # @return [Hash] Column name and data type.
264
+ #
265
+ # @example
266
+ # RedAmber::DataFrame.new(x: [1, 2, 3], y: %w[A B C]).schema
267
+ # # => {:x=>:uint8, :y=>:string}
268
+ #
193
269
  def schema
194
270
  keys.zip(types).to_h
195
271
  end
196
272
 
273
+ # Compare DataFrames.
274
+ #
275
+ # @return [true, false]
276
+ # True if other is a DataFrame and table is same.
277
+ # Otherwise return false.
278
+ #
197
279
  def ==(other)
198
280
  other.is_a?(DataFrame) && @table == other.table
199
281
  end
200
282
 
283
+ # Check if it is an empty DataFrame.
284
+ #
285
+ # @return [true, false] True if it has no columns.
286
+ #
201
287
  def empty?
202
288
  variables.empty?
203
289
  end
204
290
 
291
+ # Enumerate for each row.
292
+ #
293
+ # @overload each_row
294
+ # Returns Enumerator when no block given.
295
+ #
296
+ # @return [Enumerator] Enumerator of each rows.
297
+ #
298
+ # @overload each_row(&block)
299
+ # Yields with key and row pairs.
300
+ #
301
+ # @yield [key_row_pairs] Yields with key and row pairs.
302
+ # @yieldparam [Hash] Key and row pairs.
303
+ # @yieldreturn [Integer] Size of the DataFrame.
304
+ #
205
305
  def each_row
206
306
  return enum_for(:each_row) unless block_given?
207
307
 
@@ -214,6 +314,10 @@ module RedAmber
214
314
  end
215
315
  end
216
316
 
317
+ # Returns self in a `Rover::DataFrame`.
318
+ #
319
+ # @return [Rover::DataFrame] A `Rover::DataFrame`.
320
+ #
217
321
  def to_rover
218
322
  require 'rover'
219
323
  Rover::DataFrame.new(to_h)
@@ -226,7 +330,7 @@ module RedAmber
226
330
  end
227
331
 
228
332
  def method_missing(name, *args, &block)
229
- return v(name) if args.empty?
333
+ return v(name) if args.empty? && key?(name)
230
334
 
231
335
  super
232
336
  end
@@ -241,20 +345,31 @@ module RedAmber
241
345
 
242
346
  # initialize @variables, @keys, @vectors and return one of them
243
347
  def init_instance_vars(var)
244
- ary = @table.columns.each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
245
- v = Vector.new(column.data)
246
- k = column.name.to_sym
247
- v.key = k
248
- variables[k] = v
249
- keys << k
250
- vectors << v
251
- end
348
+ ary =
349
+ @table.columns
350
+ .each_with_object([{}, [], []]) do |column, (variables, keys, vectors)|
351
+ v = Vector.create(column.data)
352
+ k = column.name.to_sym
353
+ v.key = k
354
+ variables[k] = v
355
+ keys << k
356
+ vectors << v
357
+ end
358
+
252
359
  @variables, @keys, @vectors = ary
253
360
  ary[%i[variables keys vectors].index(var)]
254
361
  end
255
362
 
363
+ def check_duplicate_keys(array)
364
+ org = array.dup
365
+ return unless array.uniq!
366
+
367
+ raise DataFrameArgumentError,
368
+ "duplicate keys: #{org.tally.select { |_k, v| v > 1 }.keys}"
369
+ end
370
+
256
371
  def name_unnamed_keys
257
- return unless @table[:'']
372
+ return unless @table.key?('')
258
373
 
259
374
  # We can't use #keys because it causes mismatch of @table and @keys
260
375
  keys = @table.schema.fields.map { |f| f.name.to_sym }