red_amber 0.1.7 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,39 +1,57 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # data frame class
5
- # @table : holds Arrow::Table object
4
+ # Class to represent a data frame.
5
+ # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
8
  include DataFrameDisplayable
9
9
  include DataFrameIndexable
10
+ include DataFrameReshaping
10
11
  include DataFrameSelectable
11
12
  include DataFrameVariableOperation
12
13
  include Helper
13
14
 
15
+ # Creates a new RedAmber::DataFrame.
16
+ #
17
+ # @overload initialize(hash)
18
+ #
19
+ # @param hash [Hash]
20
+ #
21
+ # @overload initialize(table)
22
+ #
23
+ # @param table [Arrow::Table]
24
+ #
25
+ # @overload initialize(dataframe)
26
+ #
27
+ # @param dataframe [RedAmber::DataFrame, Rover::DataFrame]
28
+ #
29
+ # @overload initialize(null)
30
+ #
31
+ # @param null [NilClass] No arguments.
32
+ #
14
33
  def initialize(*args)
15
34
  @variables = @keys = @vectors = @types = @data_types = nil
16
- # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
17
- # [Arrow::Table] == [nil] shows ArgumentError
18
- # temporary use yoda condition to workaround
19
- if args.empty? || args == [[]] || args == [{}] || [nil] == args
35
+ case args
36
+ in nil | [nil] | [] | {} | [[]] | [{}]
20
37
  # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
21
38
  # returns empty DataFrame
22
39
  @table = Arrow::Table.new({}, [])
23
- elsif args.size > 1
24
- @table = Arrow::Table.new(*args)
40
+ in [Arrow::Table => table]
41
+ @table = table
42
+ in [DataFrame => dataframe]
43
+ @table = dataframe.table
44
+ in [rover_or_hash]
45
+ begin
46
+ # Accepts Rover::DataFrame or Hash
47
+ @table = Arrow::Table.new(rover_or_hash.to_h)
48
+ rescue StandardError
49
+ raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
50
+ end
25
51
  else
26
- arg = args[0]
27
- @table =
28
- case arg
29
- when Arrow::Table then arg
30
- when DataFrame then arg.table
31
- when Rover::DataFrame then Arrow::Table.new(arg.to_h)
32
- when Hash then Arrow::Table.new(arg)
33
- else
34
- raise DataFrameTypeError, "invalid argument: #{arg}"
35
- end
52
+ @table = Arrow::Table.new(*args)
36
53
  end
54
+ name_unnamed_keys
37
55
  end
38
56
 
39
57
  def self.load(path, options = {})
@@ -50,58 +68,110 @@ module RedAmber
50
68
  @table.save(output, options)
51
69
  end
52
70
 
71
+ # Returns the number of rows.
72
+ #
73
+ # @return [Integer] Number of rows.
53
74
  def size
54
75
  @table.n_rows
55
76
  end
56
77
  alias_method :n_rows, :size
57
78
  alias_method :n_obs, :size
58
79
 
80
+ # Returns the number of columns.
81
+ #
82
+ # @return [Integer] Number of columns.
59
83
  def n_keys
60
84
  @table.n_columns
61
85
  end
62
86
  alias_method :n_cols, :n_keys
63
87
  alias_method :n_vars, :n_keys
64
88
 
89
+ # Returns the numbers of rows and columns.
90
+ #
91
+ # @return [Array]
92
+ # Number of rows and number of columns in an array.
93
+ # Same as [size, n_keys].
65
94
  def shape
66
95
  [size, n_keys]
67
96
  end
68
97
 
98
+ # Returns a Hash of key and Vector pairs in the columns.
99
+ #
100
+ # @return [Hash]
101
+ # key => Vector pairs for each column.
69
102
  def variables
70
103
  @variables || @variables = init_instance_vars(:variables)
71
104
  end
72
105
  alias_method :vars, :variables
73
106
 
107
+ # Returns an Array of keys.
108
+ #
109
+ # @return [Array]
110
+ # Keys in an Array.
74
111
  def keys
75
112
  @keys || @keys = init_instance_vars(:keys)
76
113
  end
77
114
  alias_method :column_names, :keys
78
115
  alias_method :var_names, :keys
79
116
 
117
+ # Returns true if self has a specified key in the argument.
118
+ #
119
+ # @param key [Symbol, String] Key to test.
120
+ # @return [Boolean]
121
+ # Returns true if self has key in Symbol.
80
122
  def key?(key)
81
- @keys.include?(key.to_sym)
123
+ keys.include?(key.to_sym)
82
124
  end
83
125
  alias_method :has_key?, :key?
84
126
 
127
+ # Returns index of specified key in the Array keys.
128
+ #
129
+ # @param key [Symbol, String] key to know.
130
+ # @return [Integer]
131
+ # Index of key in the Array keys.
85
132
  def key_index(key)
86
- @keys.find_index(key.to_sym)
133
+ keys.find_index(key.to_sym)
87
134
  end
88
135
  alias_method :find_index, :key_index
89
136
  alias_method :index, :key_index
90
137
 
138
+ # Returns abbreviated type names in an Array.
139
+ #
140
+ # @return [Array]
141
+ # Abbreviated Red Arrow data type names.
91
142
  def types
92
143
  @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
93
144
  end
94
145
 
146
+ # Returns an Array of Classes of data type.
147
+ #
148
+ # @return [Array]
149
+ # An Array of Red Arrow data type Classes.
95
150
  def type_classes
96
151
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
97
152
  end
98
153
 
154
+ # Returns Vectors in an Array.
155
+ #
156
+ # @return [Array]
157
+ # An Array of RedAmber::Vector s.
99
158
  def vectors
100
159
  @vectors || @vectors = init_instance_vars(:vectors)
101
160
  end
102
161
 
103
- def indices
104
- (0...size).to_a
162
+ # Returns row indices (start...(size+start)) in an Array.
163
+ #
164
+ # @param start [Object]
165
+ # Object which has a #succ method.
166
+ # @return [Array]
167
+ # An Array of indices of the row.
168
+ # @example
169
+ # (when self.size == 5)
170
+ # - indices #=> [0, 1, 2, 3, 4]
171
+ # - indices(1) #=> [1, 2, 3, 4, 5]
172
+ # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
173
+ def indices(start = 0)
174
+ (start..).take(size)
105
175
  end
106
176
  alias_method :indexes, :indices
107
177
 
@@ -128,6 +198,18 @@ module RedAmber
128
198
  variables.empty?
129
199
  end
130
200
 
201
+ def each_row
202
+ return enum_for(:each_row) unless block_given?
203
+
204
+ size.times do |i|
205
+ key_row_pairs =
206
+ vectors.each_with_object({}) do |v, h|
207
+ h[v.key] = v.data[i]
208
+ end
209
+ yield key_row_pairs
210
+ end
211
+ end
212
+
131
213
  def to_rover
132
214
  require 'rover'
133
215
  Rover::DataFrame.new(to_h)
@@ -144,8 +226,22 @@ module RedAmber
144
226
  end
145
227
  end
146
228
 
147
- def group(*group_keys)
148
- Group.new(self, group_keys)
229
+ def group(*group_keys, &block)
230
+ g = Group.new(self, group_keys)
231
+ g = g.summarize(&block) if block
232
+ g
233
+ end
234
+
235
+ def method_missing(name, *args, &block)
236
+ return v(name) if args.empty?
237
+
238
+ super
239
+ end
240
+
241
+ def respond_to_missing?(name, include_private)
242
+ return true if key?(name)
243
+
244
+ super
149
245
  end
150
246
 
151
247
  private
@@ -182,5 +278,23 @@ module RedAmber
182
278
  html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
183
279
  "#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
184
280
  end
281
+
282
+ def name_unnamed_keys
283
+ return unless @table[:'']
284
+
285
+ # We can't use #keys because it causes mismatch of @table and @keys
286
+ keys = @table.schema.fields.map { |f| f.name.to_sym }
287
+ unnamed = (:unnamed1..).find { |e| !keys.include?(e) }
288
+ fields =
289
+ @table.schema.fields.map do |field|
290
+ if field.name.empty?
291
+ Arrow::Field.new(unnamed, field.data_type)
292
+ else
293
+ field
294
+ end
295
+ end
296
+ schema = Arrow::Schema.new(fields)
297
+ @table = Arrow::Table.new(schema, @table.columns)
298
+ end
185
299
  end
186
300
  end
@@ -5,15 +5,36 @@ require 'stringio'
5
5
  module RedAmber
6
6
  # mix-ins for the class DataFrame
7
7
  module DataFrameDisplayable
8
- def to_s
8
+ INDEX_KEY = :index_key_for_format_table
9
+
10
+ def to_s(width: 80)
9
11
  return '' if empty?
10
12
 
11
- format_table(width: 80)
13
+ format_table(width: width)
12
14
  end
13
15
 
14
- # def describe() end
15
-
16
- # def summary() end
16
+ # Show statistical summary in a new DataFrame.
17
+ # Make stats for numeric columns only.
18
+ # NaNs are ignored.
19
+ # Counts also show non-NaN counts.
20
+ #
21
+ # @return [DataFrame] a new dataframe.
22
+ def summary
23
+ num_keys = keys.select { |key| self[key].numeric? }
24
+
25
+ DataFrame.new(
26
+ variables: num_keys,
27
+ count: num_keys.map { |k| self[k].count },
28
+ mean: num_keys.map { |k| self[k].mean },
29
+ std: num_keys.map { |k| self[k].std },
30
+ min: num_keys.map { |k| self[k].min },
31
+ '25%': num_keys.map { |k| self[k].quantile(0.25) },
32
+ median: num_keys.map { |k| self[k].median },
33
+ '75%': num_keys.map { |k| self[k].quantile(0.75) },
34
+ max: num_keys.map { |k| self[k].max }
35
+ )
36
+ end
37
+ alias_method :describe, :summary
17
38
 
18
39
  def inspect
19
40
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
@@ -131,15 +152,11 @@ module RedAmber
131
152
  a
132
153
  end
133
154
 
134
- def format_table(width: 80)
135
- head = 5
136
- tail = 3
137
- n_digit = 1
138
-
155
+ def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
139
156
  original = self
140
- indices = size > head + tail ? [*0...head, *(size - tail)...size] : [*0...size]
157
+ indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size]
141
158
  df = slice(indices).assign do
142
- assigner = { '': indices.map { |i| (i + 1).to_s } }
159
+ assigner = { INDEX_KEY => indices.map { |i| (i + 1).to_s } }
143
160
  vectors.each_with_object(assigner) do |v, a|
144
161
  a[v.key] = v.to_a.map do |e|
145
162
  if e.nil?
@@ -155,13 +172,13 @@ module RedAmber
155
172
  end
156
173
  end
157
174
 
158
- df = df.pick { [keys[-1], keys[0..-2]] }
159
- df = size > head + tail ? df[0, 0, 0...head, 0, -tail..-1] : df[0, 0, 0..-1]
175
+ df = df.pick { [INDEX_KEY, keys - [INDEX_KEY]] }
176
+ df = size > head + tail ? df[0, 0, 0..head, -tail..-1] : df[0, 0, 0..-1]
160
177
  df = df.assign do
161
178
  vectors.each_with_object({}) do |v, assigner|
162
- vec = v.replace(0, v.key.to_s)
163
- .replace(1, v.key == :'' ? '' : "<#{original[v.key].type}>")
164
- assigner[v.key] = size > head + tail ? vec.replace(head + 2, ':') : vec
179
+ vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
180
+ .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
181
+ assigner[v.key] = original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
165
182
  end
166
183
  end
167
184
 
@@ -197,7 +214,7 @@ module RedAmber
197
214
  end
198
215
 
199
216
  def format_for_column(vector, original, width)
200
- if vector.key != :'' && !original[vector.key].numeric?
217
+ if vector.key != INDEX_KEY && !original[vector.key].numeric?
201
218
  "%-#{width}s"
202
219
  else
203
220
  "%#{width}s"
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameReshaping
6
+ # Transpose a wide DataFrame.
7
+ #
8
+ # @param key [Symbol] key of the index column
9
+ # to transpose into keys.
10
+ # If it is not specified, keys[0] is used.
11
+ # @param name [Symbol] key name of transposed index column.
12
+ # If it is not specified, :N is used. If it already exists, :N1 or :N1.succ is used.
13
+ # @return [DataFrame] transposed DataFrame
14
+ def transpose(key: keys.first, name: :N)
15
+ raise DataFrameArgumentError, "Self does not include: #{key}" unless keys.include?(key)
16
+
17
+ # Find unused name
18
+ new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
19
+ name = (:N1..).find { |k| !new_keys.include?(k) } if new_keys.include?(name)
20
+
21
+ hash = { name => (keys - [key]) }
22
+ i = keys.index(key)
23
+ each_row do |h|
24
+ k = h.values[i]
25
+ hash[k] = h.values - [k]
26
+ end
27
+ DataFrame.new(hash)
28
+ end
29
+
30
+ # Reshape wide DataFrame to a longer DataFrame.
31
+ #
32
+ # @param keep_keys [Array] keys to keep.
33
+ # @param name [Symbol, String] key of the column which comes **from key names**.
34
+ # @param value [Symbol, String] key of the column which comes **from values**.
35
+ # @return [DataFrame] long DataFrame.
36
+ def to_long(*keep_keys, name: :N, value: :V)
37
+ not_included = keep_keys - keys
38
+ raise DataFrameArgumentError, "Not have keys #{not_included}" unless not_included.empty?
39
+
40
+ name = name.to_sym
41
+ raise DataFrameArgumentError, "Invalid key: #{name}" if keep_keys.include?(name)
42
+
43
+ value = value.to_sym
44
+ raise DataFrameArgumentError, "Invalid key: #{value}" if keep_keys.include?(value)
45
+
46
+ hash = Hash.new { |h, k| h[k] = [] }
47
+ l = keys.size - keep_keys.size
48
+ each_row do |row|
49
+ row.each do |k, v|
50
+ if keep_keys.include?(k)
51
+ hash[k].concat([v] * l)
52
+ else
53
+ hash[name] << k
54
+ hash[value] << v
55
+ end
56
+ end
57
+ end
58
+ DataFrame.new(hash)
59
+ end
60
+
61
+ # Reshape long DataFrame to a wide DataFrame.
62
+ #
63
+ # @param name [Symbol, String] key of the column which will be expanded **to key names**.
64
+ # @param value [Symbol, String] key of the column which will be expanded **to values**.
65
+ # @return [DataFrame] wide DataFrame.
66
+ def to_wide(name: :N, value: :V)
67
+ name = name.to_sym
68
+ raise DataFrameArgumentError, "Invalid key: #{name}" unless keys.include?(name)
69
+
70
+ value = value.to_sym
71
+ raise DataFrameArgumentError, "Invalid key: #{value}" unless keys.include?(value)
72
+
73
+ hash = Hash.new { |h, k| h[k] = {} }
74
+ keep_keys = keys - [name, value]
75
+ each_row do |row|
76
+ keeps, converts = row.partition { |k, _| keep_keys.include?(k) }
77
+ h = converts.to_h
78
+ hash[keeps.to_h][h[name].to_s.to_sym] = h[value]
79
+ end
80
+ ks = hash.first[0].keys + hash.first[1].keys
81
+ vs = hash.map { |k, v| k.values + v.values }.transpose
82
+ DataFrame.new(ks.zip(vs))
83
+ end
84
+ end
85
+ end
@@ -3,8 +3,8 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select variables: [symbol] or [string]
7
- # select observations: [array of index], [range]
6
+ # select columns: [symbol] or [string]
7
+ # select rows: [array of index], [range]
8
8
  def [](*args)
9
9
  args.flatten!
10
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
@@ -22,17 +22,17 @@ module RedAmber
22
22
  raise DataFrameArgumentError, "Invalid argument: #{args}"
23
23
  end
24
24
 
25
- # slice and select some observations to create sub DataFrame
25
+ # slice and select rows to create sub DataFrame
26
26
  def slice(*args, &block)
27
27
  slicer = args
28
28
  if block
29
29
  raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
30
 
31
- slicer = instance_eval(&block)
31
+ slicer = [instance_eval(&block)]
32
32
  end
33
- slicer = [slicer].flatten
33
+ slicer.flatten!
34
34
 
35
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
35
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
36
  return remove_all_values if slicer.empty? || slicer[0].nil?
37
37
 
38
38
  vector = parse_to_vector(slicer)
@@ -46,15 +46,59 @@ module RedAmber
46
46
  raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
47
  end
48
48
 
49
- # remove selected observations to create sub DataFrame
49
+ def slice_by(key, keep_key: false, &block)
50
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
+ raise DataFrameArgumentError, 'No block given' unless block
52
+ raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
53
+ return self if key.nil?
54
+
55
+ slicer = instance_eval(&block)
56
+ return DataFrame.new unless slicer
57
+
58
+ if slicer.is_a?(Range)
59
+ from = slicer.begin
60
+ from =
61
+ if from.is_a?(String)
62
+ self[key].index(from)
63
+ elsif from.nil?
64
+ 0
65
+ elsif from < 0
66
+ size + from
67
+ else
68
+ from
69
+ end
70
+ to = slicer.end
71
+ to =
72
+ if to.is_a?(String)
73
+ self[key].index(to)
74
+ elsif to.nil?
75
+ size - 1
76
+ elsif to < 0
77
+ size + to
78
+ else
79
+ to
80
+ end
81
+ slicer = (from..to).to_a
82
+ else
83
+ slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
+ end
85
+
86
+ if keep_key
87
+ take(slicer)
88
+ else
89
+ take(slicer).drop(key)
90
+ end
91
+ end
92
+
93
+ # remove selected rows to create remainder DataFrame
50
94
  def remove(*args, &block)
51
95
  remover = args
52
96
  if block
53
97
  raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
98
 
55
- remover = instance_eval(&block)
99
+ remover = [instance_eval(&block)]
56
100
  end
57
- remover = [remover].flatten
101
+ remover.flatten!
58
102
 
59
103
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
104
  return self if remover.empty? || remover[0].nil?