red_amber 0.1.7 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,39 +1,57 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # data frame class
5
- # @table : holds Arrow::Table object
4
+ # Class to represent a data frame.
5
+ # Variable @table holds an Arrow::Table object.
6
6
  class DataFrame
7
7
  # mix-in
8
8
  include DataFrameDisplayable
9
9
  include DataFrameIndexable
10
+ include DataFrameReshaping
10
11
  include DataFrameSelectable
11
12
  include DataFrameVariableOperation
12
13
  include Helper
13
14
 
15
+ # Creates a new RedAmber::DataFrame.
16
+ #
17
+ # @overload initialize(hash)
18
+ #
19
+ # @param hash [Hash]
20
+ #
21
+ # @overload initialize(table)
22
+ #
23
+ # @param table [Arrow::Table]
24
+ #
25
+ # @overload initialize(dataframe)
26
+ #
27
+ # @param dataframe [RedAmber::DataFrame, Rover::DataFrame]
28
+ #
29
+ # @overload initialize(null)
30
+ #
31
+ # @param null [NilClass] No arguments.
32
+ #
14
33
  def initialize(*args)
15
34
  @variables = @keys = @vectors = @types = @data_types = nil
16
- # bug in gobject-introspection: ruby-gnome/ruby-gnome#1472
17
- # [Arrow::Table] == [nil] shows ArgumentError
18
- # temporary use yoda condition to workaround
19
- if args.empty? || args == [[]] || args == [{}] || [nil] == args
35
+ case args
36
+ in nil | [nil] | [] | {} | [[]] | [{}]
20
37
  # DataFrame.new, DataFrame.new([]), DataFrame.new({}), DataFrame.new(nil)
21
38
  # returns empty DataFrame
22
39
  @table = Arrow::Table.new({}, [])
23
- elsif args.size > 1
24
- @table = Arrow::Table.new(*args)
40
+ in [Arrow::Table => table]
41
+ @table = table
42
+ in [DataFrame => dataframe]
43
+ @table = dataframe.table
44
+ in [rover_or_hash]
45
+ begin
46
+ # Accepts Rover::DataFrame or Hash
47
+ @table = Arrow::Table.new(rover_or_hash.to_h)
48
+ rescue StandardError
49
+ raise DataFrameTypeError, "invalid argument: #{rover_or_hash}"
50
+ end
25
51
  else
26
- arg = args[0]
27
- @table =
28
- case arg
29
- when Arrow::Table then arg
30
- when DataFrame then arg.table
31
- when Rover::DataFrame then Arrow::Table.new(arg.to_h)
32
- when Hash then Arrow::Table.new(arg)
33
- else
34
- raise DataFrameTypeError, "invalid argument: #{arg}"
35
- end
52
+ @table = Arrow::Table.new(*args)
36
53
  end
54
+ name_unnamed_keys
37
55
  end
38
56
 
39
57
  def self.load(path, options = {})
@@ -50,58 +68,110 @@ module RedAmber
50
68
  @table.save(output, options)
51
69
  end
52
70
 
71
+ # Returns the number of rows.
72
+ #
73
+ # @return [Integer] Number of rows.
53
74
  def size
54
75
  @table.n_rows
55
76
  end
56
77
  alias_method :n_rows, :size
57
78
  alias_method :n_obs, :size
58
79
 
80
+ # Returns the number of columns.
81
+ #
82
+ # @return [Integer] Number of columns.
59
83
  def n_keys
60
84
  @table.n_columns
61
85
  end
62
86
  alias_method :n_cols, :n_keys
63
87
  alias_method :n_vars, :n_keys
64
88
 
89
+ # Returns the numbers of rows and columns.
90
+ #
91
+ # @return [Array]
92
+ # Number of rows and number of columns in an array.
93
+ # Same as [size, n_keys].
65
94
  def shape
66
95
  [size, n_keys]
67
96
  end
68
97
 
98
+ # Returns a Hash of key and Vector pairs in the columns.
99
+ #
100
+ # @return [Hash]
101
+ # key => Vector pairs for each column.
69
102
  def variables
70
103
  @variables || @variables = init_instance_vars(:variables)
71
104
  end
72
105
  alias_method :vars, :variables
73
106
 
107
+ # Returns an Array of keys.
108
+ #
109
+ # @return [Array]
110
+ # Keys in an Array.
74
111
  def keys
75
112
  @keys || @keys = init_instance_vars(:keys)
76
113
  end
77
114
  alias_method :column_names, :keys
78
115
  alias_method :var_names, :keys
79
116
 
117
+ # Returns true if self has a specified key in the argument.
118
+ #
119
+ # @param key [Symbol, String] Key to test.
120
+ # @return [Boolean]
121
+ # Returns true if self has key in Symbol.
80
122
  def key?(key)
81
- @keys.include?(key.to_sym)
123
+ keys.include?(key.to_sym)
82
124
  end
83
125
  alias_method :has_key?, :key?
84
126
 
127
+ # Returns index of specified key in the Array keys.
128
+ #
129
+ # @param key [Symbol, String] key to know.
130
+ # @return [Integer]
131
+ # Index of key in the Array keys.
85
132
  def key_index(key)
86
- @keys.find_index(key.to_sym)
133
+ keys.find_index(key.to_sym)
87
134
  end
88
135
  alias_method :find_index, :key_index
89
136
  alias_method :index, :key_index
90
137
 
138
+ # Returns abbreviated type names in an Array.
139
+ #
140
+ # @return [Array]
141
+ # Abbreviated Red Arrow data type names.
91
142
  def types
92
143
  @types || @types = @table.columns.map { |column| column.data.value_type.nick.to_sym }
93
144
  end
94
145
 
146
+ # Returns an Array of Classes of data type.
147
+ #
148
+ # @return [Array]
149
+ # An Array of Red Arrow data type Classes.
95
150
  def type_classes
96
151
  @data_types || @data_types = @table.columns.map { |column| column.data_type.class }
97
152
  end
98
153
 
154
+ # Returns Vectors in an Array.
155
+ #
156
+ # @return [Array]
157
+ # An Array of RedAmber::Vector s.
99
158
  def vectors
100
159
  @vectors || @vectors = init_instance_vars(:vectors)
101
160
  end
102
161
 
103
- def indices
104
- (0...size).to_a
162
+ # Returns row indices (start...(size+start)) in an Array.
163
+ #
164
+ # @param start [Object]
165
+ # Object which has a #succ method.
166
+ # @return [Array]
167
+ # An Array of indices of the row.
168
+ # @example
169
+ # (when self.size == 5)
170
+ # - indices #=> [0, 1, 2, 3, 4]
171
+ # - indices(1) #=> [1, 2, 3, 4, 5]
172
+ # - indices('a') #=> ['a', 'b', 'c', 'd', 'e']
173
+ def indices(start = 0)
174
+ (start..).take(size)
105
175
  end
106
176
  alias_method :indexes, :indices
107
177
 
@@ -128,6 +198,18 @@ module RedAmber
128
198
  variables.empty?
129
199
  end
130
200
 
201
+ def each_row
202
+ return enum_for(:each_row) unless block_given?
203
+
204
+ size.times do |i|
205
+ key_row_pairs =
206
+ vectors.each_with_object({}) do |v, h|
207
+ h[v.key] = v.data[i]
208
+ end
209
+ yield key_row_pairs
210
+ end
211
+ end
212
+
131
213
  def to_rover
132
214
  require 'rover'
133
215
  Rover::DataFrame.new(to_h)
@@ -144,8 +226,22 @@ module RedAmber
144
226
  end
145
227
  end
146
228
 
147
- def group(*group_keys)
148
- Group.new(self, group_keys)
229
+ def group(*group_keys, &block)
230
+ g = Group.new(self, group_keys)
231
+ g = g.summarize(&block) if block
232
+ g
233
+ end
234
+
235
+ def method_missing(name, *args, &block)
236
+ return v(name) if args.empty?
237
+
238
+ super
239
+ end
240
+
241
+ def respond_to_missing?(name, include_private)
242
+ return true if key?(name)
243
+
244
+ super
149
245
  end
150
246
 
151
247
  private
@@ -182,5 +278,23 @@ module RedAmber
182
278
  html = IRuby::HTML.table(converted.to_h, maxrows: 8, maxcols: 15)
183
279
  "#{self.class} <#{size} x #{n_keys} vector#{pl(n_keys)}> #{html}"
184
280
  end
281
+
282
+ def name_unnamed_keys
283
+ return unless @table[:'']
284
+
285
+ # We can't use #keys because it causes mismatch of @table and @keys
286
+ keys = @table.schema.fields.map { |f| f.name.to_sym }
287
+ unnamed = (:unnamed1..).find { |e| !keys.include?(e) }
288
+ fields =
289
+ @table.schema.fields.map do |field|
290
+ if field.name.empty?
291
+ Arrow::Field.new(unnamed, field.data_type)
292
+ else
293
+ field
294
+ end
295
+ end
296
+ schema = Arrow::Schema.new(fields)
297
+ @table = Arrow::Table.new(schema, @table.columns)
298
+ end
185
299
  end
186
300
  end
@@ -5,15 +5,36 @@ require 'stringio'
5
5
  module RedAmber
6
6
  # mix-ins for the class DataFrame
7
7
  module DataFrameDisplayable
8
- def to_s
8
+ INDEX_KEY = :index_key_for_format_table
9
+
10
+ def to_s(width: 80)
9
11
  return '' if empty?
10
12
 
11
- format_table(width: 80)
13
+ format_table(width: width)
12
14
  end
13
15
 
14
- # def describe() end
15
-
16
- # def summary() end
16
+ # Show a statistical summary as a new DataFrame.
17
+ # Make stats for numeric columns only.
18
+ # NaNs are ignored.
19
+ # Counts also show non-NaN counts.
20
+ #
21
+ # @return [DataFrame] a new dataframe.
22
+ def summary
23
+ num_keys = keys.select { |key| self[key].numeric? }
24
+
25
+ DataFrame.new(
26
+ variables: num_keys,
27
+ count: num_keys.map { |k| self[k].count },
28
+ mean: num_keys.map { |k| self[k].mean },
29
+ std: num_keys.map { |k| self[k].std },
30
+ min: num_keys.map { |k| self[k].min },
31
+ '25%': num_keys.map { |k| self[k].quantile(0.25) },
32
+ median: num_keys.map { |k| self[k].median },
33
+ '75%': num_keys.map { |k| self[k].quantile(0.75) },
34
+ max: num_keys.map { |k| self[k].max }
35
+ )
36
+ end
37
+ alias_method :describe, :summary
17
38
 
18
39
  def inspect
19
40
  if ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table') == 'TDR'
@@ -131,15 +152,11 @@ module RedAmber
131
152
  a
132
153
  end
133
154
 
134
- def format_table(width: 80)
135
- head = 5
136
- tail = 3
137
- n_digit = 1
138
-
155
+ def format_table(width: 80, head: 5, tail: 3, n_digit: 2)
139
156
  original = self
140
- indices = size > head + tail ? [*0...head, *(size - tail)...size] : [*0...size]
157
+ indices = size > head + tail ? [*0..head, *(size - tail)...size] : [*0...size]
141
158
  df = slice(indices).assign do
142
- assigner = { '': indices.map { |i| (i + 1).to_s } }
159
+ assigner = { INDEX_KEY => indices.map { |i| (i + 1).to_s } }
143
160
  vectors.each_with_object(assigner) do |v, a|
144
161
  a[v.key] = v.to_a.map do |e|
145
162
  if e.nil?
@@ -155,13 +172,13 @@ module RedAmber
155
172
  end
156
173
  end
157
174
 
158
- df = df.pick { [keys[-1], keys[0..-2]] }
159
- df = size > head + tail ? df[0, 0, 0...head, 0, -tail..-1] : df[0, 0, 0..-1]
175
+ df = df.pick { [INDEX_KEY, keys - [INDEX_KEY]] }
176
+ df = size > head + tail ? df[0, 0, 0..head, -tail..-1] : df[0, 0, 0..-1]
160
177
  df = df.assign do
161
178
  vectors.each_with_object({}) do |v, assigner|
162
- vec = v.replace(0, v.key.to_s)
163
- .replace(1, v.key == :'' ? '' : "<#{original[v.key].type}>")
164
- assigner[v.key] = size > head + tail ? vec.replace(head + 2, ':') : vec
179
+ vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
180
+ .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
181
+ assigner[v.key] = original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
165
182
  end
166
183
  end
167
184
 
@@ -197,7 +214,7 @@ module RedAmber
197
214
  end
198
215
 
199
216
  def format_for_column(vector, original, width)
200
- if vector.key != :'' && !original[vector.key].numeric?
217
+ if vector.key != INDEX_KEY && !original[vector.key].numeric?
201
218
  "%-#{width}s"
202
219
  else
203
220
  "%#{width}s"
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RedAmber
4
+ # mix-ins for the class DataFrame
5
+ module DataFrameReshaping
6
+ # Transpose a wide DataFrame.
7
+ #
8
+ # @param key [Symbol] key of the index column
9
+ # to transpose into keys.
10
+ # If it is not specified, keys[0] is used.
11
+ # @param name [Symbol] key name of transposed index column.
12
+ # If it is not specified, :N is used. If it already exists, :N1 or :N1.succ is used.
13
+ # @return [DataFrame] transposed DataFrame
14
+ def transpose(key: keys.first, name: :N)
15
+ raise DataFrameArgumentError, "Self does not include: #{key}" unless keys.include?(key)
16
+
17
+ # Find unused name
18
+ new_keys = self[key].to_a.map { |e| e.to_s.to_sym }
19
+ name = (:N1..).find { |k| !new_keys.include?(k) } if new_keys.include?(name)
20
+
21
+ hash = { name => (keys - [key]) }
22
+ i = keys.index(key)
23
+ each_row do |h|
24
+ k = h.values[i]
25
+ hash[k] = h.values - [k]
26
+ end
27
+ DataFrame.new(hash)
28
+ end
29
+
30
+ # Reshape wide DataFrame to a longer DataFrame.
31
+ #
32
+ # @param keep_keys [Array] keys to keep.
33
+ # @param name [Symbol, String] key of the column which comes **from key names**.
34
+ # @param value [Symbol, String] key of the column which comes **from values**.
35
+ # @return [DataFrame] long DataFrame.
36
+ def to_long(*keep_keys, name: :N, value: :V)
37
+ not_included = keep_keys - keys
38
+ raise DataFrameArgumentError, "Not have keys #{not_included}" unless not_included.empty?
39
+
40
+ name = name.to_sym
41
+ raise DataFrameArgumentError, "Invalid key: #{name}" if keep_keys.include?(name)
42
+
43
+ value = value.to_sym
44
+ raise DataFrameArgumentError, "Invalid key: #{value}" if keep_keys.include?(value)
45
+
46
+ hash = Hash.new { |h, k| h[k] = [] }
47
+ l = keys.size - keep_keys.size
48
+ each_row do |row|
49
+ row.each do |k, v|
50
+ if keep_keys.include?(k)
51
+ hash[k].concat([v] * l)
52
+ else
53
+ hash[name] << k
54
+ hash[value] << v
55
+ end
56
+ end
57
+ end
58
+ DataFrame.new(hash)
59
+ end
60
+
61
+ # Reshape long DataFrame to a wide DataFrame.
62
+ #
63
+ # @param name [Symbol, String] key of the column which will be expanded **to key names**.
64
+ # @param value [Symbol, String] key of the column which will be expanded **to values**.
65
+ # @return [DataFrame] wide DataFrame.
66
+ def to_wide(name: :N, value: :V)
67
+ name = name.to_sym
68
+ raise DataFrameArgumentError, "Invalid key: #{name}" unless keys.include?(name)
69
+
70
+ value = value.to_sym
71
+ raise DataFrameArgumentError, "Invalid key: #{value}" unless keys.include?(value)
72
+
73
+ hash = Hash.new { |h, k| h[k] = {} }
74
+ keep_keys = keys - [name, value]
75
+ each_row do |row|
76
+ keeps, converts = row.partition { |k, _| keep_keys.include?(k) }
77
+ h = converts.to_h
78
+ hash[keeps.to_h][h[name].to_s.to_sym] = h[value]
79
+ end
80
+ ks = hash.first[0].keys + hash.first[1].keys
81
+ vs = hash.map { |k, v| k.values + v.values }.transpose
82
+ DataFrame.new(ks.zip(vs))
83
+ end
84
+ end
85
+ end
@@ -3,8 +3,8 @@
3
3
  module RedAmber
4
4
  # mix-in for the class DataFrame
5
5
  module DataFrameSelectable
6
- # select variables: [symbol] or [string]
7
- # select observations: [array of index], [range]
6
+ # select columns: [symbol] or [string]
7
+ # select rows: [array of index], [range]
8
8
  def [](*args)
9
9
  args.flatten!
10
10
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
@@ -22,17 +22,17 @@ module RedAmber
22
22
  raise DataFrameArgumentError, "Invalid argument: #{args}"
23
23
  end
24
24
 
25
- # slice and select some observations to create sub DataFrame
25
+ # slice and select rows to create sub DataFrame
26
26
  def slice(*args, &block)
27
27
  slicer = args
28
28
  if block
29
29
  raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
30
30
 
31
- slicer = instance_eval(&block)
31
+ slicer = [instance_eval(&block)]
32
32
  end
33
- slicer = [slicer].flatten
33
+ slicer.flatten!
34
34
 
35
- raise DataFrameArgumentError, 'Empty dataframe' if empty?
35
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
36
36
  return remove_all_values if slicer.empty? || slicer[0].nil?
37
37
 
38
38
  vector = parse_to_vector(slicer)
@@ -46,15 +46,59 @@ module RedAmber
46
46
  raise DataFrameArgumentError, "Invalid argument #{slicer}"
47
47
  end
48
48
 
49
- # remove selected observations to create sub DataFrame
49
+ def slice_by(key, keep_key: false, &block)
50
+ raise DataFrameArgumentError, 'Self is an empty dataframe' if empty?
51
+ raise DataFrameArgumentError, 'No block given' unless block
52
+ raise DataFrameArgumentError, "#{key} is no a key of self" unless key?(key)
53
+ return self if key.nil?
54
+
55
+ slicer = instance_eval(&block)
56
+ return DataFrame.new unless slicer
57
+
58
+ if slicer.is_a?(Range)
59
+ from = slicer.begin
60
+ from =
61
+ if from.is_a?(String)
62
+ self[key].index(from)
63
+ elsif from.nil?
64
+ 0
65
+ elsif from < 0
66
+ size + from
67
+ else
68
+ from
69
+ end
70
+ to = slicer.end
71
+ to =
72
+ if to.is_a?(String)
73
+ self[key].index(to)
74
+ elsif to.nil?
75
+ size - 1
76
+ elsif to < 0
77
+ size + to
78
+ else
79
+ to
80
+ end
81
+ slicer = (from..to).to_a
82
+ else
83
+ slicer = slicer.map { |x| x.is_a?(String) ? self[key].index(x) : x }
84
+ end
85
+
86
+ if keep_key
87
+ take(slicer)
88
+ else
89
+ take(slicer).drop(key)
90
+ end
91
+ end
92
+
93
+ # remove selected rows to create remainder DataFrame
50
94
  def remove(*args, &block)
51
95
  remover = args
52
96
  if block
53
97
  raise DataFrameArgumentError, 'Must not specify both arguments and block.' unless args.empty?
54
98
 
55
- remover = instance_eval(&block)
99
+ remover = [instance_eval(&block)]
56
100
  end
57
- remover = [remover].flatten
101
+ remover.flatten!
58
102
 
59
103
  raise DataFrameArgumentError, 'Empty dataframe' if empty?
60
104
  return self if remover.empty? || remover[0].nil?