red_amber 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -3,22 +3,67 @@
3
3
  require 'stringio'
4
4
 
5
5
  module RedAmber
6
- # mix-ins for the class DataFrame
6
+ # Mix-in for the class DataFrame
7
7
  module DataFrameDisplayable
8
+ # Used internally to display table.
8
9
  INDEX_KEY = :index_key_for_format_table
10
+ private_constant :INDEX_KEY
9
11
 
10
- def to_s(width: 80)
12
+ # rubocop:disable Layout/LineLength
13
+
14
+ # Show a preview of self as a string.
15
+ #
16
+ # @param width [Integer]
17
+ # maximum size of result.
18
+ # @param head [Integer]
19
+ # number of records to show from head.
20
+ # @param tail [Integer]
21
+ # number of records to show at tail.
22
+ # @return [String]
23
+ # string representation of self.
24
+ # @example Show penguins dataset
25
+ # puts penguins.to_s
26
+ #
27
+ # # =>
28
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
29
+ # <string> <string> <double> <double> <uint8> ... <uint16>
30
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
31
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
32
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
33
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
34
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
35
+ # : : : : : : ... :
36
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
37
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
38
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
39
+ #
40
+ def to_s(width: 80, head: 5, tail: 3)
11
41
  return '' if empty?
12
42
 
13
- format_table(width: width)
43
+ format_table(width: width, head: head, tail: tail)
14
44
  end
15
45
 
16
- # Show statistical summary by a new DatFrame.
17
- # Make stats for numeric columns only.
18
- # NaNs are ignored.
19
- # Counts also show non-NaN counts.
46
+ # Show statistical summary by a new DataFrame.
47
+ #
48
+ # This method will make stats only for numeric columns.
49
+ # - NaNs are ignored.
50
+ # - `count` shows non-NaN counts.
51
+ #
52
+ # @return [DataFrame]
53
+ # a new dataframe.
54
+ # @example Statistical summary of penguins dataset
55
+ # # needs more width to show all stats in this example
56
+ # puts penguins.summary.to_s(width: 82)
57
+ #
58
+ # # =>
59
+ # variables count mean std min 25% median 75% max
60
+ # <dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
61
+ # 0 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
62
+ # 1 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
63
+ # 2 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
64
+ # 3 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
65
+ # 4 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
20
66
  #
21
- # @return [DataFrame] a new dataframe.
22
67
  def summary
23
68
  num_keys = keys.select { |key| self[key].numeric? }
24
69
 
@@ -36,6 +81,42 @@ module RedAmber
36
81
  end
37
82
  alias_method :describe, :summary
38
83
 
84
+ # Show information of self.
85
+ #
86
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
87
+ # - If it is 'TDR', returns class, shape and transposed preview by 3 rows.
88
+ # - If it is 'MINIMUM', returns class and shape.
89
+ # - If it is 'TABLE' or otherwise, returns class, shape and Table preview.
90
+ # Default value of the ENV is 'Table'.
91
+ # @return [String]
92
+ # information of self.
93
+ # @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
94
+ # puts df.inspect
95
+ #
96
+ # # =>
97
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
98
+ # x y
99
+ # <uint8> <string>
100
+ # 0 1 A
101
+ # 1 2 B
102
+ # 2 3 C
103
+ #
104
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'TDR'
105
+ # puts df.inspect
106
+ #
107
+ # # =>
108
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
109
+ # Vectors : 1 numeric, 1 string
110
+ # # key type level data_preview
111
+ # 0 :x uint8 3 [1, 2, 3]
112
+ # 1 :y string 3 ["A", "B", "C"]
113
+ #
114
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
115
+ # puts df.inspect
116
+ #
117
+ # # =>
118
+ # RedAmber::DataFrame : 3 x 2 Vectors
119
+ #
39
120
  def inspect
40
121
  mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table')
41
122
  case mode.upcase
@@ -48,17 +129,148 @@ module RedAmber
48
129
  end
49
130
  end
50
131
 
51
- # - limit: max num of Vectors to show
52
- # - tally: max level to use tally mode
53
- # - elements: max element to show values in each vector
132
+ # Shows some information about self in a transposed style.
133
+ #
134
+ # @param limit [Integer, :all]
135
+ # maximum number of variables (columns) to show.
136
+ # Shows all variables (columns) if it is `:all`.
137
+ # @param tally [Integer]
138
+ # maximum level to use tally mode.
139
+ # Tally mode counts the occurrences of each element and shows as a hash
140
+ # with the elements as keys and the corresponding counts as values.
141
+ # @param elements [Integer]
142
+ # maximum number of elements to show values
143
+ # in each column.
144
+ # @return [nil]
145
+ # @example Default
146
+ # diamonds = diamonds.assign_left(:index) { indices }
147
+ # diamonds
148
+ #
149
+ # # =>
150
+ # #<RedAmber::DataFrame : 53940 x 11 Vectors, 0x000000000000c314>
151
+ # index carat cut color clarity depth table price ... z
152
+ # <uint16> <double> <string> <string> <string> <double> <double> <uint16> ... <double>
153
+ # 0 0 0.23 Ideal E SI2 61.5 55.0 326 ... 2.43
154
+ # 1 1 0.21 Premium E SI1 59.8 61.0 326 ... 2.31
155
+ # 2 2 0.23 Good E VS1 56.9 65.0 327 ... 2.31
156
+ # 3 3 0.29 Premium I VS2 62.4 58.0 334 ... 2.63
157
+ # 4 4 0.31 Good J SI2 63.3 58.0 335 ... 2.75
158
+ # : : : : : : : : : ... :
159
+ # 53937 53937 0.7 Very Good D SI1 62.8 60.0 2757 ... 3.56
160
+ # 53938 53938 0.86 Premium H SI2 61.0 58.0 2757 ... 3.74
161
+ # 53939 53939 0.75 Ideal D SI2 62.2 55.0 2757 ... 3.64
162
+ #
163
+ # diamonds.tdr
164
+ #
165
+ # # =>
166
+ # RedAmber::DataFrame : 53940 x 11 Vectors
167
+ # Vectors : 8 numeric, 3 strings
168
+ # # key type level data_preview
169
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
170
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
171
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
172
+ # 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
173
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
174
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
175
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
176
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
177
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
178
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
179
+ # ... 1 more Vector ...
180
+ #
181
+ # @example Show all variables
182
+ # diamonds.tdr(:all)
183
+ #
184
+ # # =>
185
+ # RedAmber::DataFrame : 53940 x 11 Vectors
186
+ # Vectors : 8 numeric, 3 strings
187
+ # # key type level data_preview
188
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
189
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
190
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
191
+ # 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
192
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
193
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
194
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
195
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
196
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
197
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
198
+ # 10 :z double 375 [2.43, 2.31, 2.31, 2.63, 2.75, ... ]
199
+ #
200
+ # @example Use tally mode up to 8 levels
201
+ # diamonds.tdr(tally: 8)
202
+ #
203
+ # # =>
204
+ # RedAmber::DataFrame : 53940 x 11 Vectors
205
+ # Vectors : 8 numeric, 3 strings
206
+ # # key type level data_preview
207
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
208
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
209
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
210
+ # 3 :color string 7 {"E"=>9797, "I"=>5422, "J"=>2808, "H"=>8304, "F"=>9542, "G"=>11292, "D"=>6775}
211
+ # 4 :clarity string 8 {"SI2"=>9194, "SI1"=>13065, "VS1"=>8171, "VS2"=>12258, "VVS2"=>5066, "VVS1"=>3655, "I1"=>741, "IF"=>1790}
212
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
213
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
214
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
215
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
216
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
217
+ # ... 1 more Vector ...
218
+ #
219
+ # @example Increase elements to show
220
+ # diamonds.tdr(elements: 10)
221
+ #
222
+ # # =>
223
+ # RedAmber::DataFrame : 53940 x 11 Vectors
224
+ # Vectors : 8 numeric, 3 strings
225
+ # # key type level data_preview
226
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... ]
227
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, ... ]
228
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
229
+ # 3 :color string 7 ["E", "E", "E", "I", "J", "J", "I", "H", "E", "H", ... ]
230
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2", "VS1", ... ]
231
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, ... ]
232
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, 57.0, 57.0, 55.0, 61.0, 61.0, ... ]
233
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, 336, 336, 337, 337, 338, ... ]
234
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, 3.94, 3.95, 4.07, 3.87, 4.0, ... ]
235
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, ... ]
236
+ # ... 1 more Vector ...
237
+ #
54
238
  def tdr(limit = 10, tally: 5, elements: 5)
55
239
  puts tdr_str(limit, tally: tally, elements: elements)
56
240
  end
241
+ alias_method :glimpse, :tdr
57
242
 
243
+ # Shortcut for `tdr(:all)`.
244
+ #
245
+ # @return (see #tdr)
246
+ #
247
+ def tdra
248
+ puts tdr_str(:all)
249
+ end
250
+
251
+ # rubocop:enable Layout/LineLength
252
+
253
+ # Returns some information about self in a transposed style by a string.
254
+ #
255
+ # @param (see #tdr)
256
+ # @option (see #tdr)
257
+ # @return [String] TDR style string.
258
+ #
58
259
  def tdr_str(limit = 10, tally: 5, elements: 5)
59
260
  "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
60
261
  end
61
262
 
263
+ # Returns html formatted text of self by IRuby::HTML.table.
264
+ #
265
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
266
+ # - If it is 'MINIMUM', returns shape by plain text.
267
+ # - If it is 'PLAIN', returns `#inspect` value by plain text.
268
+ # - If it is 'TDR', returns shape and transposed preview by plain text.
269
+ # - If it is 'TABLE' or otherwise, returns Table preview by html format.
270
+ # Default value of the ENV is 'TABLE'.
271
+ # @return [String]
272
+ # formatted string.
273
+ #
62
274
  def to_iruby
63
275
  require 'iruby'
64
276
  return ['text/plain', '(empty DataFrame)'] if empty?
@@ -76,14 +288,32 @@ module RedAmber
76
288
  end
77
289
  end
78
290
 
79
- private # =====
80
-
291
+ # Return class and shape of self by a String.
292
+ #
293
+ # @param with_id [true, false]
294
+ # show id if true.
295
+ # @return [String]
296
+ # shape string.
297
+ # @example Default (without id)
298
+ # penguins.shape_str
299
+ #
300
+ # # =>
301
+ # "RedAmber::DataFrame : 344 x 8 Vectors"
302
+ #
303
+ # @example With id
304
+ # penguins.shape_str(with_id: true)
305
+ #
306
+ # # =>
307
+ # "RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000003980"
308
+ #
81
309
  def shape_str(with_id: false)
82
310
  shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
83
311
  id = with_id ? format(', 0x%016x', object_id) : ''
84
312
  "#{self.class} : #{shape_info}#{id}"
85
313
  end
86
314
 
315
+ private # =====
316
+
87
317
  def dataframe_info(limit, tally_level: 5, max_element: 5)
88
318
  return '' if empty?
89
319
 
@@ -201,7 +431,7 @@ module RedAmber
201
431
  df = df.assign do
202
432
  vectors.each_with_object({}) do |v, assigner|
203
433
  vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
204
- .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
434
+ .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
205
435
  assigner[v.key] =
206
436
  original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
207
437
  end
@@ -1,38 +1,141 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-ins for the class DataFrame
4
+ # Mix-ins for the class DataFrame
5
5
  module DataFrameIndexable
6
- # Common method
7
- def map_indices(*indices)
8
- return self if indices.empty?
9
-
10
- indices = indices[0].data if indices[0].is_a?(Vector)
11
-
12
- new_dataframe_by(indices)
6
+ # Returns row index Vector.
7
+ #
8
+ # @overload indices
9
+ # return @indices as row indices (0...size).
10
+ #
11
+ # @return [Vector]
12
+ # a Vector of row indices.
13
+ # @example When `dataframe.size == 5`;
14
+ # dataframe.indices
15
+ #
16
+ # # =>
17
+ # #<RedAmber::Vector(:uint8, size=5):0x000000000000fb54>
18
+ # [0, 1, 2, 3, 4]
19
+ #
20
+ # @overload indices(start)
21
+ # return customized index Vector `(start..).take(size)`.
22
+ #
23
+ # @param start [#succ]
24
+ # start element, which must respond to `#succ`.
25
+ # @return [Vector]
26
+ # a Vector of row indices.
27
+ # @example When `dataframe.size == 5`;
28
+ # dataframe.indices(1)
29
+ #
30
+ # # =>
31
+ # #<RedAmber::Vector(:uint8, size=5):0x000000000000fba4>
32
+ # [1, 2, 3, 4, 5]
33
+ #
34
+ # dataframe.indices('a')
35
+ # # =>
36
+ # #<RedAmber::Vector(:string, size=5):0x000000000000fbb8>
37
+ # ["a", "b", "c", "d", "e"]
38
+ #
39
+ def indices(start = 0)
40
+ if start == 0 # rubocop:disable Style/NumericPredicate
41
+ @indices ||= Vector.new(0...size)
42
+ else
43
+ Vector.new((start..).take(size))
44
+ end
13
45
  end
46
+ alias_method :indexes, :indices
14
47
 
48
+ # Return sorted indexes of self by a Vector.
49
+ #
15
50
  # @param sort_keys [Arrow::SortKey]
16
51
  # :key, "key" or "+key" denotes ascending,
17
52
  # "-key" denotes descending order
18
- # @return [RedAmber::Vector] Sorted indices in Vector
53
+ # @return [RedAmber::Vector]
54
+ # sorted indices in Vector
55
+ # @example
56
+ # df
57
+ #
58
+ # # =>
59
+ # x y
60
+ # <uint8> <string>
61
+ # 0 3 B
62
+ # 1 5 A
63
+ # 2 1 B
64
+ # 3 4 A
65
+ # 4 2 C
66
+ #
67
+ # df.sort_indices('x')
68
+ #
69
+ # # =>
70
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003854>
71
+ # [2, 4, 0, 3, 1]
72
+ #
19
73
  def sort_indices(*sort_keys)
20
74
  indices = @table.sort_indices(sort_keys.flatten)
21
75
  Vector.create(indices)
22
76
  end
23
77
 
24
- # @return [RedAmber::DataFrame] Sorted DataFrame
78
+ # Sort the contents of self.
79
+ #
80
+ # @param sort_keys [Arrow::SortKey]
81
+ # :key, "key" or "+key" denotes ascending,
82
+ # "-key" denotes descending order
83
+ # @return [RedAmber::DataFrame]
84
+ # sorted DataFrame
85
+ # @example Sort by a key
86
+ # df
87
+ #
88
+ # # =>
89
+ # x y
90
+ # <uint8> <string>
91
+ # 0 3 B
92
+ # 1 5 A
93
+ # 2 1 B
94
+ # 3 4 A
95
+ # 4 2 C
96
+ #
97
+ # df.sort('y')
98
+ #
99
+ # # =>
100
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000382c>
101
+ # x y
102
+ # <uint8> <string>
103
+ # 0 5 A
104
+ # 1 4 A
105
+ # 2 3 B
106
+ # 3 1 B
107
+ # 4 2 C
108
+ #
109
+ # @example Sort by two keys
110
+ # df.sort('y', 'x')
111
+ #
112
+ # # =>
113
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003890>
114
+ # x y
115
+ # <uint8> <string>
116
+ # 0 4 A
117
+ # 1 5 A
118
+ # 2 1 B
119
+ # 3 3 B
120
+ # 4 2 C
121
+ #
122
+ # @example Sort in descending order
123
+ # df.sort('-x')
124
+ #
125
+ # # =>
126
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003840>
127
+ # x y
128
+ # <uint8> <string>
129
+ # 0 5 A
130
+ # 1 4 A
131
+ # 2 3 B
132
+ # 3 2 C
133
+ # 4 1 B
134
+ #
25
135
  def sort(*sort_keys)
26
136
  indices = @table.sort_indices(sort_keys.flatten)
27
137
 
28
- new_dataframe_by(indices)
29
- end
30
-
31
- private
32
-
33
- def new_dataframe_by(index_array)
34
- t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
- DataFrame.create(t)
138
+ take(indices)
36
139
  end
37
140
  end
38
141
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-ins for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameLoadSave
6
6
  # Enable `self.load` as class method of DataFrame
7
7
  def self.included(klass)
@@ -10,30 +10,98 @@ module RedAmber
10
10
 
11
11
  # Enable `self.load` as class method of DataFrame
12
12
  module ClassMethods
13
- # Load DataFrame via Arrow::Table.load
14
- def load(path, options = {})
15
- DataFrame.new(Arrow::Table.load(path, options))
13
+ # Load DataFrame via Arrow::Table.load.
14
+ #
15
+ # Format is automatically detected by extension.
16
+ # @!method load(input, format: nil, compression: nil, schema: nil, skip_lines: nil)
17
+ # @param input [path]
18
+ # source path.
19
+ # @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
20
+ # format specifier.
21
+ # @param compression [:gzip, nil]
22
+ # compression type.
23
+ # @param schema [Arrow::Schema]
24
+ # schema of table.
25
+ # @param skip_lines [Regexp]
26
+ # pattern of rows to skip.
27
+ # @return [DataFrame]
28
+ # loaded DataFrame.
29
+ # @example Load a tsv file
30
+ # DataFrame.load("file.tsv")
31
+ #
32
+ # @example Load a csv.gz file
33
+ # DataFrame.load("file.csv.gz")
34
+ #
35
+ # @example Load from URI
36
+ # DataFrame.load(URI("https://some_uri/file.csv"))
37
+ #
38
+ # @example Load from a Buffer
39
+ # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv)
40
+ # name,age
41
+ # Yasuko,68
42
+ # Rui,49
43
+ # Hinata,28
44
+ # BUFFER
45
+ #
46
+ # @example Load from a Buffer skipping comment line
47
+ # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /^#/)
48
+ # # comment
49
+ # name,age
50
+ # Yasuko,68
51
+ # Rui,49
52
+ # Hinata,28
53
+ # BUFFER
54
+ #
55
+ def load(input, **options)
56
+ DataFrame.new(Arrow::Table.load(input, options))
16
57
  end
17
58
  end
18
59
 
19
60
  # Save DataFrame
20
61
  #
21
- # @return [DataFrame] self.
22
- def save(output, options = {})
62
+ # Format is automatically detected by extension.
63
+ # @!method save(output, format: nil, compression: nil, schema: nil, skip_lines: nil)
64
+ # @param output [path]
65
+ # output path.
66
+ # @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
67
+ # format specifier.
68
+ # @param compression [:gzip, nil]
69
+ # compression type.
70
+ # @param schema [Arrow::Schema]
71
+ # schema of table.
72
+ # @param skip_lines [Regexp]
73
+ # pattern of rows to skip.
74
+ # @return [DataFrame]
75
+ # self.
76
+ # @example Save a csv file
77
+ # dataframe.save("file.csv")
78
+ #
79
+ # @example Save a csv.gz file
80
+ # dataframe.save("file.csv.gz")
81
+ #
82
+ # @example Save an arrow file
83
+ # dataframe.save("file.arrow")
84
+ #
85
+ def save(output, **options)
23
86
  @table.save(output, options)
24
87
  self
25
88
  end
26
89
 
27
90
  # Save and reload to cast automatically
28
- # Via tsv format file temporally as default
91
+ # via tsv format file temporarily as default.
92
+ #
93
+ # @param format [Symbol]
94
+ # format specifier.
95
+ # @return [DataFrame]
96
+ # reloaded DataFrame.
29
97
  #
30
98
  # @note experimental feature
31
99
  def auto_cast(format: :tsv)
32
100
  return self if empty?
33
101
 
34
- tempfile = Arrow::ResizableBuffer.new(1024)
35
- save(tempfile, format: format)
36
- DataFrame.load(tempfile, format: format)
102
+ buffer = Arrow::ResizableBuffer.new(1024)
103
+ save(buffer, format: format)
104
+ DataFrame.load(buffer, format: format)
37
105
  end
38
106
  end
39
107
  end