red_amber 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +39 -20
  3. data/.yardopts +2 -0
  4. data/CHANGELOG.md +113 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE +1 -1
  7. data/README.md +25 -26
  8. data/benchmark/basic.yml +2 -2
  9. data/benchmark/combine.yml +2 -2
  10. data/benchmark/dataframe.yml +2 -2
  11. data/benchmark/group.yml +2 -2
  12. data/benchmark/reshape.yml +2 -2
  13. data/benchmark/vector.yml +3 -0
  14. data/doc/DataFrame.md +32 -12
  15. data/doc/DataFrame_Comparison.md +65 -0
  16. data/doc/SubFrames.md +11 -0
  17. data/doc/Vector.md +207 -1
  18. data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
  19. data/lib/red_amber/data_frame.rb +429 -75
  20. data/lib/red_amber/data_frame_combinable.rb +516 -66
  21. data/lib/red_amber/data_frame_displayable.rb +244 -14
  22. data/lib/red_amber/data_frame_indexable.rb +121 -18
  23. data/lib/red_amber/data_frame_loadsave.rb +78 -10
  24. data/lib/red_amber/data_frame_reshaping.rb +184 -14
  25. data/lib/red_amber/data_frame_selectable.rb +622 -66
  26. data/lib/red_amber/data_frame_variable_operation.rb +446 -34
  27. data/lib/red_amber/group.rb +187 -22
  28. data/lib/red_amber/helper.rb +70 -10
  29. data/lib/red_amber/refinements.rb +12 -5
  30. data/lib/red_amber/subframes.rb +1066 -0
  31. data/lib/red_amber/vector.rb +385 -11
  32. data/lib/red_amber/vector_aggregation.rb +312 -0
  33. data/lib/red_amber/vector_binary_element_wise.rb +387 -0
  34. data/lib/red_amber/vector_selectable.rb +217 -12
  35. data/lib/red_amber/vector_unary_element_wise.rb +436 -0
  36. data/lib/red_amber/vector_updatable.rb +278 -34
  37. data/lib/red_amber/version.rb +2 -1
  38. data/lib/red_amber.rb +13 -1
  39. data/red_amber.gemspec +2 -2
  40. metadata +13 -8
  41. data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
  42. data/lib/red_amber/vector_functions.rb +0 -242
@@ -3,22 +3,67 @@
3
3
  require 'stringio'
4
4
 
5
5
  module RedAmber
6
- # mix-ins for the class DataFrame
6
+ # Mix-in for the class DataFrame
7
7
  module DataFrameDisplayable
8
+ # Used internally to display table.
8
9
  INDEX_KEY = :index_key_for_format_table
10
+ private_constant :INDEX_KEY
9
11
 
10
- def to_s(width: 80)
12
+ # rubocop:disable Layout/LineLength
13
+
14
+ # Show a preview of self as a string.
15
+ #
16
+ # @param width [Integer]
17
+ # maximum size of result.
18
+ # @param head [Integer]
19
+ # number of records to show from head.
20
+ # @param tail [Integer]
21
+ # number of records to show at tail.
22
+ # @return [String]
23
+ # string representation of self.
24
+ # @example Show penguins dataset
25
+ # puts penguins.to_s
26
+ #
27
+ # # =>
28
+ # species island bill_length_mm bill_depth_mm flipper_length_mm ... year
29
+ # <string> <string> <double> <double> <uint8> ... <uint16>
30
+ # 0 Adelie Torgersen 39.1 18.7 181 ... 2007
31
+ # 1 Adelie Torgersen 39.5 17.4 186 ... 2007
32
+ # 2 Adelie Torgersen 40.3 18.0 195 ... 2007
33
+ # 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
34
+ # 4 Adelie Torgersen 36.7 19.3 193 ... 2007
35
+ # : : : : : : ... :
36
+ # 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
37
+ # 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
38
+ # 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
39
+ #
40
+ def to_s(width: 80, head: 5, tail: 3)
11
41
  return '' if empty?
12
42
 
13
- format_table(width: width)
43
+ format_table(width: width, head: head, tail: tail)
14
44
  end
15
45
 
16
- # Show statistical summary by a new DatFrame.
17
- # Make stats for numeric columns only.
18
- # NaNs are ignored.
19
- # Counts also show non-NaN counts.
46
+ # Show statistical summary by a new DataFrame.
47
+ #
48
+ # This method will make stats only for numeric columns.
49
+ # - NaNs are ignored.
50
+ # - `count` shows non-NaN counts.
51
+ #
52
+ # @return [DataFrame]
53
+ # a new dataframe.
54
+ # @example Statistical summary of penguins dataset
55
+ # # needs more width to show all stats in this example
56
+ # puts penguins.summary.to_s(width: 82)
57
+ #
58
+ # # =>
59
+ # variables count mean std min 25% median 75% max
60
+ # <dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
61
+ # 0 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
62
+ # 1 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
63
+ # 2 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
64
+ # 3 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
65
+ # 4 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
20
66
  #
21
- # @return [DataFrame] a new dataframe.
22
67
  def summary
23
68
  num_keys = keys.select { |key| self[key].numeric? }
24
69
 
@@ -36,6 +81,42 @@ module RedAmber
36
81
  end
37
82
  alias_method :describe, :summary
38
83
 
84
+ # Show information of self.
85
+ #
86
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
87
+ # - If it is 'TDR', returns class, shape and transposed preview by 3 rows.
88
+ # - If it is 'MINIMUM', returns class and shape.
89
+ # - If it is 'TABLE' or otherwise, returns class, shape and Table preview.
90
+ # Default value of the ENV is 'Table'.
91
+ # @return [String]
92
+ # information of self.
93
+ # @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
94
+ # puts df.inspect
95
+ #
96
+ # # =>
97
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
98
+ # x y
99
+ # <uint8> <string>
100
+ # 0 1 A
101
+ # 1 2 B
102
+ # 2 3 C
103
+ #
104
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'TDR'
105
+ # puts df.inspect
106
+ #
107
+ # # =>
108
+ # #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
109
+ # Vectors : 1 numeric, 1 string
110
+ # # key type level data_preview
111
+ # 0 :x uint8 3 [1, 2, 3]
112
+ # 1 :y string 3 ["A", "B", "C"]
113
+ #
114
+ # @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
115
+ # puts df.inspect
116
+ #
117
+ # # =>
118
+ # RedAmber::DataFrame : 3 x 2 Vectors
119
+ #
39
120
  def inspect
40
121
  mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table')
41
122
  case mode.upcase
@@ -48,17 +129,148 @@ module RedAmber
48
129
  end
49
130
  end
50
131
 
51
- # - limit: max num of Vectors to show
52
- # - tally: max level to use tally mode
53
- # - elements: max element to show values in each vector
132
+ # Shows some information about self in a transposed style.
133
+ #
134
+ # @param limit [Integer, :all]
135
+ # maximum number of variables (columns) to show.
136
+ # Shows all variables (columns) if it is `:all`.
137
+ # @param tally [Integer]
138
+ # maximum level to use tally mode.
139
+ # Tally mode counts the occurrences of each element and shows as a hash
140
+ # with the elements as keys and the corresponding counts as values.
141
+ # @param elements [Integer]
142
+ # maximum number of elements to show values
143
+ # in each column.
144
+ # @return [nil]
145
+ # @example Default
146
+ # diamonds = diamonds.assign_left(:index) { indices }
147
+ # diamonds
148
+ #
149
+ # # =>
150
+ # #<RedAmber::DataFrame : 53940 x 11 Vectors, 0x000000000000c314>
151
+ # index carat cut color clarity depth table price ... z
152
+ # <uint16> <double> <string> <string> <string> <double> <double> <uint16> ... <double>
153
+ # 0 0 0.23 Ideal E SI2 61.5 55.0 326 ... 2.43
154
+ # 1 1 0.21 Premium E SI1 59.8 61.0 326 ... 2.31
155
+ # 2 2 0.23 Good E VS1 56.9 65.0 327 ... 2.31
156
+ # 3 3 0.29 Premium I VS2 62.4 58.0 334 ... 2.63
157
+ # 4 4 0.31 Good J SI2 63.3 58.0 335 ... 2.75
158
+ # : : : : : : : : : ... :
159
+ # 53937 53937 0.7 Very Good D SI1 62.8 60.0 2757 ... 3.56
160
+ # 53938 53938 0.86 Premium H SI2 61.0 58.0 2757 ... 3.74
161
+ # 53939 53939 0.75 Ideal D SI2 62.2 55.0 2757 ... 3.64
162
+ #
163
+ # diamonds.tdr
164
+ #
165
+ # # =>
166
+ # RedAmber::DataFrame : 53940 x 11 Vectors
167
+ # Vectors : 8 numeric, 3 strings
168
+ # # key type level data_preview
169
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
170
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
171
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
172
+ # 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
173
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
174
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
175
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
176
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
177
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
178
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
179
+ # ... 1 more Vector ...
180
+ #
181
+ # @example Show all variables
182
+ # diamonds.tdr(:all)
183
+ #
184
+ # # =>
185
+ # RedAmber::DataFrame : 53940 x 11 Vectors
186
+ # Vectors : 8 numeric, 3 strings
187
+ # # key type level data_preview
188
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
189
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
190
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
191
+ # 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
192
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
193
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
194
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
195
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
196
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
197
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
198
+ # 10 :z double 375 [2.43, 2.31, 2.31, 2.63, 2.75, ... ]
199
+ #
200
+ # @example Use tally mode up to 8 levels
201
+ # diamonds.tdr(tally: 8)
202
+ #
203
+ # # =>
204
+ # RedAmber::DataFrame : 53940 x 11 Vectors
205
+ # Vectors : 8 numeric, 3 strings
206
+ # # key type level data_preview
207
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
208
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
209
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
210
+ # 3 :color string 7 {"E"=>9797, "I"=>5422, "J"=>2808, "H"=>8304, "F"=>9542, "G"=>11292, "D"=>6775}
211
+ # 4 :clarity string 8 {"SI2"=>9194, "SI1"=>13065, "VS1"=>8171, "VS2"=>12258, "VVS2"=>5066, "VVS1"=>3655, "I1"=>741, "IF"=>1790}
212
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
213
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
214
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
215
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
216
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
217
+ # ... 1 more Vector ...
218
+ #
219
+ # @example Increase elements to show
220
+ # diamonds.tdr(elements: 10)
221
+ #
222
+ # # =>
223
+ # RedAmber::DataFrame : 53940 x 11 Vectors
224
+ # Vectors : 8 numeric, 3 strings
225
+ # # key type level data_preview
226
+ # 0 :index uint16 53940 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... ]
227
+ # 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, ... ]
228
+ # 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
229
+ # 3 :color string 7 ["E", "E", "E", "I", "J", "J", "I", "H", "E", "H", ... ]
230
+ # 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2", "VS1", ... ]
231
+ # 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, ... ]
232
+ # 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, 57.0, 57.0, 55.0, 61.0, 61.0, ... ]
233
+ # 7 :price uint16 11602 [326, 326, 327, 334, 335, 336, 336, 337, 337, 338, ... ]
234
+ # 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, 3.94, 3.95, 4.07, 3.87, 4.0, ... ]
235
+ # 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, ... ]
236
+ # ... 1 more Vector ...
237
+ #
54
238
  def tdr(limit = 10, tally: 5, elements: 5)
55
239
  puts tdr_str(limit, tally: tally, elements: elements)
56
240
  end
241
+ alias_method :glimpse, :tdr
57
242
 
243
+ # Shortcut for `tdr(:all)`.
244
+ #
245
+ # @return (see #tdr)
246
+ #
247
+ def tdra
248
+ puts tdr_str(:all)
249
+ end
250
+
251
+ # rubocop:enable Layout/LineLength
252
+
253
+ # Returns some information about self in a transposed style by a string.
254
+ #
255
+ # @param (see #tdr)
256
+ # @option (see #tdr)
257
+ # @return [String] TDR style string.
258
+ #
58
259
  def tdr_str(limit = 10, tally: 5, elements: 5)
59
260
  "#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
60
261
  end
61
262
 
263
+ # Returns html formatted text of self by IRuby::HTML.table.
264
+ #
265
+ # According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
266
+ # - If it is 'MINIMUM', returns shape by plain text.
267
+ # - If it is 'PLAIN', returns `#inspect` value by plain text.
268
+ # - If it is 'TDR', returns shape and transposed preview by plain text.
269
+ # - If it is 'TABLE' or otherwise, returns Table preview by html format.
270
+ # Default value of the ENV is 'TABLE'.
271
+ # @return [String]
272
+ # formatted string.
273
+ #
62
274
  def to_iruby
63
275
  require 'iruby'
64
276
  return ['text/plain', '(empty DataFrame)'] if empty?
@@ -76,14 +288,32 @@ module RedAmber
76
288
  end
77
289
  end
78
290
 
79
- private # =====
80
-
291
+ # Return class and shape of self by a String.
292
+ #
293
+ # @param with_id [true, false]
294
+ # show id if true.
295
+ # @return [String]
296
+ # shape string.
297
+ # @example Default (without id)
298
+ # penguins.shape_str
299
+ #
300
+ # # =>
301
+ # "RedAmber::DataFrame : 344 x 8 Vectors"
302
+ #
303
+ # @example With id
304
+ # penguins.shape_str(with_id: true)
305
+ #
306
+ # # =>
307
+ # "RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000003980"
308
+ #
81
309
  def shape_str(with_id: false)
82
310
  shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
83
311
  id = with_id ? format(', 0x%016x', object_id) : ''
84
312
  "#{self.class} : #{shape_info}#{id}"
85
313
  end
86
314
 
315
+ private # =====
316
+
87
317
  def dataframe_info(limit, tally_level: 5, max_element: 5)
88
318
  return '' if empty?
89
319
 
@@ -201,7 +431,7 @@ module RedAmber
201
431
  df = df.assign do
202
432
  vectors.each_with_object({}) do |v, assigner|
203
433
  vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
204
- .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
434
+ .replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
205
435
  assigner[v.key] =
206
436
  original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
207
437
  end
@@ -1,38 +1,141 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-ins for the class DataFrame
4
+ # Mix-ins for the class DataFrame
5
5
  module DataFrameIndexable
6
- # Common method
7
- def map_indices(*indices)
8
- return self if indices.empty?
9
-
10
- indices = indices[0].data if indices[0].is_a?(Vector)
11
-
12
- new_dataframe_by(indices)
6
+ # Returns row index Vector.
7
+ #
8
+ # @overload indices
9
+ # return @indices as row indices (0...size).
10
+ #
11
+ # @return [Vector]
12
+ # a Vector of row indices.
13
+ # @example When `dataframe.size == 5`;
14
+ # dataframe.indices
15
+ #
16
+ # # =>
17
+ # #<RedAmber::Vector(:uint8, size=5):0x000000000000fb54>
18
+ # [0, 1, 2, 3, 4]
19
+ #
20
+ # @overload indices(start)
21
+ # return customized index Vector `(start..).take(size)`.
22
+ #
23
+ # @param start [#succ]
24
+ # first element of the index; it must respond to `#succ`.
25
+ # @return [Vector]
26
+ # a Vector of row indices.
27
+ # @example When `dataframe.size == 5`;
28
+ # dataframe.indices(1)
29
+ #
30
+ # # =>
31
+ # #<RedAmber::Vector(:uint8, size=5):0x000000000000fba4>
32
+ # [1, 2, 3, 4, 5]
33
+ #
34
+ # dataframe.indices('a')
35
+ # # =>
36
+ # #<RedAmber::Vector(:string, size=5):0x000000000000fbb8>
37
+ # ["a", "b", "c", "d", "e"]
38
+ #
39
+ def indices(start = 0)
40
+ if start == 0 # rubocop:disable Style/NumericPredicate
41
+ @indices ||= Vector.new(0...size)
42
+ else
43
+ Vector.new((start..).take(size))
44
+ end
13
45
  end
46
+ alias_method :indexes, :indices
14
47
 
48
+ # Return sorted indexes of self by a Vector.
49
+ #
15
50
  # @param sort_keys [Arrow::SortKey]
16
51
  # :key, "key" or "+key" denotes ascending,
17
52
  # "-key" denotes descending order
18
- # @return [RedAmber::Vector] Sorted indices in Vector
53
+ # @return [RedAmber::Vector]
54
+ # sorted indices in Vector
55
+ # @example
56
+ # df
57
+ #
58
+ # # =>
59
+ # x y
60
+ # <uint8> <string>
61
+ # 0 3 B
62
+ # 1 5 A
63
+ # 2 1 B
64
+ # 3 4 A
65
+ # 4 2 C
66
+ #
67
+ # df.sort_indices('x')
68
+ #
69
+ # # =>
70
+ # #<RedAmber::Vector(:uint64, size=5):0x0000000000003854>
71
+ # [2, 4, 0, 3, 1]
72
+ #
19
73
  def sort_indices(*sort_keys)
20
74
  indices = @table.sort_indices(sort_keys.flatten)
21
75
  Vector.create(indices)
22
76
  end
23
77
 
24
- # @return [RedAmber::DataFrame] Sorted DataFrame
78
+ # Sort the contents of self.
79
+ #
80
+ # @param sort_keys [Arrow::SortKey]
81
+ # :key, "key" or "+key" denotes ascending,
82
+ # "-key" denotes descending order
83
+ # @return [RedAmber::DataFrame]
84
+ # sorted DataFrame
85
+ # @example Sort by a key
86
+ # df
87
+ #
88
+ # # =>
89
+ # x y
90
+ # <uint8> <string>
91
+ # 0 3 B
92
+ # 1 5 A
93
+ # 2 1 B
94
+ # 3 4 A
95
+ # 4 2 C
96
+ #
97
+ # df.sort('y')
98
+ #
99
+ # # =>
100
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000382c>
101
+ # x y
102
+ # <uint8> <string>
103
+ # 0 5 A
104
+ # 1 4 A
105
+ # 2 3 B
106
+ # 3 1 B
107
+ # 4 2 C
108
+ #
109
+ # @example Sort by two keys
110
+ # df.sort('y', 'x')
111
+ #
112
+ # # =>
113
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003890>
114
+ # x y
115
+ # <uint8> <string>
116
+ # 0 4 A
117
+ # 1 5 A
118
+ # 2 1 B
119
+ # 3 3 B
120
+ # 4 2 C
121
+ #
122
+ # @example Sort in descending order
123
+ # df.sort('-x')
124
+ #
125
+ # # =>
126
+ # #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003840>
127
+ # x y
128
+ # <uint8> <string>
129
+ # 0 5 A
130
+ # 1 4 A
131
+ # 2 3 B
132
+ # 3 2 C
133
+ # 4 1 B
134
+ #
25
135
  def sort(*sort_keys)
26
136
  indices = @table.sort_indices(sort_keys.flatten)
27
137
 
28
- new_dataframe_by(indices)
29
- end
30
-
31
- private
32
-
33
- def new_dataframe_by(index_array)
34
- t = Arrow::Function.find(:take).execute([@table, index_array]).value
35
- DataFrame.create(t)
138
+ take(indices)
36
139
  end
37
140
  end
38
141
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RedAmber
4
- # mix-ins for the class DataFrame
4
+ # Mix-in for the class DataFrame
5
5
  module DataFrameLoadSave
6
6
  # Enable `self.load` as class method of DataFrame
7
7
  def self.included(klass)
@@ -10,30 +10,98 @@ module RedAmber
10
10
 
11
11
  # Enable `self.load` as class method of DataFrame
12
12
  module ClassMethods
13
- # Load DataFrame via Arrow::Table.load
14
- def load(path, options = {})
15
- DataFrame.new(Arrow::Table.load(path, options))
13
+ # Load DataFrame via Arrow::Table.load.
14
+ #
15
+ # Format is automatically detected by extension.
16
+ # @!method load(input, format: nil, compression: nil, schema: nil, skip_lines: nil)
17
+ # @param input [path]
18
+ # source path.
19
+ # @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
20
+ # format specifier.
21
+ # @param compression [:gzip, nil]
22
+ # compression type.
23
+ # @param schema [Arrow::Schema]
24
+ # schema of table.
25
+ # @param skip_lines [Regexp]
26
+ # pattern of rows to skip.
27
+ # @return [DataFrame]
28
+ # loaded DataFrame.
29
+ # @example Load a tsv file
30
+ # DataFrame.load("file.tsv")
31
+ #
32
+ # @example Load a csv.gz file
33
+ # DataFrame.load("file.csv.gz")
34
+ #
35
+ # @example Load from URI
36
+ # DataFrame.load(URI("https://some_uri/file.csv"))
37
+ #
38
+ # @example Load from a Buffer
39
+ # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv)
40
+ # name,age
41
+ # Yasuko,68
42
+ # Rui,49
43
+ # Hinata,28
44
+ # BUFFER
45
+ #
46
+ # @example Load from a Buffer skipping comment line
47
+ # DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /^#/)
48
+ # # comment
49
+ # name,age
50
+ # Yasuko,68
51
+ # Rui,49
52
+ # Hinata,28
53
+ # BUFFER
54
+ #
55
+ def load(input, **options)
56
+ DataFrame.new(Arrow::Table.load(input, options))
16
57
  end
17
58
  end
18
59
 
19
60
  # Save DataFrame
20
61
  #
21
- # @return [DataFrame] self.
22
- def save(output, options = {})
62
+ # Format is automatically detected by extension.
63
+ # @!method save(output, format: nil, compression: nil, schema: nil, skip_lines: nil)
64
+ # @param output [path]
65
+ # output path.
66
+ # @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
67
+ # format specifier.
68
+ # @param compression [:gzip, nil]
69
+ # compression type.
70
+ # @param schema [Arrow::Schema]
71
+ # schema of table.
72
+ # @param skip_lines [Regexp]
73
+ # pattern of rows to skip.
74
+ # @return [DataFrame]
75
+ # self.
76
+ # @example Save a csv file
77
+ # dataframe.save("file.csv")
78
+ #
79
+ # @example Save a csv.gz file
80
+ # dataframe.save("file.csv.gz")
81
+ #
82
+ # @example Save an arrow file
83
+ # dataframe.save("file.arrow")
84
+ #
85
+ def save(output, **options)
23
86
  @table.save(output, options)
24
87
  self
25
88
  end
26
89
 
27
90
  # Save and reload to cast automatically
28
- # Via tsv format file temporally as default
91
+ # via tsv format file temporarily as default.
92
+ #
93
+ # @param format [Symbol]
94
+ # format specifier.
95
+ # @return [DataFrame]
96
+ # reloaded DataFrame.
29
97
  #
30
98
  # @note experimental feature
31
99
  def auto_cast(format: :tsv)
32
100
  return self if empty?
33
101
 
34
- tempfile = Arrow::ResizableBuffer.new(1024)
35
- save(tempfile, format: format)
36
- DataFrame.load(tempfile, format: format)
102
+ buffer = Arrow::ResizableBuffer.new(1024)
103
+ save(buffer, format: format)
104
+ DataFrame.load(buffer, format: format)
37
105
  end
38
106
  end
39
107
  end