red_amber 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +39 -20
- data/.yardopts +2 -0
- data/CHANGELOG.md +113 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +25 -26
- data/benchmark/basic.yml +2 -2
- data/benchmark/combine.yml +2 -2
- data/benchmark/dataframe.yml +2 -2
- data/benchmark/group.yml +2 -2
- data/benchmark/reshape.yml +2 -2
- data/benchmark/vector.yml +3 -0
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +429 -75
- data/lib/red_amber/data_frame_combinable.rb +516 -66
- data/lib/red_amber/data_frame_displayable.rb +244 -14
- data/lib/red_amber/data_frame_indexable.rb +121 -18
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +622 -66
- data/lib/red_amber/data_frame_variable_operation.rb +446 -34
- data/lib/red_amber/group.rb +187 -22
- data/lib/red_amber/helper.rb +70 -10
- data/lib/red_amber/refinements.rb +12 -5
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +385 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +217 -12
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -3,22 +3,67 @@
|
|
3
3
|
require 'stringio'
|
4
4
|
|
5
5
|
module RedAmber
|
6
|
-
#
|
6
|
+
# Mix-in for the class DataFrame
|
7
7
|
module DataFrameDisplayable
|
8
|
+
# Used internally to display table.
|
8
9
|
INDEX_KEY = :index_key_for_format_table
|
10
|
+
private_constant :INDEX_KEY
|
9
11
|
|
10
|
-
|
12
|
+
# rubocop:disable Layout/LineLength
|
13
|
+
|
14
|
+
# Show a preview of self as a string.
|
15
|
+
#
|
16
|
+
# @param width [Integer]
|
17
|
+
# maximum size of result.
|
18
|
+
# @param head [Integer]
|
19
|
+
# number of records to show from head.
|
20
|
+
# @param tail [Integer]
|
21
|
+
# number of records to show at tail.
|
22
|
+
# @return [String]
|
23
|
+
# string representation of self.
|
24
|
+
# @example Show penguins dataset
|
25
|
+
# puts penguins.to_s
|
26
|
+
#
|
27
|
+
# # =>
|
28
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
29
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
30
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
31
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
32
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
33
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
34
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
35
|
+
# : : : : : : ... :
|
36
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
37
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
38
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
39
|
+
#
|
40
|
+
def to_s(width: 80, head: 5, tail: 3)
|
11
41
|
return '' if empty?
|
12
42
|
|
13
|
-
format_table(width: width)
|
43
|
+
format_table(width: width, head: head, tail: tail)
|
14
44
|
end
|
15
45
|
|
16
|
-
# Show statistical summary by a new
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
46
|
+
# Show statistical summary by a new DataFrame.
|
47
|
+
#
|
48
|
+
# This method will make stats only for numeric columns.
|
49
|
+
# - NaNs are ignored.
|
50
|
+
# - `count` shows non-NaN counts.
|
51
|
+
#
|
52
|
+
# @return [DataFrame]
|
53
|
+
# a new dataframe.
|
54
|
+
# @example Statistical summary of penguins dataset
|
55
|
+
# # needs more width to show all stats in this example
|
56
|
+
# puts penguins.summary.to_s(width: 82)
|
57
|
+
#
|
58
|
+
# # =>
|
59
|
+
# variables count mean std min 25% median 75% max
|
60
|
+
# <dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
|
61
|
+
# 0 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
|
62
|
+
# 1 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
|
63
|
+
# 2 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
|
64
|
+
# 3 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
|
65
|
+
# 4 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
|
20
66
|
#
|
21
|
-
# @return [DataFrame] a new dataframe.
|
22
67
|
def summary
|
23
68
|
num_keys = keys.select { |key| self[key].numeric? }
|
24
69
|
|
@@ -36,6 +81,42 @@ module RedAmber
|
|
36
81
|
end
|
37
82
|
alias_method :describe, :summary
|
38
83
|
|
84
|
+
# Show information of self.
|
85
|
+
#
|
86
|
+
# According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
|
87
|
+
# - If it is 'TDR', returns class, shape and transposed preview by 3 rows.
|
88
|
+
# - If it is 'MINIMUM', returns class and shape.
|
89
|
+
# - If it is 'TABLE' or otherwise, returns class, shape and Table preview.
|
90
|
+
# Default value of the ENV is 'Table'.
|
91
|
+
# @return [String]
|
92
|
+
# information of self.
|
93
|
+
# @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
|
94
|
+
# puts df.inspect
|
95
|
+
#
|
96
|
+
# # =>
|
97
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
|
98
|
+
# x y
|
99
|
+
# <uint8> <string>
|
100
|
+
# 0 1 A
|
101
|
+
# 1 2 B
|
102
|
+
# 2 3 C
|
103
|
+
#
|
104
|
+
# @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'TDR'
|
105
|
+
# puts df.inspect
|
106
|
+
#
|
107
|
+
# # =>
|
108
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
|
109
|
+
# Vectors : 1 numeric, 1 string
|
110
|
+
# # key type level data_preview
|
111
|
+
# 0 :x uint8 3 [1, 2, 3]
|
112
|
+
# 1 :y string 3 ["A", "B", "C"]
|
113
|
+
#
|
114
|
+
# @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
|
115
|
+
# puts df.inspect
|
116
|
+
#
|
117
|
+
# # =>
|
118
|
+
# RedAmber::DataFrame : 3 x 2 Vectors
|
119
|
+
#
|
39
120
|
def inspect
|
40
121
|
mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table')
|
41
122
|
case mode.upcase
|
@@ -48,17 +129,148 @@ module RedAmber
|
|
48
129
|
end
|
49
130
|
end
|
50
131
|
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
132
|
+
# Shows some information about self in a transposed style.
|
133
|
+
#
|
134
|
+
# @param limit [Integer, :all]
|
135
|
+
# maximum number of variables (columns) to show.
|
136
|
+
# Shows all variables (columns) if it is `:all`.
|
137
|
+
# @param tally [Integer]
|
138
|
+
# maximum level to use tally mode.
|
139
|
+
# Tally mode counts the occurrences of each element and shows as a hash
|
140
|
+
# with the elements as keys and the corresponding counts as values.
|
141
|
+
# @param elements [Integer]
|
142
|
+
# maximum number of elements to show values
|
143
|
+
# in each column.
|
144
|
+
# @return [nil]
|
145
|
+
# @example Default
|
146
|
+
# diamonds = diamonds.assign_left(:index) { indices }
|
147
|
+
# diamonds
|
148
|
+
#
|
149
|
+
# # =>
|
150
|
+
# #<RedAmber::DataFrame : 53940 x 11 Vectors, 0x000000000000c314>
|
151
|
+
# index carat cut color clarity depth table price ... z
|
152
|
+
# <uint16> <double> <string> <string> <string> <double> <double> <uint16> ... <double>
|
153
|
+
# 0 0 0.23 Ideal E SI2 61.5 55.0 326 ... 2.43
|
154
|
+
# 1 1 0.21 Premium E SI1 59.8 61.0 326 ... 2.31
|
155
|
+
# 2 2 0.23 Good E VS1 56.9 65.0 327 ... 2.31
|
156
|
+
# 3 3 0.29 Premium I VS2 62.4 58.0 334 ... 2.63
|
157
|
+
# 4 4 0.31 Good J SI2 63.3 58.0 335 ... 2.75
|
158
|
+
# : : : : : : : : : ... :
|
159
|
+
# 53937 53937 0.7 Very Good D SI1 62.8 60.0 2757 ... 3.56
|
160
|
+
# 53938 53938 0.86 Premium H SI2 61.0 58.0 2757 ... 3.74
|
161
|
+
# 53939 53939 0.75 Ideal D SI2 62.2 55.0 2757 ... 3.64
|
162
|
+
#
|
163
|
+
# diamonds.tdr
|
164
|
+
#
|
165
|
+
# # =>
|
166
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
167
|
+
# Vectors : 8 numeric, 3 strings
|
168
|
+
# # key type level data_preview
|
169
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
170
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
171
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
172
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
|
173
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
|
174
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
175
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
176
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
177
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
178
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
179
|
+
# ... 1 more Vector ...
|
180
|
+
#
|
181
|
+
# @example Show all variables
|
182
|
+
# diamonds.tdr(:all)
|
183
|
+
#
|
184
|
+
# # =>
|
185
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
186
|
+
# Vectors : 8 numeric, 3 strings
|
187
|
+
# # key type level data_preview
|
188
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
189
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
190
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
191
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
|
192
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
|
193
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
194
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
195
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
196
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
197
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
198
|
+
# 10 :z double 375 [2.43, 2.31, 2.31, 2.63, 2.75, ... ]
|
199
|
+
#
|
200
|
+
# @example Use tally mode up to 8 levels
|
201
|
+
# diamonds.tdr(tally: 8)
|
202
|
+
#
|
203
|
+
# # =>
|
204
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
205
|
+
# Vectors : 8 numeric, 3 strings
|
206
|
+
# # key type level data_preview
|
207
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
208
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
209
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
210
|
+
# 3 :color string 7 {"E"=>9797, "I"=>5422, "J"=>2808, "H"=>8304, "F"=>9542, "G"=>11292, "D"=>6775}
|
211
|
+
# 4 :clarity string 8 {"SI2"=>9194, "SI1"=>13065, "VS1"=>8171, "VS2"=>12258, "VVS2"=>5066, "VVS1"=>3655, "I1"=>741, "IF"=>1790}
|
212
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
213
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
214
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
215
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
216
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
217
|
+
# ... 1 more Vector ...
|
218
|
+
#
|
219
|
+
# @example Increase elements to show
|
220
|
+
# diamonds.tdr(elements: 10)
|
221
|
+
#
|
222
|
+
# # =>
|
223
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
224
|
+
# Vectors : 8 numeric, 3 strings
|
225
|
+
# # key type level data_preview
|
226
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... ]
|
227
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, ... ]
|
228
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
229
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", "J", "I", "H", "E", "H", ... ]
|
230
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2", "VS1", ... ]
|
231
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, ... ]
|
232
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, 57.0, 57.0, 55.0, 61.0, 61.0, ... ]
|
233
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, 336, 336, 337, 337, 338, ... ]
|
234
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, 3.94, 3.95, 4.07, 3.87, 4.0, ... ]
|
235
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, ... ]
|
236
|
+
# ... 1 more Vector ...
|
237
|
+
#
|
54
238
|
def tdr(limit = 10, tally: 5, elements: 5)
|
55
239
|
puts tdr_str(limit, tally: tally, elements: elements)
|
56
240
|
end
|
241
|
+
alias_method :glimpse, :tdr
|
57
242
|
|
243
|
+
# Shortcut for `tdr(:all)`.
|
244
|
+
#
|
245
|
+
# @return (see #tdr)
|
246
|
+
#
|
247
|
+
def tdra
|
248
|
+
puts tdr_str(:all)
|
249
|
+
end
|
250
|
+
|
251
|
+
# rubocop:enable Layout/LineLength
|
252
|
+
|
253
|
+
# Returns some information about self in a transposed style by a string.
|
254
|
+
#
|
255
|
+
# @param (see #tdr)
|
256
|
+
# @option (see #tdr)
|
257
|
+
# @return [String] TDR style string.
|
258
|
+
#
|
58
259
|
def tdr_str(limit = 10, tally: 5, elements: 5)
|
59
260
|
"#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
|
60
261
|
end
|
61
262
|
|
263
|
+
# Returns html formatted text of self by IRuby::HTML.table.
|
264
|
+
#
|
265
|
+
# According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
|
266
|
+
# - If it is 'MINIMUM', returns shape by plain text.
|
267
|
+
# - If it is 'PLAIN', returns `#inspect` value by plain text.
|
268
|
+
# - If it is 'TDR', returns shape and transposed preview by plain text.
|
269
|
+
# - If it is 'TABLE' or otherwise, returns Table preview by html format.
|
270
|
+
# Default value of the ENV is 'TABLE'.
|
271
|
+
# @return [String]
|
272
|
+
# formatted string.
|
273
|
+
#
|
62
274
|
def to_iruby
|
63
275
|
require 'iruby'
|
64
276
|
return ['text/plain', '(empty DataFrame)'] if empty?
|
@@ -76,14 +288,32 @@ module RedAmber
|
|
76
288
|
end
|
77
289
|
end
|
78
290
|
|
79
|
-
|
80
|
-
|
291
|
+
# Return class and shape of self by a String.
|
292
|
+
#
|
293
|
+
# @param with_id [true, false]
|
294
|
+
# show id if true.
|
295
|
+
# @return [String]
|
296
|
+
# shape string.
|
297
|
+
# @example Default (without id)
|
298
|
+
# penguins.shape_str
|
299
|
+
#
|
300
|
+
# # =>
|
301
|
+
# "RedAmber::DataFrame : 344 x 8 Vectors"
|
302
|
+
#
|
303
|
+
# @example With id
|
304
|
+
# penguins.shape_str(with_id: true)
|
305
|
+
#
|
306
|
+
# # =>
|
307
|
+
# "RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000003980"
|
308
|
+
#
|
81
309
|
def shape_str(with_id: false)
|
82
310
|
shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
|
83
311
|
id = with_id ? format(', 0x%016x', object_id) : ''
|
84
312
|
"#{self.class} : #{shape_info}#{id}"
|
85
313
|
end
|
86
314
|
|
315
|
+
private # =====
|
316
|
+
|
87
317
|
def dataframe_info(limit, tally_level: 5, max_element: 5)
|
88
318
|
return '' if empty?
|
89
319
|
|
@@ -201,7 +431,7 @@ module RedAmber
|
|
201
431
|
df = df.assign do
|
202
432
|
vectors.each_with_object({}) do |v, assigner|
|
203
433
|
vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
|
204
|
-
|
434
|
+
.replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
|
205
435
|
assigner[v.key] =
|
206
436
|
original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
|
207
437
|
end
|
@@ -1,38 +1,141 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameIndexable
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
# Returns row index Vector.
|
7
|
+
#
|
8
|
+
# @overload indices
|
9
|
+
# return @indices as row indices (0...size).
|
10
|
+
#
|
11
|
+
# @return [Vector]
|
12
|
+
# a Vector of row indices.
|
13
|
+
# @example When `dataframe.size == 5`;
|
14
|
+
# dataframe.indices
|
15
|
+
#
|
16
|
+
# # =>
|
17
|
+
# #<RedAmber::Vector(:uint8, size=5):0x000000000000fb54>
|
18
|
+
# [0, 1, 2, 3, 4]
|
19
|
+
#
|
20
|
+
# @overload indices(start)
|
21
|
+
# return customized index Vector `(start..).take(size)`.
|
22
|
+
#
|
23
|
+
# @param start [#succ]
|
24
|
+
# starting element, which must respond to `#succ`.
|
25
|
+
# @return [Vector]
|
26
|
+
# a Vector of row indices.
|
27
|
+
# @example When `dataframe.size == 5`;
|
28
|
+
# dataframe.indices(1)
|
29
|
+
#
|
30
|
+
# # =>
|
31
|
+
# #<RedAmber::Vector(:uint8, size=5):0x000000000000fba4>
|
32
|
+
# [1, 2, 3, 4, 5]
|
33
|
+
#
|
34
|
+
# dataframe.indices('a')
|
35
|
+
# # =>
|
36
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000fbb8>
|
37
|
+
# ["a", "b", "c", "d", "e"]
|
38
|
+
#
|
39
|
+
def indices(start = 0)
|
40
|
+
if start == 0 # rubocop:disable Style/NumericPredicate
|
41
|
+
@indices ||= Vector.new(0...size)
|
42
|
+
else
|
43
|
+
Vector.new((start..).take(size))
|
44
|
+
end
|
13
45
|
end
|
46
|
+
alias_method :indexes, :indices
|
14
47
|
|
48
|
+
# Return sorted indexes of self by a Vector.
|
49
|
+
#
|
15
50
|
# @param sort_keys [Arrow::SortKey]
|
16
51
|
# :key, "key" or "+key" denotes ascending,
|
17
52
|
# "-key" denotes descending order
|
18
|
-
# @return [RedAmber::Vector]
|
53
|
+
# @return [RedAmber::Vector]
|
54
|
+
# sorted indices in Vector
|
55
|
+
# @example
|
56
|
+
# df
|
57
|
+
#
|
58
|
+
# # =>
|
59
|
+
# x y
|
60
|
+
# <uint8> <string>
|
61
|
+
# 0 3 B
|
62
|
+
# 1 5 A
|
63
|
+
# 2 1 B
|
64
|
+
# 3 4 A
|
65
|
+
# 4 2 C
|
66
|
+
#
|
67
|
+
# df.sort_indices('x')
|
68
|
+
#
|
69
|
+
# # =>
|
70
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003854>
|
71
|
+
# [2, 4, 0, 3, 1]
|
72
|
+
#
|
19
73
|
def sort_indices(*sort_keys)
|
20
74
|
indices = @table.sort_indices(sort_keys.flatten)
|
21
75
|
Vector.create(indices)
|
22
76
|
end
|
23
77
|
|
24
|
-
#
|
78
|
+
# Sort the contents of self.
|
79
|
+
#
|
80
|
+
# @param sort_keys [Arrow::SortKey]
|
81
|
+
# :key, "key" or "+key" denotes ascending,
|
82
|
+
# "-key" denotes descending order
|
83
|
+
# @return [RedAmber::DataFrame]
|
84
|
+
# sorted DataFrame
|
85
|
+
# @example Sort by a key
|
86
|
+
# df
|
87
|
+
#
|
88
|
+
# # =>
|
89
|
+
# x y
|
90
|
+
# <uint8> <string>
|
91
|
+
# 0 3 B
|
92
|
+
# 1 5 A
|
93
|
+
# 2 1 B
|
94
|
+
# 3 4 A
|
95
|
+
# 4 2 C
|
96
|
+
#
|
97
|
+
# df.sort('y')
|
98
|
+
#
|
99
|
+
# # =>
|
100
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000382c>
|
101
|
+
# x y
|
102
|
+
# <uint8> <string>
|
103
|
+
# 0 5 A
|
104
|
+
# 1 4 A
|
105
|
+
# 2 3 B
|
106
|
+
# 3 1 B
|
107
|
+
# 4 2 C
|
108
|
+
#
|
109
|
+
# @example Sort by two keys
|
110
|
+
# df.sort('y', 'x')
|
111
|
+
#
|
112
|
+
# # =>
|
113
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003890>
|
114
|
+
# x y
|
115
|
+
# <uint8> <string>
|
116
|
+
# 0 4 A
|
117
|
+
# 1 5 A
|
118
|
+
# 2 1 B
|
119
|
+
# 3 3 B
|
120
|
+
# 4 2 C
|
121
|
+
#
|
122
|
+
# @example Sort in descending order
|
123
|
+
# df.sort('-x')
|
124
|
+
#
|
125
|
+
# # =>
|
126
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003840>
|
127
|
+
# x y
|
128
|
+
# <uint8> <string>
|
129
|
+
# 0 5 A
|
130
|
+
# 1 4 A
|
131
|
+
# 2 3 B
|
132
|
+
# 3 2 C
|
133
|
+
# 4 1 B
|
134
|
+
#
|
25
135
|
def sort(*sort_keys)
|
26
136
|
indices = @table.sort_indices(sort_keys.flatten)
|
27
137
|
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
private
|
32
|
-
|
33
|
-
def new_dataframe_by(index_array)
|
34
|
-
t = Arrow::Function.find(:take).execute([@table, index_array]).value
|
35
|
-
DataFrame.create(t)
|
138
|
+
take(indices)
|
36
139
|
end
|
37
140
|
end
|
38
141
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameLoadSave
|
6
6
|
# Enable `self.load` as class method of DataFrame
|
7
7
|
def self.included(klass)
|
@@ -10,30 +10,98 @@ module RedAmber
|
|
10
10
|
|
11
11
|
# Enable `self.load` as class method of DataFrame
|
12
12
|
module ClassMethods
|
13
|
-
# Load DataFrame via Arrow::Table.load
|
14
|
-
|
15
|
-
|
13
|
+
# Load DataFrame via Arrow::Table.load.
|
14
|
+
#
|
15
|
+
# Format is automatically detected by extension.
|
16
|
+
# @!method load(input, format: nil, compression: nil, schema: nil, skip_lines: nil)
|
17
|
+
# @param input [path]
|
18
|
+
# source path.
|
19
|
+
# @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
|
20
|
+
# format specifier.
|
21
|
+
# @param compression [:gzip, nil]
|
22
|
+
# compression type.
|
23
|
+
# @param schema [Arrow::Schema]
|
24
|
+
# schema of table.
|
25
|
+
# @param skip_lines [Regexp]
|
26
|
+
# pattern of rows to skip.
|
27
|
+
# @return [DataFrame]
|
28
|
+
# loaded DataFrame.
|
29
|
+
# @example Load a tsv file
|
30
|
+
# DataFrame.load("file.tsv")
|
31
|
+
#
|
32
|
+
# @example Load a csv.gz file
|
33
|
+
# DataFrame.load("file.csv.gz")
|
34
|
+
#
|
35
|
+
# @example Load from URI
|
36
|
+
# DataFrame.load(URI("https://some_uri/file.csv"))
|
37
|
+
#
|
38
|
+
# @example Load from a Buffer
|
39
|
+
# DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv)
|
40
|
+
# name,age
|
41
|
+
# Yasuko,68
|
42
|
+
# Rui,49
|
43
|
+
# Hinata,28
|
44
|
+
# BUFFER
|
45
|
+
#
|
46
|
+
# @example Load from a Buffer skipping comment line
|
47
|
+
# DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /^#/)
|
48
|
+
# # comment
|
49
|
+
# name,age
|
50
|
+
# Yasuko,68
|
51
|
+
# Rui,49
|
52
|
+
# Hinata,28
|
53
|
+
# BUFFER
|
54
|
+
#
|
55
|
+
def load(input, **options)
|
56
|
+
DataFrame.new(Arrow::Table.load(input, options))
|
16
57
|
end
|
17
58
|
end
|
18
59
|
|
19
60
|
# Save DataFrame
|
20
61
|
#
|
21
|
-
#
|
22
|
-
|
62
|
+
# Format is automatically detected by extension.
|
63
|
+
# @!method save(output, format: nil, compression: nil, schema: nil, skip_lines: nil)
|
64
|
+
# @param output [path]
|
65
|
+
# output path.
|
66
|
+
# @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
|
67
|
+
# format specifier.
|
68
|
+
# @param compression [:gzip, nil]
|
69
|
+
# compression type.
|
70
|
+
# @param schema [Arrow::Schema]
|
71
|
+
# schema of table.
|
72
|
+
# @param skip_lines [Regexp]
|
73
|
+
# pattern of rows to skip.
|
74
|
+
# @return [DataFrame]
|
75
|
+
# self.
|
76
|
+
# @example Save a csv file
|
77
|
+
# dataframe.save("file.csv")
|
78
|
+
#
|
79
|
+
# @example Save a csv.gz file
|
80
|
+
# dataframe.save("file.csv.gz")
|
81
|
+
#
|
82
|
+
# @example Save an arrow file
|
83
|
+
# dataframe.save("file.arrow")
|
84
|
+
#
|
85
|
+
def save(output, **options)
|
23
86
|
@table.save(output, options)
|
24
87
|
self
|
25
88
|
end
|
26
89
|
|
27
90
|
# Save and reload to cast automatically
|
28
|
-
#
|
91
|
+
# via a temporary tsv format file by default.
|
92
|
+
#
|
93
|
+
# @param format [Symbol]
|
94
|
+
# format specifier.
|
95
|
+
# @return [DataFrame]
|
96
|
+
# reloaded DataFrame.
|
29
97
|
#
|
30
98
|
# @note experimental feature
|
31
99
|
def auto_cast(format: :tsv)
|
32
100
|
return self if empty?
|
33
101
|
|
34
|
-
|
35
|
-
save(
|
36
|
-
DataFrame.load(
|
102
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
103
|
+
save(buffer, format: format)
|
104
|
+
DataFrame.load(buffer, format: format)
|
37
105
|
end
|
38
106
|
end
|
39
107
|
end
|