red_amber 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +39 -20
- data/.yardopts +2 -0
- data/CHANGELOG.md +113 -0
- data/Gemfile +1 -1
- data/LICENSE +1 -1
- data/README.md +25 -26
- data/benchmark/basic.yml +2 -2
- data/benchmark/combine.yml +2 -2
- data/benchmark/dataframe.yml +2 -2
- data/benchmark/group.yml +2 -2
- data/benchmark/reshape.yml +2 -2
- data/benchmark/vector.yml +3 -0
- data/doc/DataFrame.md +32 -12
- data/doc/DataFrame_Comparison.md +65 -0
- data/doc/SubFrames.md +11 -0
- data/doc/Vector.md +207 -1
- data/doc/yard-templates/default/fulldoc/html/css/common.css +6 -0
- data/lib/red_amber/data_frame.rb +429 -75
- data/lib/red_amber/data_frame_combinable.rb +516 -66
- data/lib/red_amber/data_frame_displayable.rb +244 -14
- data/lib/red_amber/data_frame_indexable.rb +121 -18
- data/lib/red_amber/data_frame_loadsave.rb +78 -10
- data/lib/red_amber/data_frame_reshaping.rb +184 -14
- data/lib/red_amber/data_frame_selectable.rb +622 -66
- data/lib/red_amber/data_frame_variable_operation.rb +446 -34
- data/lib/red_amber/group.rb +187 -22
- data/lib/red_amber/helper.rb +70 -10
- data/lib/red_amber/refinements.rb +12 -5
- data/lib/red_amber/subframes.rb +1066 -0
- data/lib/red_amber/vector.rb +385 -11
- data/lib/red_amber/vector_aggregation.rb +312 -0
- data/lib/red_amber/vector_binary_element_wise.rb +387 -0
- data/lib/red_amber/vector_selectable.rb +217 -12
- data/lib/red_amber/vector_unary_element_wise.rb +436 -0
- data/lib/red_amber/vector_updatable.rb +278 -34
- data/lib/red_amber/version.rb +2 -1
- data/lib/red_amber.rb +13 -1
- data/red_amber.gemspec +2 -2
- metadata +13 -8
- data/doc/image/dataframe/reshaping_DataFrames.png +0 -0
- data/lib/red_amber/vector_functions.rb +0 -242
@@ -3,22 +3,67 @@
|
|
3
3
|
require 'stringio'
|
4
4
|
|
5
5
|
module RedAmber
|
6
|
-
#
|
6
|
+
# Mix-in for the class DataFrame
|
7
7
|
module DataFrameDisplayable
|
8
|
+
# Used internally to display table.
|
8
9
|
INDEX_KEY = :index_key_for_format_table
|
10
|
+
private_constant :INDEX_KEY
|
9
11
|
|
10
|
-
|
12
|
+
# rubocop:disable Layout/LineLength
|
13
|
+
|
14
|
+
# Show a preview of self as a string.
|
15
|
+
#
|
16
|
+
# @param width [Integer]
|
17
|
+
# maximum size of result.
|
18
|
+
# @param head [Integer]
|
19
|
+
# number of records to show from head.
|
20
|
+
# @param tail [Integer]
|
21
|
+
# number of records to show at tail.
|
22
|
+
# @return [String]
|
23
|
+
# string representation of self.
|
24
|
+
# @example Show penguins dataset
|
25
|
+
# puts penguins.to_s
|
26
|
+
#
|
27
|
+
# # =>
|
28
|
+
# species island bill_length_mm bill_depth_mm flipper_length_mm ... year
|
29
|
+
# <string> <string> <double> <double> <uint8> ... <uint16>
|
30
|
+
# 0 Adelie Torgersen 39.1 18.7 181 ... 2007
|
31
|
+
# 1 Adelie Torgersen 39.5 17.4 186 ... 2007
|
32
|
+
# 2 Adelie Torgersen 40.3 18.0 195 ... 2007
|
33
|
+
# 3 Adelie Torgersen (nil) (nil) (nil) ... 2007
|
34
|
+
# 4 Adelie Torgersen 36.7 19.3 193 ... 2007
|
35
|
+
# : : : : : : ... :
|
36
|
+
# 341 Gentoo Biscoe 50.4 15.7 222 ... 2009
|
37
|
+
# 342 Gentoo Biscoe 45.2 14.8 212 ... 2009
|
38
|
+
# 343 Gentoo Biscoe 49.9 16.1 213 ... 2009
|
39
|
+
#
|
40
|
+
def to_s(width: 80, head: 5, tail: 3)
|
11
41
|
return '' if empty?
|
12
42
|
|
13
|
-
format_table(width: width)
|
43
|
+
format_table(width: width, head: head, tail: tail)
|
14
44
|
end
|
15
45
|
|
16
|
-
# Show statistical summary by a new
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
46
|
+
# Show statistical summary by a new DataFrame.
|
47
|
+
#
|
48
|
+
# This method will make stats only for numeric columns.
|
49
|
+
# - NaNs are ignored.
|
50
|
+
# - `count` shows non-NaN counts.
|
51
|
+
#
|
52
|
+
# @return [DataFrame]
|
53
|
+
# a new dataframe.
|
54
|
+
# @example Statistical summary of penguins dataset
|
55
|
+
# # needs more width to show all stats in this example
|
56
|
+
# puts penguins.summary.to_s(width: 82)
|
57
|
+
#
|
58
|
+
# # =>
|
59
|
+
# variables count mean std min 25% median 75% max
|
60
|
+
# <dictionary> <uint16> <double> <double> <double> <double> <double> <double> <double>
|
61
|
+
# 0 bill_length_mm 342 43.92 5.46 32.1 39.23 44.38 48.5 59.6
|
62
|
+
# 1 bill_depth_mm 342 17.15 1.97 13.1 15.6 17.32 18.7 21.5
|
63
|
+
# 2 flipper_length_mm 342 200.92 14.06 172.0 190.0 197.0 213.0 231.0
|
64
|
+
# 3 body_mass_g 342 4201.75 801.95 2700.0 3550.0 4031.5 4750.0 6300.0
|
65
|
+
# 4 year 344 2008.03 0.82 2007.0 2007.0 2008.0 2009.0 2009.0
|
20
66
|
#
|
21
|
-
# @return [DataFrame] a new dataframe.
|
22
67
|
def summary
|
23
68
|
num_keys = keys.select { |key| self[key].numeric? }
|
24
69
|
|
@@ -36,6 +81,42 @@ module RedAmber
|
|
36
81
|
end
|
37
82
|
alias_method :describe, :summary
|
38
83
|
|
84
|
+
# Show information of self.
|
85
|
+
#
|
86
|
+
# According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
|
87
|
+
# - If it is 'TDR', returns class, shape and transposed preview by 3 rows.
|
88
|
+
# - If it is 'MINIMUM', returns class and shape.
|
89
|
+
# - If it is 'TABLE' or otherwise, returns class, shape and Table preview.
|
90
|
+
# Default value of the ENV is 'Table'.
|
91
|
+
# @return [String]
|
92
|
+
# information of self.
|
93
|
+
# @example Default (ENV ['RED_AMBER_OUTPUT_MODE'] == 'Table')
|
94
|
+
# puts df.inspect
|
95
|
+
#
|
96
|
+
# # =>
|
97
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
|
98
|
+
# x y
|
99
|
+
# <uint8> <string>
|
100
|
+
# 0 1 A
|
101
|
+
# 1 2 B
|
102
|
+
# 2 3 C
|
103
|
+
#
|
104
|
+
# @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'TDR'
|
105
|
+
# puts df.inspect
|
106
|
+
#
|
107
|
+
# # =>
|
108
|
+
# #<RedAmber::DataFrame : 3 x 2 Vectors, 0x000000000000c148>
|
109
|
+
# Vectors : 1 numeric, 1 string
|
110
|
+
# # key type level data_preview
|
111
|
+
# 0 :x uint8 3 [1, 2, 3]
|
112
|
+
# 1 :y string 3 ["A", "B", "C"]
|
113
|
+
#
|
114
|
+
# @example In case of ENV ['RED_AMBER_OUTPUT_MODE'] == 'Minimum'
|
115
|
+
# puts df.inspect
|
116
|
+
#
|
117
|
+
# # =>
|
118
|
+
# RedAmber::DataFrame : 3 x 2 Vectors
|
119
|
+
#
|
39
120
|
def inspect
|
40
121
|
mode = ENV.fetch('RED_AMBER_OUTPUT_MODE', 'Table')
|
41
122
|
case mode.upcase
|
@@ -48,17 +129,148 @@ module RedAmber
|
|
48
129
|
end
|
49
130
|
end
|
50
131
|
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
132
|
+
# Shows some information about self in a transposed style.
|
133
|
+
#
|
134
|
+
# @param limit [Integer, :all]
|
135
|
+
# maximum number of variables (columns) to show.
|
136
|
+
# Shows all variables (columns) if it is `:all`.
|
137
|
+
# @param tally [Integer]
|
138
|
+
# maximum level to use tally mode.
|
139
|
+
# Tally mode counts the occurrences of each element and shows as a hash
|
140
|
+
# with the elements as keys and the corresponding counts as values.
|
141
|
+
# @param elements [Integer]
|
142
|
+
# maximum number of elements to show values
|
143
|
+
# in each column.
|
144
|
+
# @return [nil]
|
145
|
+
# @example Default
|
146
|
+
# diamonds = diamonds.assign_left(:index) { indices }
|
147
|
+
# diamonds
|
148
|
+
#
|
149
|
+
# # =>
|
150
|
+
# #<RedAmber::DataFrame : 53940 x 11 Vectors, 0x000000000000c314>
|
151
|
+
# index carat cut color clarity depth table price ... z
|
152
|
+
# <uint16> <double> <string> <string> <string> <double> <double> <uint16> ... <double>
|
153
|
+
# 0 0 0.23 Ideal E SI2 61.5 55.0 326 ... 2.43
|
154
|
+
# 1 1 0.21 Premium E SI1 59.8 61.0 326 ... 2.31
|
155
|
+
# 2 2 0.23 Good E VS1 56.9 65.0 327 ... 2.31
|
156
|
+
# 3 3 0.29 Premium I VS2 62.4 58.0 334 ... 2.63
|
157
|
+
# 4 4 0.31 Good J SI2 63.3 58.0 335 ... 2.75
|
158
|
+
# : : : : : : : : : ... :
|
159
|
+
# 53937 53937 0.7 Very Good D SI1 62.8 60.0 2757 ... 3.56
|
160
|
+
# 53938 53938 0.86 Premium H SI2 61.0 58.0 2757 ... 3.74
|
161
|
+
# 53939 53939 0.75 Ideal D SI2 62.2 55.0 2757 ... 3.64
|
162
|
+
#
|
163
|
+
# diamonds.tdr
|
164
|
+
#
|
165
|
+
# # =>
|
166
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
167
|
+
# Vectors : 8 numeric, 3 strings
|
168
|
+
# # key type level data_preview
|
169
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
170
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
171
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
172
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
|
173
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
|
174
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
175
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
176
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
177
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
178
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
179
|
+
# ... 1 more Vector ...
|
180
|
+
#
|
181
|
+
# @example Show all variables
|
182
|
+
# diamonds.tdr(:all)
|
183
|
+
#
|
184
|
+
# # =>
|
185
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
186
|
+
# Vectors : 8 numeric, 3 strings
|
187
|
+
# # key type level data_preview
|
188
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
189
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
190
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
191
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", ... ]
|
192
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", ... ]
|
193
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
194
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
195
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
196
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
197
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
198
|
+
# 10 :z double 375 [2.43, 2.31, 2.31, 2.63, 2.75, ... ]
|
199
|
+
#
|
200
|
+
# @example Use tally mode up to 8 levels
|
201
|
+
# diamonds.tdr(tally: 8)
|
202
|
+
#
|
203
|
+
# # =>
|
204
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
205
|
+
# Vectors : 8 numeric, 3 strings
|
206
|
+
# # key type level data_preview
|
207
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, ... ]
|
208
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, ... ]
|
209
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
210
|
+
# 3 :color string 7 {"E"=>9797, "I"=>5422, "J"=>2808, "H"=>8304, "F"=>9542, "G"=>11292, "D"=>6775}
|
211
|
+
# 4 :clarity string 8 {"SI2"=>9194, "SI1"=>13065, "VS1"=>8171, "VS2"=>12258, "VVS2"=>5066, "VVS1"=>3655, "I1"=>741, "IF"=>1790}
|
212
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, ... ]
|
213
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, ... ]
|
214
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, ... ]
|
215
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, ... ]
|
216
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, ... ]
|
217
|
+
# ... 1 more Vector ...
|
218
|
+
#
|
219
|
+
# @example Increase elements to show
|
220
|
+
# diamonds.tdr(elements: 10)
|
221
|
+
#
|
222
|
+
# # =>
|
223
|
+
# RedAmber::DataFrame : 53940 x 11 Vectors
|
224
|
+
# Vectors : 8 numeric, 3 strings
|
225
|
+
# # key type level data_preview
|
226
|
+
# 0 :index uint16 53940 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... ]
|
227
|
+
# 1 :carat double 273 [0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, ... ]
|
228
|
+
# 2 :cut string 5 {"Ideal"=>21551, "Premium"=>13791, "Good"=>4906, "Very Good"=>12082, "Fair"=>1610}
|
229
|
+
# 3 :color string 7 ["E", "E", "E", "I", "J", "J", "I", "H", "E", "H", ... ]
|
230
|
+
# 4 :clarity string 8 ["SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2", "VS1", ... ]
|
231
|
+
# 5 :depth double 184 [61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, ... ]
|
232
|
+
# 6 :table double 127 [55.0, 61.0, 65.0, 58.0, 58.0, 57.0, 57.0, 55.0, 61.0, 61.0, ... ]
|
233
|
+
# 7 :price uint16 11602 [326, 326, 327, 334, 335, 336, 336, 337, 337, 338, ... ]
|
234
|
+
# 8 :x double 554 [3.95, 3.89, 4.05, 4.2, 4.34, 3.94, 3.95, 4.07, 3.87, 4.0, ... ]
|
235
|
+
# 9 :y double 552 [3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, ... ]
|
236
|
+
# ... 1 more Vector ...
|
237
|
+
#
|
54
238
|
def tdr(limit = 10, tally: 5, elements: 5)
|
55
239
|
puts tdr_str(limit, tally: tally, elements: elements)
|
56
240
|
end
|
241
|
+
alias_method :glimpse, :tdr
|
57
242
|
|
243
|
+
# Shortcut for `tdr(:all)`.
|
244
|
+
#
|
245
|
+
# @return (see #tdr)
|
246
|
+
#
|
247
|
+
def tdra
|
248
|
+
puts tdr_str(:all)
|
249
|
+
end
|
250
|
+
|
251
|
+
# rubocop:enable Layout/LineLength
|
252
|
+
|
253
|
+
# Returns some information about self in a transposed style by a string.
|
254
|
+
#
|
255
|
+
# @param (see #tdr)
|
256
|
+
# @option (see #tdr)
|
257
|
+
# @return [String] TDR style string.
|
258
|
+
#
|
58
259
|
def tdr_str(limit = 10, tally: 5, elements: 5)
|
59
260
|
"#{shape_str}\n#{dataframe_info(limit, tally_level: tally, max_element: elements)}"
|
60
261
|
end
|
61
262
|
|
263
|
+
# Returns html formatted text of self by IRuby::HTML.table.
|
264
|
+
#
|
265
|
+
# According to `ENV['RED_AMBER_OUTPUT_MODE'].upcase`,
|
266
|
+
# - If it is 'MINIMUM', returns shape by plain text.
|
267
|
+
# - If it is 'PLAIN', returns `#inspect` value by plain text.
|
268
|
+
# - If it is 'TDR', returns shape and transposed preview by plain text.
|
269
|
+
# - If it is 'TABLE' or otherwise, returns Table preview by html format.
|
270
|
+
# Default value of the ENV is 'TABLE'.
|
271
|
+
# @return [String]
|
272
|
+
# formatted string.
|
273
|
+
#
|
62
274
|
def to_iruby
|
63
275
|
require 'iruby'
|
64
276
|
return ['text/plain', '(empty DataFrame)'] if empty?
|
@@ -76,14 +288,32 @@ module RedAmber
|
|
76
288
|
end
|
77
289
|
end
|
78
290
|
|
79
|
-
|
80
|
-
|
291
|
+
# Return class and shape of self by a String.
|
292
|
+
#
|
293
|
+
# @param with_id [true, false]
|
294
|
+
# show id if true.
|
295
|
+
# @return [String]
|
296
|
+
# shape string.
|
297
|
+
# @example Default (without id)
|
298
|
+
# penguins.shape_str
|
299
|
+
#
|
300
|
+
# # =>
|
301
|
+
# "RedAmber::DataFrame : 344 x 8 Vectors"
|
302
|
+
#
|
303
|
+
# @example With id
|
304
|
+
# penguins.shape_str(with_id: true)
|
305
|
+
#
|
306
|
+
# # =>
|
307
|
+
# "RedAmber::DataFrame : 344 x 8 Vectors, 0x0000000000003980"
|
308
|
+
#
|
81
309
|
def shape_str(with_id: false)
|
82
310
|
shape_info = empty? ? '(empty)' : "#{size} x #{n_keys} Vector#{pl(n_keys)}"
|
83
311
|
id = with_id ? format(', 0x%016x', object_id) : ''
|
84
312
|
"#{self.class} : #{shape_info}#{id}"
|
85
313
|
end
|
86
314
|
|
315
|
+
private # =====
|
316
|
+
|
87
317
|
def dataframe_info(limit, tally_level: 5, max_element: 5)
|
88
318
|
return '' if empty?
|
89
319
|
|
@@ -201,7 +431,7 @@ module RedAmber
|
|
201
431
|
df = df.assign do
|
202
432
|
vectors.each_with_object({}) do |v, assigner|
|
203
433
|
vec = v.replace(0, v.key == INDEX_KEY ? '' : v.key.to_s)
|
204
|
-
|
434
|
+
.replace(1, v.key == INDEX_KEY ? '' : "<#{original[v.key].type}>")
|
205
435
|
assigner[v.key] =
|
206
436
|
original.size > head + tail + 1 ? vec.replace(head + 2, ':') : vec
|
207
437
|
end
|
@@ -1,38 +1,141 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-ins for the class DataFrame
|
5
5
|
module DataFrameIndexable
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
# Returns row index Vector.
|
7
|
+
#
|
8
|
+
# @overload indices
|
9
|
+
# return @indices as row indices (0...size).
|
10
|
+
#
|
11
|
+
# @return [Vector]
|
12
|
+
# a Vector of row indices.
|
13
|
+
# @example When `dataframe.size == 5`;
|
14
|
+
# dataframe.indices
|
15
|
+
#
|
16
|
+
# # =>
|
17
|
+
# #<RedAmber::Vector(:uint8, size=5):0x000000000000fb54>
|
18
|
+
# [0, 1, 2, 3, 4]
|
19
|
+
#
|
20
|
+
# @overload indices(start)
|
21
|
+
# return customized index Vector `(start..).take(size)`.
|
22
|
+
#
|
23
|
+
# @param start [#succ]
|
24
|
+
# start element, which must respond to `#succ`.
|
25
|
+
# @return [Vector]
|
26
|
+
# a Vector of row indices.
|
27
|
+
# @example When `dataframe.size == 5`;
|
28
|
+
# dataframe.indices(1)
|
29
|
+
#
|
30
|
+
# # =>
|
31
|
+
# #<RedAmber::Vector(:uint8, size=5):0x000000000000fba4>
|
32
|
+
# [1, 2, 3, 4, 5]
|
33
|
+
#
|
34
|
+
# dataframe.indices('a')
|
35
|
+
# # =>
|
36
|
+
# #<RedAmber::Vector(:string, size=5):0x000000000000fbb8>
|
37
|
+
# ["a", "b", "c", "d", "e"]
|
38
|
+
#
|
39
|
+
def indices(start = 0)
|
40
|
+
if start == 0 # rubocop:disable Style/NumericPredicate
|
41
|
+
@indices ||= Vector.new(0...size)
|
42
|
+
else
|
43
|
+
Vector.new((start..).take(size))
|
44
|
+
end
|
13
45
|
end
|
46
|
+
alias_method :indexes, :indices
|
14
47
|
|
48
|
+
# Return sorted indexes of self by a Vector.
|
49
|
+
#
|
15
50
|
# @param sort_keys [Arrow::SortKey]
|
16
51
|
# :key, "key" or "+key" denotes ascending,
|
17
52
|
# "-key" denotes descending order
|
18
|
-
# @return [RedAmber::Vector]
|
53
|
+
# @return [RedAmber::Vector]
|
54
|
+
# sorted indices in Vector
|
55
|
+
# @example
|
56
|
+
# df
|
57
|
+
#
|
58
|
+
# # =>
|
59
|
+
# x y
|
60
|
+
# <uint8> <string>
|
61
|
+
# 0 3 B
|
62
|
+
# 1 5 A
|
63
|
+
# 2 1 B
|
64
|
+
# 3 4 A
|
65
|
+
# 4 2 C
|
66
|
+
#
|
67
|
+
# df.sort_indices('x')
|
68
|
+
#
|
69
|
+
# # =>
|
70
|
+
# #<RedAmber::Vector(:uint64, size=5):0x0000000000003854>
|
71
|
+
# [2, 4, 0, 3, 1]
|
72
|
+
#
|
19
73
|
def sort_indices(*sort_keys)
|
20
74
|
indices = @table.sort_indices(sort_keys.flatten)
|
21
75
|
Vector.create(indices)
|
22
76
|
end
|
23
77
|
|
24
|
-
#
|
78
|
+
# Sort the contents of self.
|
79
|
+
#
|
80
|
+
# @param sort_keys [Arrow::SortKey]
|
81
|
+
# :key, "key" or "+key" denotes ascending,
|
82
|
+
# "-key" denotes descending order
|
83
|
+
# @return [RedAmber::DataFrame]
|
84
|
+
# sorted DataFrame
|
85
|
+
# @example Sort by a key
|
86
|
+
# df
|
87
|
+
#
|
88
|
+
# # =>
|
89
|
+
# x y
|
90
|
+
# <uint8> <string>
|
91
|
+
# 0 3 B
|
92
|
+
# 1 5 A
|
93
|
+
# 2 1 B
|
94
|
+
# 3 4 A
|
95
|
+
# 4 2 C
|
96
|
+
#
|
97
|
+
# df.sort('y')
|
98
|
+
#
|
99
|
+
# # =>
|
100
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x000000000000382c>
|
101
|
+
# x y
|
102
|
+
# <uint8> <string>
|
103
|
+
# 0 5 A
|
104
|
+
# 1 4 A
|
105
|
+
# 2 3 B
|
106
|
+
# 3 1 B
|
107
|
+
# 4 2 C
|
108
|
+
#
|
109
|
+
# @example Sort by two keys
|
110
|
+
# df.sort('y', 'x')
|
111
|
+
#
|
112
|
+
# # =>
|
113
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003890>
|
114
|
+
# x y
|
115
|
+
# <uint8> <string>
|
116
|
+
# 0 4 A
|
117
|
+
# 1 5 A
|
118
|
+
# 2 1 B
|
119
|
+
# 3 3 B
|
120
|
+
# 4 2 C
|
121
|
+
#
|
122
|
+
# @example Sort in descending order
|
123
|
+
# df.sort('-x')
|
124
|
+
#
|
125
|
+
# # =>
|
126
|
+
# #<RedAmber::DataFrame : 5 x 2 Vectors, 0x0000000000003840>
|
127
|
+
# x y
|
128
|
+
# <uint8> <string>
|
129
|
+
# 0 5 A
|
130
|
+
# 1 4 A
|
131
|
+
# 2 3 B
|
132
|
+
# 3 2 C
|
133
|
+
# 4 1 B
|
134
|
+
#
|
25
135
|
def sort(*sort_keys)
|
26
136
|
indices = @table.sort_indices(sort_keys.flatten)
|
27
137
|
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
private
|
32
|
-
|
33
|
-
def new_dataframe_by(index_array)
|
34
|
-
t = Arrow::Function.find(:take).execute([@table, index_array]).value
|
35
|
-
DataFrame.create(t)
|
138
|
+
take(indices)
|
36
139
|
end
|
37
140
|
end
|
38
141
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module RedAmber
|
4
|
-
#
|
4
|
+
# Mix-in for the class DataFrame
|
5
5
|
module DataFrameLoadSave
|
6
6
|
# Enable `self.load` as class method of DataFrame
|
7
7
|
def self.included(klass)
|
@@ -10,30 +10,98 @@ module RedAmber
|
|
10
10
|
|
11
11
|
# Enable `self.load` as class method of DataFrame
|
12
12
|
module ClassMethods
|
13
|
-
# Load DataFrame via Arrow::Table.load
|
14
|
-
|
15
|
-
|
13
|
+
# Load DataFrame via Arrow::Table.load.
|
14
|
+
#
|
15
|
+
# Format is automatically detected by extension.
|
16
|
+
# @!method load(input, format: nil, compression: nil, schema: nil, skip_lines: nil)
|
17
|
+
# @param input [path]
|
18
|
+
# source path.
|
19
|
+
# @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
|
20
|
+
# format specifier.
|
21
|
+
# @param compression [:gzip, nil]
|
22
|
+
# compression type.
|
23
|
+
# @param schema [Arrow::Schema]
|
24
|
+
# schema of table.
|
25
|
+
# @param skip_lines [Regexp]
|
26
|
+
# pattern of rows to skip.
|
27
|
+
# @return [DataFrame]
|
28
|
+
# loaded DataFrame.
|
29
|
+
# @example Load a tsv file
|
30
|
+
# DataFrame.load("file.tsv")
|
31
|
+
#
|
32
|
+
# @example Load a csv.gz file
|
33
|
+
# DataFrame.load("file.csv.gz")
|
34
|
+
#
|
35
|
+
# @example Load from URI
|
36
|
+
# DataFrame.load(URI("https://some_uri/file.csv"))
|
37
|
+
#
|
38
|
+
# @example Load from a Buffer
|
39
|
+
# DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv)
|
40
|
+
# name,age
|
41
|
+
# Yasuko,68
|
42
|
+
# Rui,49
|
43
|
+
# Hinata,28
|
44
|
+
# BUFFER
|
45
|
+
#
|
46
|
+
# @example Load from a Buffer skipping comment line
|
47
|
+
# DataFrame.load(Arrow::Buffer.new(<<~BUFFER), format: :csv, skip_lines: /^#/)
|
48
|
+
# # comment
|
49
|
+
# name,age
|
50
|
+
# Yasuko,68
|
51
|
+
# Rui,49
|
52
|
+
# Hinata,28
|
53
|
+
# BUFFER
|
54
|
+
#
|
55
|
+
def load(input, **options)
|
56
|
+
DataFrame.new(Arrow::Table.load(input, options))
|
16
57
|
end
|
17
58
|
end
|
18
59
|
|
19
60
|
# Save DataFrame
|
20
61
|
#
|
21
|
-
#
|
22
|
-
|
62
|
+
# Format is automatically detected by extension.
|
63
|
+
# @!method save(output, format: nil, compression: nil, schema: nil, skip_lines: nil)
|
64
|
+
# @param output [path]
|
65
|
+
# output path.
|
66
|
+
# @param format [:arrow_file, :batch, :arrows, :arrow_stream, :stream, :csv, :tsv]
|
67
|
+
# format specifier.
|
68
|
+
# @param compression [:gzip, nil]
|
69
|
+
# compression type.
|
70
|
+
# @param schema [Arrow::Schema]
|
71
|
+
# schema of table.
|
72
|
+
# @param skip_lines [Regexp]
|
73
|
+
# pattern of rows to skip.
|
74
|
+
# @return [DataFrame]
|
75
|
+
# self.
|
76
|
+
# @example Save a csv file
|
77
|
+
# dataframe.save("file.csv")
|
78
|
+
#
|
79
|
+
# @example Save a csv.gz file
|
80
|
+
# dataframe.save("file.csv.gz")
|
81
|
+
#
|
82
|
+
# @example Save an arrow file
|
83
|
+
# dataframe.save("file.arrow")
|
84
|
+
#
|
85
|
+
def save(output, **options)
|
23
86
|
@table.save(output, options)
|
24
87
|
self
|
25
88
|
end
|
26
89
|
|
27
90
|
# Save and reload to cast automatically
|
28
|
-
#
|
91
|
+
# via tsv format file temporarily as default.
|
92
|
+
#
|
93
|
+
# @param format [Symbol]
|
94
|
+
# format specifier.
|
95
|
+
# @return [DataFrame]
|
96
|
+
# reloaded DataFrame.
|
29
97
|
#
|
30
98
|
# @note experimental feature
|
31
99
|
def auto_cast(format: :tsv)
|
32
100
|
return self if empty?
|
33
101
|
|
34
|
-
|
35
|
-
save(
|
36
|
-
DataFrame.load(
|
102
|
+
buffer = Arrow::ResizableBuffer.new(1024)
|
103
|
+
save(buffer, format: format)
|
104
|
+
DataFrame.load(buffer, format: format)
|
37
105
|
end
|
38
106
|
end
|
39
107
|
end
|