carray-dataframe 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/API.txt +1 -7
- data/README.md +3 -1
- data/Rakefile +11 -0
- data/carray-dataframe.gemspec +8 -6
- data/lib/carray-dataframe.rb +13 -0
- data/lib/carray-dataframe/arranger.rb +209 -0
- data/lib/carray-dataframe/cadf_array.rb +106 -0
- data/lib/carray-dataframe/converter.rb +97 -0
- data/lib/carray-dataframe/dataframe.rb +1279 -0
- data/lib/carray-dataframe/group.rb +199 -0
- data/lib/carray-dataframe/iloc_accessor.rb +62 -0
- data/lib/carray-dataframe/io.rb +96 -0
- data/lib/carray-dataframe/join.rb +283 -0
- data/lib/carray-dataframe/loc_accessor.rb +145 -0
- data/lib/carray-dataframe/pivot.rb +54 -0
- data/lib/carray-dataframe/reference.rb +142 -0
- data/lib/carray-dataframe/to_html.rb +102 -0
- metadata +23 -28
- data/examples/R/fit.rb +0 -24
- data/examples/R/iris.rb +0 -9
- data/examples/R/japan_area.rb +0 -30
- data/examples/R/kyaku.rb +0 -22
- data/examples/group_by.rb +0 -78
- data/examples/hist.rb +0 -27
- data/examples/iris.rb +0 -29
- data/examples/map.rb +0 -23
- data/examples/match.rb +0 -21
- data/examples/test.xlsx +0 -0
- data/examples/test1.rb +0 -44
- data/examples/test2.rb +0 -14
- data/examples/test3.db +0 -0
- data/examples/test3.rb +0 -11
- data/examples/test3.xlsx +0 -0
- data/examples/to_excel.rb +0 -27
- data/lib/R.rb +0 -365
- data/lib/carray/autoload/autoload_dataframe_dataframe.rb +0 -26
- data/lib/carray/dataframe/dataframe.rb +0 -1640
@@ -0,0 +1,1279 @@
|
|
1
|
+
require "carray"
|
2
|
+
require "carray/table"
|
3
|
+
|
4
|
+
# Convenience constructor: CADataFrame(data, index: ...) is shorthand for
# CADataFrame.new(data, index: ...).
#
# Keyword options and the optional block are forwarded explicitly. With a
# bare *argv passthrough, Ruby 3 hands the options to #initialize as a
# positional Hash (ArgumentError) and the block is silently dropped.
def CADataFrame(*argv, **opts, &block)
  CADataFrame.new(*argv, **opts, &block)
end
|
7
|
+
|
8
|
+
class CADataFrame
|
9
|
+
|
10
|
+
#
|
11
|
+
# Constructor
|
12
|
+
#
|
13
|
+
# Builds a data frame.
#
# data    - Hash (column name => column data), CArray (table), or Array
#           (array of Hash records, array of CArray columns, or array of rows).
# index   - passed to set_index(index, inplace: true).
# columns - column names; required for CArray/Array data unless they can be
#           taken from data.column_names or from order. Not allowed for Hash.
# order   - list of column names giving the final column order.
# clone   - not implemented; a truthy value raises NotImplementedError.
# block   - when given, forwarded to #arrange.
#
# Raises RuntimeError on inconsistent input (unknown data, name/size mismatch).
def initialize(data, index: nil, columns: nil, order: nil, clone: false, &block)
  # @column_names = Array   holds column names and its order
  # @column_data  = Hash    holds data entities
  # @row_number   = Integer holds number of rows
  # @row_index    = CArray  stores row index (any object)
  # @__methods__  = ...

  # Stores data entity
  case data
  when Hash
    raise "columns option is not needed for hash data" if columns
    @column_data  = columns_to_columns(data)
    @column_names = @column_data.keys
  when CArray
    @column_names =
      if columns
        columns.map(&:to_s)
      elsif data.respond_to?(:column_names)
        data.column_names.map(&:to_s)
      elsif order
        order.map(&:to_s)
      else
        raise "can't determin column names use columns or order option"
      end
    if @column_names.size != data.dim1
      raise "mismatch between 'column_names' and table columns"
    end
    @column_data = table_to_columns(data)
  when Array
    case data.first
    when Hash
      # Array of records: the union of all keys defines the columns.
      @column_data  = {}
      @column_names = []
      all_keys = {}
      data.each { |hash| all_keys.update(hash) }
      all_keys.each_key do |k|
        values = data.map do |hash|
          v = hash[k]
          # Only missing/nil entries become UNDEF; `hash[k] || UNDEF` also
          # replaced legitimate `false` values.
          v.nil? ? UNDEF : v
        end
        name = k.to_s
        @column_names << name
        @column_data[name] = values.to_ca
      end
    else
      if columns
        @column_names = columns.map(&:to_s)
      elsif order
        @column_names = order.map(&:to_s)
      else
        raise "columns or order option should be given"
      end
      @column_data = array_to_columns(data)
    end
  else
    raise "unknown data"
  end

  # Processing option 'order'
  if order
    raise 'invalid order option' if @column_names.size != order.size
    new_column_data = {}
    order.each do |key|
      name = key.to_s
      unless @column_data.has_key?(name)
        # Double quotes are required here: in the single-quoted original the
        # `#` started a comment and the message was cut off.
        raise "invalid column name '#{name}' in order option"
      end
      new_column_data[name] = @column_data[name]
    end
    @column_data  = new_column_data
    @column_names = new_column_data.keys
  end

  # Sets @row_number and check column length
  @row_number = @column_data.first[1].size
  if @column_names.any? { |key| @column_data[key].size != @row_number }
    raise "column sizes mismatch"
  end

  # Processing option 'index'
  set_index(index, inplace: true)
  @__methods__ = {}

  if clone
    raise NotImplementedError, "copy option is not implemented"
  end

  arrange(&block) if block_given?
end
|
108
|
+
|
109
|
+
attr_reader :column_data, :column_names, :row_index, :row_number
|
110
|
+
|
111
|
+
# Returns the internal Hash of column name => column data (not a copy).
def columns
  return @column_data
end
|
114
|
+
|
115
|
+
# Returns the alias table (method name => column name) filled by #method.
def __methods__
  @__methods__
end
|
118
|
+
|
119
|
+
# Splits a table into one column per entry of @column_names: the i-th name
# is mapped to table[nil, i].to_ca. Keys are stringified.
def table_to_columns(table)
  @column_names.each_with_index.each_with_object({}) do |(name, i), cols|
    cols[name.to_s] = table[nil, i].to_ca
  end
end
|
126
|
+
|
127
|
+
private :table_to_columns
|
128
|
+
|
129
|
+
# Converts a Hash of columns into a Hash of rank-1 CArray columns keyed by
# String names. An entry whose key is the empty string "" is not stored as
# a column; it is assigned to @row_index instead.
#
# Raises RuntimeError when a column cannot be converted with #to_ca.
def columns_to_columns(columns)
  converted = {}
  columns.each_with_index do |(key, col), k|
    column =
      case col
      when CArray
        col.rank == 1 ? col : col.flatten
      when Array
        carray = col.to_ca
        if carray.rank == 1
          carray
        else
          # Nested arrays: build an object array holding one element of
          # `col` per row instead of a multi-dimensional array.
          rest = col.clone
          CArray.object(rest.size).convert { rest.shift }
        end
      else
        begin
          carray = col.to_ca
          carray.rank == 1 ? carray : carray.flatten
        rescue
          raise "#{k}-th column can't be converted to CArray"
        end
      end
    if key == ""
      @row_index = column
    else
      converted[key.to_s] = column
    end
  end
  converted
end
|
159
|
+
|
160
|
+
private :columns_to_columns
|
161
|
+
|
162
|
+
# Converts Array input into a Hash of columns keyed by @column_names.
#
# array - either an Array of CArray (one per column, flattened to rank 1)
#         or an Array of row Arrays (transposed into columns).
#
# Raises RuntimeError when the number of CArray columns differs from the
# number of column names, or when the first element is neither kind.
def array_to_columns(array)
  new_columns = {}
  case array.first
  when CArray
    # The original compared against `data`, which is not defined in this
    # method; the parameter is `array`.
    if @column_names.size != array.size
      raise "mismatch between 'columns' and table columns"
    end
    @column_names.each_with_index do |key, k|
      column = array[k]
      column = column.flatten unless column.rank == 1
      new_columns[key.to_s] = column
    end
  when Array
    table = array.transpose
    @column_names.each_with_index do |key, k|
      new_columns[key.to_s] = table[k].to_ca
    end
  else
    raise "invalid array content for CADataFrame"
  end
  new_columns
end
|
184
|
+
|
185
|
+
private :array_to_columns
|
186
|
+
|
187
|
+
# Sets the row index.
#
# index   - nil (no change), a column name (String/Symbol), or any object
#           responding to #to_ca that supplies the index values.
# drop    - when index names a column, remove that column from the frame.
# inplace - when false, the change is applied to a copy made with #to_df.
#
# Returns the modified frame (self when inplace).
# Raises RuntimeError if the named column does not exist.
def set_index(index, drop: true, inplace: false)
  return to_df.set_index(index, drop: drop, inplace: true) unless inplace

  if index.is_a?(String) || index.is_a?(Symbol)
    name = index.to_s
    raise "can't find column named '#{name}'" unless @column_names.include?(name)
    if drop
      @row_index = @column_data.delete(name)
      @column_names.delete(name)
    else
      @row_index = @column_data[name]
    end
  elsif !index.nil?
    @row_index = index.to_ca
  end
  self
end
|
208
|
+
|
209
|
+
# Takes over the state of +other+ (names, data, row index, row count and
# alias table are shared, not copied) and returns self.
def replace(other)
  @column_names, @column_data = other.column_names, other.column_data
  @row_index, @row_number     = other.row_index, other.row_number
  @__methods__                = other.__methods__
  self
end
|
217
|
+
|
218
|
+
# Returns true if the frame has a column called +name+.
#
# Column names are stored as Strings. Symbols are accepted as well so that
# this check agrees with #column, which takes either form; previously
# fill(:a, v) was silently skipped because :a never matched.
def has_column?(name)
  name = name.to_s if name.is_a?(Symbol)
  @column_names.include?(name)
end
|
221
|
+
|
222
|
+
# Returns the data_type_name of every column, in @column_names order.
def column_types
  @column_data.values_at(*@column_names).map(&:data_type_name)
end
|
225
|
+
|
226
|
+
#
|
227
|
+
# Column, Row Access
|
228
|
+
#
|
229
|
+
|
230
|
+
# Looks up a column by position (Integer) or by name (String/Symbol).
# Returns nil when no such column exists.
# Raises RuntimeError for any other kind of specifier.
def column(spec)
  key =
    case spec
    when Integer        then @column_names[spec]
    when String, Symbol then spec.to_s
    else raise "invalid column specifier"
    end
  @column_data[key]
end
|
240
|
+
alias col column
|
241
|
+
|
242
|
+
# Returns the LocAccessor for this frame, created on first use and cached.
def loc
  return @loc if @loc
  @loc = CADataFrame::LocAccessor.new(self)
end
|
246
|
+
|
247
|
+
# Returns the ILocAccessor for this frame, created on first use and cached.
# The block parameter is accepted but not used.
def iloc(&block)
  return @iloc if @iloc
  @iloc = CADataFrame::ILocAccessor.new(self)
end
|
251
|
+
|
252
|
+
# TO BE FIXED
|
253
|
+
# Returns the row index via @row_index.to_ca, or a 0-based integer sequence
# of length @row_number when no row index is set.
def index
  @row_index ? @row_index.to_ca : CArray.int(@row_number).seq
end
|
260
|
+
|
261
|
+
# Returns the first n rows (all rows when the frame has fewer than n).
# NOTE(review): for n == 0 the range becomes 0..-1; confirm how the row
# accessor treats that.
def head(n = 10)
  last = [@row_number, n].min - 1
  row[0..last]
end
|
265
|
+
|
266
|
+
# Returns the last n rows (all rows when the frame has fewer than n).
def tail(n = 10)
  count = [@row_number, n].min
  row[-count..-1]
end
|
270
|
+
|
271
|
+
# Registers aliases: each key of +hash+ becomes a pseudo-method name that
# method_missing resolves to the column named by the value. Keys and values
# are stringified. Returns the updated alias table.
def method(hash)
  aliases = hash.each_with_object({}) do |(key, value), table|
    table[key.to_s] = value.to_s
  end
  @__methods__.update(aliases)
end
|
278
|
+
|
279
|
+
# Resolves argument-less calls to columns, trying in order:
#   1. a column with exactly that name,
#   2. a column with "_" replaced by "." (R-style names),
#   3. an alias registered through #method.
# Anything else raises RuntimeError.
def method_missing(name, *args)
  if args.empty?
    name   = name.to_s
    dotted = name.gsub(/_/, '.') ### For R
    return @column_data[name]   if has_column?(name)
    return @column_data[dotted] if has_column?(dotted)
    return @column_data[@__methods__[name]] if @__methods__.include?(name)
  end
  raise "no method '#{name}' for CADataFrame"
end
|
292
|
+
|
293
|
+
#
|
294
|
+
# Iterators
|
295
|
+
#
|
296
|
+
|
297
|
+
# Iterates over [name, column] pairs of the internal column Hash.
def each_column(&block)
  @column_data.each(&block)
end
|
300
|
+
|
301
|
+
# Iterates over the column names in their display order.
def each_column_name(&block)
  @column_names.each(&block)
end
|
304
|
+
|
305
|
+
# Yields each row label: the entries of @row_index when one is set,
# otherwise the positions 0...@row_number.
def each_row_index(&block)
  @row_index ? @row_index.each(&block) : @row_number.times(&block)
end
|
312
|
+
|
313
|
+
# Yields every row.
#
# with    - Array (default): row as Array of values;
#           Hash: row as name => value (the SAME Hash object is reused and
#           overwritten for every row);
#           CArray: row as a compacted CArray.
# columns - Array of names or a Regexp selecting names; default all columns.
#
# Raises RuntimeError for any other +with+.
def each_row(with: Array, columns: nil, &block)
  column_names =
    case columns
    when Array  then columns
    when Regexp then @column_names.grep(columns)
    else @column_names
    end

  if with == Array
    @row_number.times do |i|
      yield column_names.map { |n| @column_data[n][i] }
    end
  elsif with == Hash
    record = {}
    @row_number.times do |i|
      column_names.each { |c| record[c] = @column_data[c][i] }
      yield record
    end
  elsif with == CArray
    joined = CArray.join(@column_data.values_at(*column_names))
    joined[:i, nil].each do |line|
      yield line.to_ca.compact
    end
  else
    raise "invalid data type for loop variable"
  end
end
|
343
|
+
|
344
|
+
# Yields every row together with its row label.
#
# with - Array (default): row as Array of values in @column_names order;
#        Hash: row as name => value (the same Hash object is reused and
#        overwritten for every row).
#
# The label is the matching @row_index entry, or the row position when the
# frame has no row index.
#
# Fixes over the previous version:
#   * without a row index the Hash form used `times do |idx, i|`, so `i`
#     was nil and the label was read from a nil @row_index;
#   * the Array form took values in Hash insertion order, which differs
#     from @column_names after prepend_column; it now matches #each_row.
#
# Raises RuntimeError for any other +with+.
def each_row_with_row_index(with: Array, &block)
  if with == Array
    if @row_index
      @row_index.each_with_index do |idx, i|
        yield(@column_names.map { |n| @column_data[n][i] }, idx)
      end
    else
      @row_number.times do |i|
        yield(@column_names.map { |n| @column_data[n][i] }, i)
      end
    end
  elsif with == Hash
    row = {}
    if @row_index
      @row_index.each_with_index do |idx, i|
        @column_names.each { |c| row[c] = @column_data[c][i] }
        yield(row, idx)
      end
    else
      @row_number.times do |i|
        @column_names.each { |c| row[c] = @column_data[c][i] }
        yield(row, i)
      end
    end
  else
    raise "invalid data type for loop variable"
  end
end
|
376
|
+
|
377
|
+
# Conditional in-place assignment driven by a boolean mask frame.
#
# For every mask column that also exists in this frame:
#   where(mask, a)    - elements where the mask is false are set to a;
#   where(mask, a, b) - additionally, elements where it is true are set to b.
# Any other number of extra arguments leaves the data untouched.
def where(mask, *args)
  mask.column_names.each do |key|
    next unless has_column?(key)
    case args.size
    when 1
      column(key)[mask.column(key).boolean.not] = args[0]
    when 2
      target = column(key)
      target[mask.column(key).boolean.not] = args[0]
      target[mask.column(key).boolean]     = args[1]
    end
  end
end
|
390
|
+
|
391
|
+
# Fills each named column with +value+ (the last argument), in place.
# Names for which has_column? is false are skipped. Returns self.
def fill(*names, value)
  names.each { |name| column(name).fill(value) if has_column?(name) }
  self
end
|
399
|
+
|
400
|
+
#
|
401
|
+
# Arrange
|
402
|
+
#
|
403
|
+
|
404
|
+
# Runs the block through an Arranger bound to this frame and returns the
# Arranger's result.
def arrange(&block)
  arranger = Arranger.new(self)
  arranger.arrange(&block)
end
|
407
|
+
|
408
|
+
# Renames column name1 to name2 in place, keeping its position in
# @column_names. Returns the renamed column.
# Raises RuntimeError when name1 is not a column.
def rename(name1, name2)
  old_name, new_name = name1.to_s, name2.to_s
  position = @column_names.index(old_name)
  raise "unknown column name #{name1}" unless position
  @column_names[position] = new_name
  @column_data[new_name]  = @column_data.delete(old_name)
end
|
418
|
+
|
419
|
+
# Lower-cases all column names in place and returns self.
def downcase
  lowered_names = []
  lowered_data  = {}
  each_column_name do |name|
    lower = name.downcase
    lowered_names.push(lower)
    lowered_data[lower] = @column_data[name]
  end
  @column_names, @column_data = lowered_names, lowered_data
  self
end
|
430
|
+
|
431
|
+
# Appends a column as the last column.
#
# The data comes from new_column, else from the block (evaluated with
# instance_exec on this frame), else from an object-typed template of the
# first existing column. Non-CArray data is converted with #to_ca.
#
# Returns the stored column.
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
# NOTE(review): an already existing name is pushed to @column_names again;
# confirm whether callers rely on that.
def append_column(name, new_column = nil, &block)
  new_column ||= block ? instance_exec(&block) : @column_data.first[1].template(:object)
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  unless new_column.rank == 1 && new_column.size == @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.push(key)
  @column_data[key] = new_column
end
|
449
|
+
|
450
|
+
alias append append_column
|
451
|
+
|
452
|
+
# Inserts a column as the first column of @column_names.
#
# The data comes from new_column, else from the block (evaluated with
# instance_exec on this frame), else from an object-typed template of the
# first existing column. Non-CArray data is converted with #to_ca.
#
# Returns the stored column.
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
def prepend_column(name, new_column = nil, &block)
  new_column ||= block ? instance_exec(&block) : @column_data.first[1].template(:object)
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  unless new_column.rank == 1 && new_column.size == @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.unshift(key)
  @column_data[key] = new_column
end
|
470
|
+
|
471
|
+
alias lead prepend_column
|
472
|
+
|
473
|
+
# Removes columns in place and returns self.
#
# Each argument is a String or Symbol name, or a Regexp matched against the
# column names. With no arguments nothing changes.
# Raises RuntimeError for any other kind of argument.
def drop_column(*columns)
  return self if columns.empty?

  doomed = columns.flat_map do |c|
    case c
    when String then [c]
    when Symbol then [c.to_s]
    when Regexp then @column_names.grep(c)
    else raise "invalid column specification"
    end
  end

  kept = {}
  each_column_name do |name|
    kept[name] = column(name) unless doomed.include?(name)
  end
  replace CADataFrame.new(kept, index: @row_index)
end
|
499
|
+
|
500
|
+
alias eliminate_column drop_column
|
501
|
+
|
502
|
+
# Returns a new frame with the same column names, each holding an empty
# object-typed CArray (CArray.object(0)).
def vacant_copy
  empty_columns = {}
  each_column_name { |key| empty_columns[key] = CArray.object(0) }
  CADataFrame.new(empty_columns)
end
|
509
|
+
|
510
|
+
# Instance form of CADataFrame.merge with this frame as the first table.
def merge(*args)
  CADataFrame.merge(self, *args)
end
|
513
|
+
|
514
|
+
# Evaluates the block in the context of this frame (instance_exec) and
# returns its value. A block with exactly one parameter also receives the
# frame as its argument.
def execute(&block)
  if block.arity == 1
    instance_exec(self, &block)
  else
    instance_exec(&block)
  end
end
|
522
|
+
|
523
|
+
# Reduces every column to one value and returns a one-row frame whose row
# index is [label].
#
# With a block the value is block.call(name, column); otherwise it is
# column.send(label). A column whose computation raises yields UNDEF.
def calculate(label, &block)
  summary = {}
  each_column_name do |name|
    summary[name] =
      begin
        [block ? yield(name, column(name)) : column(name).send(label.intern)]
      rescue
        [UNDEF]
      end
  end
  CADataFrame.new(summary, index: [label])
end
|
538
|
+
|
539
|
+
# Builds a new frame from block.call(name, column) for every column.
# Columns for which the block raises are left out of the result.
def resample(&block)
  resampled = {}
  each_column_name do |name|
    begin
      resampled[name] = yield(name, column(name))
    rescue
      # best effort: skip columns the block cannot handle
    end
  end
  CADataFrame.new(resampled)
end
|
549
|
+
|
550
|
+
# Returns a new frame restricted to the given columns and rows.
#
# columns - String/Symbol names or Regexps matched against the column
#           names; all columns when omitted.
# block   - evaluated with instance_exec; its value is used as the row
#           selector for every column and for the row index.
#
# Raises RuntimeError for any other kind of column argument.
def select(*columns, &block)
  names =
    if columns.empty?
      @column_names
    else
      columns.flat_map do |c|
        case c
        when String then [c]
        when Symbol then [c.to_s]
        when Regexp then @column_names.grep(c)
        else raise "invalid column specification"
        end
      end
    end

  row = block ? instance_exec(&block) : nil

  picked = {}
  names.map(&:to_s).each { |name| picked[name] = column(name)[row] }
  CADataFrame.new(picked, index: @row_index ? @row_index[row] : nil)
end
|
579
|
+
#
|
580
|
+
# Maintenance
|
581
|
+
#
|
582
|
+
|
583
|
+
# Calls unmask(value) on every column and returns self.
def unmask!(value = nil)
  each_column_name { |name| column(name).unmask(value) }
  self
end
|
589
|
+
|
590
|
+
# Non-destructive variant of unmask!: works on a copy made with #to_df.
def unmask(value = nil)
  copy = to_df
  copy.unmask!(value)
end
|
593
|
+
|
594
|
+
# Replaces the column Hash with a clone whose columns are re-created via
# #to_ca, and clones the row index if present. Returns self.
def detouch
  @column_data = @column_data.clone
  each_column_name { |name| @column_data[name] = @column_data[name].to_ca }
  @row_index = @row_index.clone if @row_index
  self
end
|
604
|
+
|
605
|
+
# Returns a new frame keeping only the rows in which no column is masked.
# The row index is not carried over to the result.
def delete_masked_rows
  keep = @column_data.first[1].template(:boolean) { true }
  @column_names.each { |name| keep = keep & @column_data[name].is_not_masked }
  kept = {}
  @column_names.each { |name| kept[name] = @column_data[name].to_ca[keep] }
  CADataFrame.new(kept)
end
|
616
|
+
|
617
|
+
# Evaluates the block on this frame (instance_eval) and returns a new frame
# whose columns are indexed by the block's result.
# NOTE(review): the selector is applied as-is, i.e. rows selected by the
# block are the ones present in the result; confirm this is the intended
# meaning of "delete". The row index is not carried over.
def delete_rows(&block)
  selector = instance_eval(&block)
  picked = {}
  @column_names.each { |name| picked[name] = @column_data[name].to_ca[selector] }
  CADataFrame.new(picked)
end
|
625
|
+
#
|
626
|
+
# Transformation
|
627
|
+
#
|
628
|
+
|
629
|
+
# Returns a new frame with rows picked by the index the block returns
# (block evaluated with instance_exec). The row index, if any, is picked
# with the same index.
def reorder(&block)
  index = instance_exec(&block)
  rearranged = {}
  each_column_name { |name| rearranged[name] = column(name)[index] }
  CADataFrame.new(rearranged, index: @row_index ? @row_index[index] : nil)
end
|
637
|
+
|
638
|
+
# Returns a new frame sorted by the given key columns, using
# CA.sort_addr(*keys) as the row order.
#
# Keys are the named columns; when no names are given they come from the
# block (evaluated with instance_exec), which returns a CArray or an Array
# of CArrays.
def order_by(*names, &block)
  list =
    if !names.empty?
      @column_data.values_at(*names.map { |s| s.to_s })
    elsif block
      ret = instance_exec(&block)
      case ret
      when CArray then [ret]
      when Array  then ret
      end
    end
  reorder { CA.sort_addr(*list) }
end
|
654
|
+
|
655
|
+
# Returns a new frame with every column (and the row index, if any)
# reversed.
def reverse
  flipped = {}
  each_column_name { |name| flipped[name] = column(name).reverse }
  CADataFrame.new(flipped, index: @row_index ? @row_index.reverse : nil)
end
|
662
|
+
|
663
|
+
# Returns the transposed frame: the old column names become the row index.
#
# columns - names for the new columns; defaults to the stringified row
#           index, or to a sequence starting at "a" advanced with :succ.
def transpose(columns: nil)
  columns =
    if columns
      columns.map(&:to_s)
    elsif @row_index
      @row_index.convert(:object) { |v| v.to_s }
    else
      CArray.object(@row_number).seq("a", :succ)
    end
  CADataFrame.new(ca.transpose, index: @column_names.to_ca, columns: columns)
end
|
675
|
+
|
676
|
+
# Returns a new frame in which every column name has +suf+ appended.
# The columns themselves and the row index are passed on as they are.
def add_suffix(suf)
  renamed = {}
  each_column_name do |name|
    renamed[(name.to_s + suf).to_s] = column(name)
  end
  CADataFrame.new(renamed, index: @row_index)
end
|
684
|
+
#
|
685
|
+
# Conversions
|
686
|
+
#
|
687
|
+
|
688
|
+
# Returns a copy of this frame: a new CADataFrame over the same columns
# and row index, then detached from them with #detouch.
def to_df
  copied = {}
  each_column_name { |name| copied[name] = column(name) }
  CADataFrame.new(copied, index: @row_index).detouch
end
|
695
|
+
|
696
|
+
# Returns a new frame whose columns are the results of calling #object on
# each column; the row index is passed on unchanged.
def objectify
  as_objects = {}
  each_column_name { |name| as_objects[name] = column(name).object }
  CADataFrame.new(as_objects, index: @row_index)
end
|
703
|
+
|
704
|
+
# Returns a CADFArray over the named columns (all columns when no names
# are given). Names are stringified.
def ca(*names)
  selected = names.empty? ? @column_names : names.map(&:to_s)
  CADFArray.new(selected, @column_data)
end
|
711
|
+
|
712
|
+
# Returns ca(*names).to_ca for the named columns (all when omitted).
def to_ca(*names)
  view = ca(*names)
  view.to_ca
end
|
715
|
+
|
716
|
+
# Converts the frame to a Hash of column name => Array of values.
# When a row index exists it is stored first under the key "index"
# (as-is, without #to_a).
def to_hash
  result = @row_index ? { "index" => @row_index } : {}
  @column_data.each_with_object(result) { |(k, v), h| h[k] = v.to_a }
end
|
726
|
+
|
727
|
+
alias to_h to_hash
|
728
|
+
|
729
|
+
# Builds a lookup Hash from one key column to one or more value columns.
#
# key_name    - name of the column supplying the keys.
# value_names - String: values are the single entries of that column;
#               Array of names: values are Arrays of the row's entries.
#
# Later rows overwrite earlier rows that share a key.
# Raises ArgumentError for unknown column names or any other value_names.
def columns_to_hash(key_name, value_names)
  unless @column_names.include?(key_name)
    raise ArgumentError, "include invalid key column name #{key_name}"
  end

  lookup = {}
  case value_names
  when String
    unless @column_names.include?(value_names)
      raise ArgumentError, "invalid key column name #{value_names}"
    end
    keys   = @column_data[key_name]
    values = @column_data[value_names]
    @row_number.times { |i| lookup[keys[i]] = values[i] }
  when Array
    unless value_names.all? { |s| @column_names.include?(s) }
      raise ArgumentError, "include invalid column name in #{value_names.join(' ')}"
    end
    keys    = @column_data[key_name]
    sources = @column_data.values_at(*value_names)
    @row_number.times { |i| lookup[keys[i]] = sources.map { |c| c[i] } }
  else
    raise ArgumentError, "invalud argument"
  end
  lookup
end
|
758
|
+
|
759
|
+
private
|
760
|
+
|
761
|
+
# Formats one table cell: Floats with "%.6g", nil as "nil", everything
# else with #to_s.
def __obj_to_string__(obj)
  return "nil" if obj.nil?
  return format("%.6g", obj) if obj.is_a?(Float)
  obj.to_s
end
|
771
|
+
|
772
|
+
# Display width of a string: every multi-byte character counts as two
# columns, every single-byte character as one.
def __strwidth__(string)
  return string.length if string.ascii_only?
  string.each_char.sum { |c| c.bytesize > 1 ? 2 : 1 }
end
|
779
|
+
public
|
780
|
+
|
781
|
+
# Renders the frame as a plain-text table.
#
# rowmax      - Integer: when the frame has more rows, only leading and
#               trailing rows are shown with a "..." row in between;
#               anything else (default :full) shows every row.
# time_format - strftime format for CATimeIndex columns / row index
#               (default "%F %T%:z").
# index       - when true a leading column shows the row index, or the row
#               positions when the frame has no row index.
#
# Returns the table as a String.
#
# Fix: with non-ASCII column names and zero rows the per-cell padding was
# computed from `datawidth`, which is never assigned when there is no
# data. The padding is only needed when rows exist, so it is now computed
# in that branch only.
def ascii_table(rowmax = :full, time_format: nil, index: true)
  fmt = time_format || "%F %T%:z"

  # Work on a shallow copy so formatted time columns do not leak back.
  columns = @column_data.clone
  @column_names.each do |name|
    if columns[name].is_a?(CATimeIndex)
      columns[name] = columns[name].time.time_strftime(fmt)
    end
  end

  if index
    namelist = [" "] + @column_names
    row_index =
      if @row_index.nil?
        CArray.int(@row_number).seq
      elsif @row_index.is_a?(CATimeIndex)
        @row_index.time.time_strftime(fmt)
      else
        @row_index
      end
    tbl = CADFArray.new(namelist, columns.update(" " => row_index))
  else
    namelist = @column_names
    tbl = CADFArray.new(namelist, columns)
  end

  if rowmax.is_a?(Integer) and @row_number > rowmax
    list = tbl[0..(rowmax/2),nil].to_a
    list.push namelist.map { "..." }
    list.push(*(tbl[-rowmax/2+1..-1,nil].to_a))
    tbl = list.to_ca
  end

  datastr = tbl.convert {|c| __obj_to_string__(c) }.unmask("")
  # mb: per column, whether any cell or the column name is non-ASCII
  datamb  = datastr.convert(:boolean, &:"ascii_only?").not.sum(0).ne(0)
  namemb  = namelist.to_ca.convert(:boolean) {|c| c.to_s.ascii_only? }.eq(0)
  mb      = datamb.or(namemb)
  namelen = namelist.map(&:length).to_ca
  datalen = datastr.convert(&:length)

  if mb.max == 0
    # Pure ASCII: character count equals display width.
    if datalen.size == 0
      lengths = namelen.to_a
    else
      lengths = datalen.max(0).pmax(namelen).to_a
    end
    hrule  = "-" + lengths.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, lengths].transpose.map{|name, len|
               "#{name.to_s.ljust(len)}" }.join(" ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_index do |blk, i|
        list = blk.flatten.to_a
        ary << " " + [list, lengths].transpose.map{|value, len|
                       "#{value.ljust(len)}"}.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: rows#=#{@row_number}: \n" + ary.join("\n")
  else
    # Multi-byte text: pad by display width (see __strwidth__).
    namewidth = namelist.to_ca.convert{|c| __strwidth__(c.to_s) }
    padding   = nil
    if datalen.size == 0
      maxwidth = namewidth
    else
      datawidth = datastr.convert{|c| __strwidth__(c.to_s) }
      maxwidth  = datawidth.max(0).pmax(namewidth)
      # ljust counts characters, so add back the difference between
      # display width and character count for every cell.
      padding   = maxwidth[:*,nil] - datawidth + datalen
    end
    hrule  = "-" + maxwidth.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, maxwidth.to_a].transpose.map{|name, len|
               "#{name.to_s.ljust(len-__strwidth__(name.to_s)+name.to_s.length)}" }.join(" ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_addr do |blk, i|
        list = blk.flatten.to_a
        ary << " " + list.map.with_index {|value, j|
                       "#{value.ljust(padding[i,j])}"}.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: row#=#{@row_number}: \n" + ary.join("\n")
  end
end
|
870
|
+
|
871
|
+
# Short text form for consoles: ascii_table with rowmax 8.
def inspect
  ascii_table(8)
end
|
874
|
+
|
875
|
+
# Full text form: ascii_table with its default arguments.
def to_s
  ascii_table
end
|
878
|
+
|
879
|
+
# Array coercion: a one-element Array holding the text form.
def to_ary
  [to_s]
end
|
882
|
+
end
|
883
|
+
|
884
|
+
#############################################################
|
885
|
+
#
|
886
|
+
# Class methods
|
887
|
+
#
|
888
|
+
#############################################################
|
889
|
+
class CADataFrame

  # Combines the columns of several tables into one frame. When a column
  # name occurs more than once the last table wins. The row index is taken
  # from the first table.
  def self.merge(*args)
    merged = {}
    args.each do |table|
      table.column_names.each { |name| merged[name] = table.col(name) }
    end
    CADataFrame.new(merged, index: args.first.row_index)
  end

  # Stacks several tables row-wise. The column set is that of the first
  # table; CATimeIndex columns are joined with CATimeIndex.concat, others
  # with CArray.bind using the first table's data type. Row indices are
  # concatenated only if every table has one; otherwise the result has
  # no row index.
  def self.concat(*args)
    first_table = args.first
    stacked = {}
    first_table.column_names.each do |name|
      parts = args.map { |t| t.column(name) }
      stacked[name] =
        if parts.first.is_a?(CATimeIndex)
          CATimeIndex.concat(*parts)
        else
          CArray.bind(parts.first.data_type, parts, 0)
        end
    end

    indices = args.map(&:row_index)
    new_row_index =
      if !indices.all?
        nil
      elsif indices.first.is_a?(CATimeIndex)
        CATimeIndex.concat(*indices)
      else
        CArray.join(*indices).flatten
      end
    CADataFrame.new(stacked, index: new_row_index)
  end

end
|
928
|
+
|
929
|
+
#############################################################
|
930
|
+
#
|
931
|
+
# BASIC Comparison
|
932
|
+
#
|
933
|
+
#############################################################
|
934
|
+
class CADataFrame

  # Element-wise negation.
  def -@
    cmp(:-@)
  end

  # Element-wise comparisons against +other+.
  %i[< <= > >=].each do |op|
    define_method(op) { |other| cmp(op, other) }
  end

  # Element-wise predicates.
  %i[is_masked is_finite].each do |predicate|
    define_method(predicate) { cmp(predicate) }
  end

  private

  # Sends +method+ (with argv) to the frame's array view (#ca) and wraps
  # the result in a new frame with the same column names. The row index is
  # not passed on.
  def cmp(method, *argv)
    result = ca.send(method, *argv)
    CADataFrame.new(result, columns: @column_names)
  end

end
|
971
|
+
|
972
|
+
#############################################################
|
973
|
+
#
|
974
|
+
# BASIC Manipulations
|
975
|
+
#
|
976
|
+
#############################################################
|
977
|
+
class CADataFrame
|
978
|
+
|
979
|
+
# Re-aligns the frame to +reference+: idx = reference.matchup(key column)
# is computed and every other column (and the row index, unmasked with
# nil) is projected through idx. The key column is set to +reference+ by
# the arrange block passed to the new frame.
# NOTE(review): `name == keyname` never matches when keyname is a Symbol;
# the arrange block below assigns the key column in either case.
def matchup(keyname, reference)
  key = column(keyname.to_s)
  idx = reference.matchup(key)

  aligned = {}
  each_column_name do |name|
    aligned[name] = (name == keyname) ? reference : column(name).project(idx)
  end

  new_row_index = @row_index ? @row_index.project(idx).unmask(nil) : nil

  CADataFrame.new(aligned, index: new_row_index) {
    self.send(keyname)[] = reference
  }
end
|
999
|
+
|
1000
|
+
# Counts the values of column +name+.
#
# Without +scale+ the frame is grouped by the column and the valid values
# per group are counted. With +scale+ a CAHistogram.int(scale[, options])
# is filled from the column, and the result has the bin midpoints (under
# +name+), the columns "<name>_L" / "<name>_R" taken from scale, and the
# counts.
def histogram(name, scale = nil, options = nil)
  if scale.nil?
    return group_by(name).table{ { :count => col(name).count_valid } }
  end

  hist = options ? CAHistogram.int(scale, options) : CAHistogram.int(scale)
  hist.increment(@column_data[name.to_s])

  # Explicit Hash variable: a brace-less literal would be taken as
  # keyword arguments by CADataFrame.new.
  hash = {
    name.to_s        => hist.midpoints[0],
    "#{name}_L".to_s => scale[0..-2],
    "#{name}_R".to_s => scale.shift(-1)[0..-2],
    :count           => hist[0..-2].to_ca,
  }
  CADataFrame.new(hash)
end
|
1019
|
+
|
1020
|
+
def classify (name, scale = nil, opt = {})
|
1021
|
+
if not scale
|
1022
|
+
column = @column_data[name.to_s]
|
1023
|
+
mids = column.uniq
|
1024
|
+
mapper = {}
|
1025
|
+
mids.each_with_index do |v,i|
|
1026
|
+
mapper[v] = i
|
1027
|
+
end
|
1028
|
+
cls = columns.convert(:int32) {|v| mapper[v] }
|
1029
|
+
hash = {
|
1030
|
+
"#{name}_M" => mids,
|
1031
|
+
"#{name}_L" => mids,
|
1032
|
+
"#{name}_R" => mids,
|
1033
|
+
"#{name}_CLASS" => cls
|
1034
|
+
}
|
1035
|
+
else
|
1036
|
+
option = {
|
1037
|
+
:include_upper => false,
|
1038
|
+
:include_lowest => true,
|
1039
|
+
:offset => 0,
|
1040
|
+
}.update(opt)
|
1041
|
+
column = @column_data[name.to_s]
|
1042
|
+
cls = scale.bin(column,
|
1043
|
+
option[:include_upper],
|
1044
|
+
option[:include_lowest],
|
1045
|
+
option[:offset])
|
1046
|
+
mids = ((scale + scale.shifted(-1))/2)[0..-2].to_ca
|
1047
|
+
left = scale[0..-2]
|
1048
|
+
right = scale.shift(-1)[0..-2]
|
1049
|
+
hash = {
|
1050
|
+
"#{name}_M" => mids.project(cls).to_ca,
|
1051
|
+
"#{name}_L" => left.project(cls).to_ca,
|
1052
|
+
"#{name}_R" => right.project(cls).to_ca,
|
1053
|
+
"#{name}_CLASS" => cls
|
1054
|
+
}
|
1055
|
+
end
|
1056
|
+
return CADataFrame.new(hash)
|
1057
|
+
end
|
1058
|
+
|
1059
|
+
def cross (name1, name2)
|
1060
|
+
col1 = column(name1)
|
1061
|
+
col2 = column(name2)
|
1062
|
+
var1 = col1.uniq.sort
|
1063
|
+
var2 = col2.uniq.sort
|
1064
|
+
hash = {}
|
1065
|
+
count = Hash.new {0}
|
1066
|
+
var1.each do |v1|
|
1067
|
+
var2.each do |v2|
|
1068
|
+
hash[[v1,v2]] = 0
|
1069
|
+
end
|
1070
|
+
end
|
1071
|
+
list = CArray.join([col1, col2]).to_a
|
1072
|
+
list.each do |item|
|
1073
|
+
hash[item] += 1
|
1074
|
+
end
|
1075
|
+
out = CArray.object(var1.size, var2.size) { 0 }
|
1076
|
+
var1.each_with_index do |v1, i|
|
1077
|
+
var2.each_with_index do |v2, j|
|
1078
|
+
out[i,j] = hash[[v1,v2]]
|
1079
|
+
end
|
1080
|
+
end
|
1081
|
+
return CADataFrame.new(out, index: var1, columns: var2)
|
1082
|
+
end
|
1083
|
+
end
|
1084
|
+
|
1085
|
+
#############################################################
|
1086
|
+
#
|
1087
|
+
# CArray
|
1088
|
+
#
|
1089
|
+
#############################################################
|
1090
|
+
class CADataFrame

  # Reduces every column with +sum+.
  #
  # Returns a one-row CADataFrame (row index ["sum"]) holding, per column
  # name, a single-element array with that column's sum.
  def sum
    totals = {}
    each_column { |name, col| totals[name] = [col.sum] }
    CADataFrame.new(totals, index: ["sum"])
  end

  # Reduces every column with +mean+.
  #
  # Returns a one-row CADataFrame (row index ["mean"]) holding, per column
  # name, a single-element array with that column's mean.
  def mean
    averages = {}
    each_column { |name, col| averages[name] = [col.mean] }
    CADataFrame.new(averages, index: ["mean"])
  end

end
|
1108
|
+
|
1109
|
+
class CArray

  # Picks the description flavour for this array: :numeric or :categorical.
  #
  # +numeric?+ wins first, then +boolean?+ (treated as categorical). When
  # neither predicate returns true the array is probed by dividing it by 1:
  # success means :numeric, any StandardError means :categorical.
  def describe_type
    case true
    when numeric?
      :numeric
    when boolean?
      :categorical
    else
      begin
        self / 1
        :numeric
      rescue
        :categorical
      end
    end
  end

  private :describe_type

  # Returns a Hash of descriptive statistics for this array.
  #
  # as - optional override (:numeric / :categorical, anything responding to
  #      +intern+); when nil the flavour comes from +describe_type+.
  #
  # Raises RuntimeError ("unknown") for any other flavour.
  def describe(as: nil)
    flavour = as ? as.intern : describe_type
    case flavour
    when :numeric
      describe_numeric
    when :categorical
      describe_categorical
    else
      raise "unknown"
    end
  end

  # Statistics for numeric data: unmasked count, mean, standard deviation and
  # the five values returned by +quantile+ (listed from max down to min).
  def describe_numeric
    lowest, lower_q, middle, upper_q, highest = *quantile
    {
      count:  is_masked.count_false,
      mean:   mean,
      std:    stddev,
      max:    highest,
      q75:    upper_q,
      median: middle,
      q25:    lower_q,
      min:    lowest,
    }
  end

  # Statistics for categorical data: unmasked count, number of distinct
  # values, the most frequent value (+top+) and how often it occurs (+freq+).
  def describe_categorical
    occurrences = Hash.new(0)
    each { |v| occurrences[v] += 1 }
    top, freq = occurrences.max_by { |_value, n| n }
    {
      count:  is_masked.count_false,
      unique: occurrences.size,
      top:    top,
      freq:   freq,
    }
  end

  # Alias-style entry point; currently always the categorical summary.
  def summary
    summary_categorical
  end

  # Returns a Hash mapping each value yielded by +each+ to its occurrence
  # count.
  def summary_categorical
    occurrences = {}
    each do |v|
      occurrences[v] = occurrences.fetch(v, 0) + 1
    end
    occurrences
  end

end
|
1190
|
+
|
1191
|
+
class CADataFrame

  # Describes every column via its own +describe+ method.
  #
  # Returns the transpose of a CADataFrame built from the per-column results
  # and indexed by this frame's column names.
  def describe
    list = @column_data.map { |_name, column| column.describe }
    CADataFrame.new(list, index: @column_names).transpose
  end

  # Summarises the listed columns via each column's +summary+ method.
  #
  # names - column names; Symbols are accepted as well as Strings (the lookup
  #         falls back to +name.to_s+, matching +histogram+ / +classify+).
  #
  # Returns the transpose of a CADataFrame built from the per-column
  # summaries and indexed by +names+ as given.
  def summary(*names)
    data = names.map do |name|
      (@column_data[name] || @column_data[name.to_s]).summary
    end
    CADataFrame.new(data, index: names).transpose
  end

end
|
1210
|
+
|
1211
|
+
|
1212
|
+
class CArray

  # Builds indicator arrays for this array.
  #
  # Returns a Hash that maps every value from +uniq+ to the result of
  # comparing the whole array against that value with +eq+.
  def get_dummies
    dummies = {}
    uniq.each { |key| dummies[key] = self.eq(key) }
    dummies
  end

end
|
1224
|
+
|
1225
|
+
class CADataFrame

  # Replaces the named columns by their indicator ("dummy") columns.
  #
  # names      - columns to encode; Strings or Symbols.
  # prefix     - how the new columns are labelled:
  #              nil    -> the source column name,
  #              String -> that string for every encoded column,
  #              Array  -> element k for the k-th encoded column, counted in
  #                        this frame's column order,
  #              Hash   -> the value stored under the source column name.
  # prefix_sep - separator placed between the prefix and the value.
  #
  # Columns that are not encoded are kept and placed first; the dummy columns
  # follow. The row index is carried over.
  #
  # Raises ArgumentError for any other +prefix+ type (previously the dummy
  # columns were silently dropped in that case).
  def get_dummies(*names, prefix: nil, prefix_sep: "_")
    wanted = names.map(&:to_s)
    keep_columns = {}
    new_columns  = {}
    k = 0
    @column_names.each do |name|
      unless names.include?(name) || wanted.include?(name.to_s)
        keep_columns[name] = @column_data[name]
        next
      end
      stem =
        case prefix
        when nil    then name
        when String then prefix
        when Array  then prefix[k]
        when Hash   then prefix[name]
        else
          raise ArgumentError,
                "prefix must be nil, a String, an Array or a Hash (got #{prefix.class})"
        end
      @column_data[name].get_dummies.each do |v, dummy|
        new_columns["#{stem}#{prefix_sep}#{v}"] = dummy
      end
      k += 1
    end
    CADataFrame.new(keep_columns.update(new_columns), index: @row_index)
  end

end
|
1261
|
+
|
1262
|
+
|
1263
|
+
class CADataFrame

  # Writes this frame to +filename+ using Marshal.
  #
  # File.open is used instead of Kernel#open so that a name starting with
  # "|" is treated as a path, never as a command to run, and the file is
  # opened in binary mode because Marshal output is binary data.
  #
  # Returns the value of the File.open block (the result of Marshal.dump).
  def save(filename)
    File.open(filename, "wb") { |io| Marshal.dump(self, io) }
  end

  # Reads a frame previously written by #save.
  #
  # WARNING: Marshal.load can instantiate arbitrary objects; only load files
  # that come from a trusted source.
  #
  # Raises RuntimeError ("invalid data") when the file does not contain a
  # CADataFrame.
  def self.load(filename)
    out = File.open(filename, "rb") { |io| Marshal.load(io) }
    raise "invalid data" unless out.is_a?(CADataFrame)
    out
  end

end
|