carray-dataframe 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/API.txt +83 -0
- data/README.md +5 -0
- data/carray-dataframe.gemspec +25 -0
- data/examples/R/fit.rb +24 -0
- data/examples/R/iris.rb +9 -0
- data/examples/R/japan_area.rb +30 -0
- data/examples/R/kyaku.rb +22 -0
- data/examples/group_by.rb +78 -0
- data/examples/hist.rb +27 -0
- data/examples/iris.rb +29 -0
- data/examples/map.rb +23 -0
- data/examples/match.rb +21 -0
- data/examples/test.xlsx +0 -0
- data/examples/test1.rb +44 -0
- data/examples/test2.rb +14 -0
- data/examples/test3.db +0 -0
- data/examples/test3.rb +11 -0
- data/examples/test3.xlsx +0 -0
- data/examples/to_excel.rb +27 -0
- data/lib/R.rb +365 -0
- data/lib/carray/autoload/autoload_dataframe_dataframe.rb +26 -0
- data/lib/carray/dataframe/dataframe.rb +1640 -0
- metadata +106 -0
@@ -0,0 +1,1640 @@
|
|
1
|
+
require "carray"
|
2
|
+
require "carray/io/table"
|
3
|
+
|
4
|
+
module CA::TableMethods

  # Wraps this table in a CADataFrame. The optional block is forwarded to
  # CADataFrame.new.
  #
  # When the table carries @header or @note, both are copied onto the new
  # frame and the frame gains two singleton readers:
  #   note          - returns the copied @note
  #   header(name)  - returns @header[name.to_s]; with no argument it
  #                   returns the frame's @column_names
  def to_dataframe(&block)
    frame = CADataFrame.new(self, &block)
    return frame unless @header || @note

    frame.instance_variable_set(:@header, @header)
    frame.instance_variable_set(:@note, @note)
    class << frame
      attr_reader :note

      def header(name = nil)
        name ? @header[name.to_s] : @column_names
      end
    end
    frame
  end

  alias_method :to_df, :to_dataframe

end
|
28
|
+
|
29
|
+
class CADataFrame
|
30
|
+
|
31
|
+
#
|
32
|
+
# Constructor
|
33
|
+
#
|
34
|
+
|
35
|
+
# Builds a data frame.
#
# columns_or_table - either a Hash of name => column (CArray, Array or
#                    anything responding to to_ca), or a 2-D CArray table.
# row_index:       - optional row labels; converted with to_ca.object.
# column_names:    - column names for the CArray form; when omitted the
#                    table itself must respond to column_names.
# block            - optional; evaluated through #arrange after construction.
#
# Raises RuntimeError when column sizes differ, when a CArray table has no
# column names, or when the first argument is neither a Hash nor a CArray.
def initialize(columns_or_table, row_index: nil, column_names: nil, &block)
  case columns_or_table
  when Hash
    @column_names = columns_or_table.keys.map(&:to_s)
    @columns = normalize_columns(columns_or_table)
    # An empty Hash has no first column; treat it as a frame with zero rows
    # instead of failing with NoMethodError on nil.
    first_column = @columns.values.first
    @row_number = first_column ? first_column.size : 0
    if @column_names.any? { |key| @columns[key].size != @row_number }
      raise "column sizes mismatch"
    end
  when CArray
    table = columns_or_table
    if column_names
      @column_names = column_names.map(&:to_s)
    elsif table.respond_to?(:column_names)
      @column_names = table.column_names.map(&:to_s)
    else
      raise "data table (CArray) has no method 'column_names'."
    end
    @columns = table_to_columns(table)
    @row_number = table.dim0
  else
    raise "unknown data"
  end
  @row_index = row_index ? row_index.to_ca.object : nil
  @__methods__ = {}
  arrange(&block) if block_given?
end
|
71
|
+
|
72
|
+
# Returns the alias table (alias name => column name) registered via #method.
def __methods__
  @__methods__
end
|
75
|
+
|
76
|
+
# Makes this frame share the state of +other+ (column names, columns,
# row index, row count and column aliases). Nothing is copied; the same
# objects are referenced. Returns self.
def replace(other)
  @column_names = other.column_names
  @columns      = other.columns
  @row_index    = other.row_index
  @row_number   = other.row_number
  # Was assigned to a misspelled ivar, so aliases were never carried over.
  @__methods__  = other.__methods__
  self
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
# Splits a 2-D table into a Hash of column name => table[nil, i], where i
# is the position of the name in @column_names.
def table_to_columns(table)
  @column_names.each_with_index.each_with_object({}) do |(name, i), cols|
    cols[name] = table[nil, i]
  end
end
|
94
|
+
|
95
|
+
# Returns a new Hash with stringified keys whose values are CArrays.
#
# CArray values are used as-is. Array values are converted with to_ca; if
# the result is not rank 1 (e.g. an Array of Arrays), an object CArray is
# built with one element per top-level entry instead. Anything else is
# converted with to_ca.
def normalize_columns(columns)
  columns.each_key.each_with_object({}) do |key, normalized|
    source = columns[key]
    normalized[key.to_s] =
      case source
      when CArray
        source
      when Array
        converted = source.to_ca
        if converted.rank != 1
          queue = source.clone
          converted = CArray.object(queue.size).convert { queue.shift }
        end
        converted
      else
        source.to_ca
      end
  end
end
|
114
|
+
|
115
|
+
public
|
116
|
+
|
117
|
+
#
|
118
|
+
# Attributes
|
119
|
+
#
|
120
|
+
|
121
|
+
attr_reader :columns, :column_names, :row_index, :row_number

# Number of columns currently in the frame. Computed from @column_names
# because no code in this class ever assigns @column_number, so the former
# attr_reader always returned nil.
def column_number
  @column_names.size
end
|
122
|
+
|
123
|
+
# True when +name+ is one of the frame's column names. The comparison is
# exact: names are stored as Strings, so a Symbol never matches.
def has_column?(name)
  @column_names.include?(name)
end
|
126
|
+
|
127
|
+
# Returns the data_type_name of every column, in column order.
def column_types
  # Previously read the misspelled @columns_names (nil) and raised.
  @column_names.map { |name| @columns[name].data_type_name }
end
|
130
|
+
|
131
|
+
#
|
132
|
+
# Column, Row Access
|
133
|
+
#
|
134
|
+
|
135
|
+
# Looks up a column by position (Integer) or by name (String/Symbol).
# Returns nil for any other argument type or for an unknown column.
def column(name_or_index)
  case name_or_index
  when Integer        then @columns[@column_names[name_or_index]]
  when String, Symbol then @columns[name_or_index.to_s]
  end
end
|
143
|
+
|
144
|
+
alias col column
|
145
|
+
|
146
|
+
# Returns one row as a CArray (one element per column, in column order).
# With a row index, +idx+ is a label resolved through @row_index.search;
# otherwise it is used directly as the element address.
def row(idx)
  addr = @row_index ? @row_index.search(idx) : idx
  @column_names.map { |name| @columns[name][addr] }.to_ca
end
|
154
|
+
|
155
|
+
# Returns CArray.int(@row_number).seq, i.e. an int array with one element
# per row filled by CArray#seq.
def index
  CArray.int(@row_number).seq
end
|
158
|
+
|
159
|
+
# Registers column aliases. Each key/value pair of +hash+ maps an alias
# name to a column name; both are stringified. Returns the updated alias
# table. Note that this shadows Object#method for data frames.
def method(hash)
  aliases = {}
  hash.each_pair { |alias_name, column_name| aliases[alias_name.to_s] = column_name.to_s }
  @__methods__.update(aliases)
end
|
166
|
+
|
167
|
+
# Lets columns be read as zero-argument methods. Lookup order:
#   1. a column with exactly that name,
#   2. a column whose name is the method name with "_" replaced by "."
#      (R-style names),
#   3. an alias registered through #method.
# Anything else (including any call with arguments) raises RuntimeError.
def method_missing(name, *args)
  if args.size == 0
    name = name.to_s
    return @columns[name] if has_column?(name)

    dotted = name.gsub(/_/, '.')   ### For R
    return @columns[dotted] if has_column?(dotted)

    return @columns[@__methods__[name]] if @__methods__.include?(name)
  end
  raise "no method '#{name}' for CADataFrame"
end
|
180
|
+
|
181
|
+
|
182
|
+
#
|
183
|
+
# Iterators
|
184
|
+
#
|
185
|
+
|
186
|
+
# Iterates over the underlying columns Hash, yielding name and column.
def each_column(&block)
  @columns.each(&block)
end
|
189
|
+
|
190
|
+
# Yields each column name in column order.
def each_column_name(&block)
  @column_names.each(&block)
end
|
193
|
+
|
194
|
+
# Yields each row label: the elements of @row_index when one is set,
# otherwise the integers 0...@row_number.
def each_row_index(&block)
  return @row_number.times(&block) unless @row_index

  @row_index.each(&block)
end
|
201
|
+
|
202
|
+
# Yields every row.
#
# with: Array (default) yields an Array of values taken from the @columns
#       Hash in its own iteration order.
# with: Hash  yields a name => value Hash. The SAME Hash object is reused
#       and overwritten for every row; dup it if you need to keep it.
#
# Raises RuntimeError for any other +with+.
def each_row(with: Array, &block)
  unless with == Array || with == Hash
    raise "invalid data type for loop variable"
  end

  if with == Array
    @row_number.times do |i|
      yield @columns.map { |_name, col| col[i] }
    end
  else
    shared = {}
    @row_number.times do |i|
      @column_names.each { |name| shared[name] = @columns[name][i] }
      yield shared
    end
  end
end
|
219
|
+
|
220
|
+
# Yields every row together with its row label.
#
# with: Array (default) yields [values, label]; values come from the
#       @columns Hash in its own iteration order.
# with: Hash  yields [name => value Hash, label]. As in #each_row, the
#       same Hash object is reused for every row.
#
# The label is the matching @row_index element, or the row position when
# the frame has no row index. Raises RuntimeError for any other +with+.
def each_row_with_row_index(with: Array, &block)
  if with == Array
    build = ->(i) { @columns.map { |_name, col| col[i] } }
  elsif with == Hash
    shared = {}
    build = lambda do |i|
      @column_names.each { |name| shared[name] = @columns[name][i] }
      shared
    end
  else
    raise "invalid data type for loop variable"
  end

  if @row_index
    @row_index.each_with_index { |label, i| yield build.call(i), label }
  else
    # The Hash variant used to run `times do |idx, i|` (i was always nil)
    # and then index the nil @row_index; the position is the label here.
    @row_number.times { |i| yield build.call(i), i }
  end
end
|
252
|
+
|
253
|
+
#
|
254
|
+
# Referencing
|
255
|
+
#
|
256
|
+
|
257
|
+
# Referencing: frame[row] or frame[row, col].
#
# One argument (col is nil):
#   CADataFrame     - treated as a mask frame: columns present in both are
#                     passed through maskout(mask column), the others are
#                     copied with to_ca; the result takes the mask frame's
#                     row index.
#   String          - same as frame[nil, name] (returns that column).
#   Array of String - same as frame[nil, names].
#   anything else   - a row selector applied to every column (an Integer
#                     is wrapped in an Array first).
#
# Two arguments:
#   col String/Symbol - returns column[row]; raises for an unknown name.
#   col Array of String - sub-frame with those columns; raises for an
#                     unknown name.
#   any other col   - treated as a positional selector over the column
#                     names (an Integer is wrapped in an Array first).
#
# Except for the single-column form, the result is a new CADataFrame whose
# row index is @row_index[row] when a row index exists.
def [](*argv)
  row, col = *argv
  picked = {}

  if col.nil?
    case row
    when CADataFrame
      each_column_name do |key|
        picked[key] =
          if row.has_column?(key)
            column(key).maskout(row.column(key))
          else
            column(key).to_ca
          end
      end
      return CADataFrame.new(picked, row_index: row.row_index ? row.row_index : nil)
    when String
      return self[nil, row]
    when Array
      return self[nil, row] if row.all? { |s| s.is_a?(String) }
    end
    row = [row] if row.is_a?(Integer)
    @column_names.each { |key| picked[key] = @columns[key][row] }
  else
    row = [row] if row.is_a?(Integer)
    case col
    when String, Symbol
      key = col.to_s
      raise "unknow column name '#{key}'" unless has_column?(key)
      return column(key)[row]
    when Array
      if col.all? { |s| s.is_a?(String) }
        col.each do |name|
          key = name.to_s
          raise "unknow column name '#{key}'" unless has_column?(key)
          picked[key] = column(key)[row]
        end
      else
        @column_names.to_ca[col].to_a.each { |key| picked[key] = column(key)[row] }
      end
    else
      col = [col] if col.is_a?(Integer)
      @column_names.to_ca[col].to_a.each { |key| picked[key] = column(key)[row] }
    end
  end

  CADataFrame.new(picked, row_index: @row_index ? @row_index[row] : nil)
end
|
332
|
+
|
333
|
+
#
|
334
|
+
# Setting Values
|
335
|
+
#
|
336
|
+
|
337
|
+
# Assignment: frame[row] = value or frame[row, col] = value.
#
# One selector (col is nil):
#   CADataFrame - for each column present in both frames, assigns +value+
#                 where the other frame's column selects.
#   String      - same as frame[nil, name] = value.
#   otherwise   - assigns to the selected rows of every column.
#
# Two selectors:
#   col String/Symbol - assigns into that column, or appends a new column
#                       named col (through #arrange) when it does not exist.
#   col Array         - assigns into each named column; raises for an
#                       unknown name.
#   any other col     - positional selector over the column names.
#
# Returns +value+.
def []=(*argv)
  value = argv.pop
  row, col = *argv

  case col
  when nil
    case row
    when CADataFrame
      each_column_name do |key|
        column(key)[row.column(key)] = value if row.has_column?(key)
      end
    when String
      self[nil, row] = value
    else
      self[row, @column_names.to_a] = value
    end
  when String, Symbol
    key = col.to_s
    if has_column?(key)
      column(key)[row] = value
    else
      arrange {
        append key, value
      }
    end
  when Array
    col.each do |name|
      key = name.to_s
      raise "unknow column name '#{key}'" unless has_column?(key)
      column(key)[row] = value
    end
  else
    col = [col] if col.is_a?(Integer)
    @column_names.to_ca[col].to_a.each { |key| column(key)[row] = value }
  end

  value
end
|
384
|
+
|
385
|
+
# For every column of +mask+ that also exists here, assigns +value+ at the
# positions selected by mask.column(key).boolean.not. Returns +value+.
def where(mask, value)
  mask.column_names.each do |key|
    next unless has_column?(key)

    column(key)[mask.column(key).boolean.not] = value
  end
  value
end
|
393
|
+
|
394
|
+
# Fills the named columns with +value+ (the last argument). Names may be
# Strings or Symbols; unknown names are skipped. Returns self.
def fill(*names, value)
  names.each do |name|
    # Stringify first: column names are stored as Strings, so a Symbol
    # used to fail has_column? and was silently skipped.
    key = name.to_s
    column(key).fill(value) if has_column?(key)
  end
  self
end
|
402
|
+
|
403
|
+
#
|
404
|
+
# Arrange
|
405
|
+
#
|
406
|
+
|
407
|
+
# Evaluates +block+ inside an Arranger bound to this frame and returns
# whatever Arranger#arrange returns.
def arrange(&block)
  Arranger.new(self).arrange(&block)
end
|
410
|
+
|
411
|
+
# Renames column +name1+ to +name2+ in place (both are stringified).
# Returns the renamed column. Raises RuntimeError when +name1+ is unknown.
def rename(name1, name2)
  old_key = name1.to_s
  new_key = name2.to_s
  position = @column_names.index(old_key)
  raise "unknown column name #{name1}" unless position

  @column_names[position] = new_key
  @columns[new_key] = @columns.delete(old_key)
end
|
421
|
+
|
422
|
+
# Lower-cases every column name in place and returns self. If two names
# collide after down-casing, the later column wins in the columns Hash.
def downcase
  pairs = []
  each_column_name { |name| pairs << [name.downcase, column(name)] }
  @column_names = pairs.map(&:first)
  @columns = pairs.to_h
  self
end
|
433
|
+
|
434
|
+
# Adds a column at the end of the frame and returns it.
#
# name       - column name (String or Symbol; stored as a String).
# new_column - the column data; non-CArray values are converted with to_ca.
# block      - used when new_column is nil: evaluated with instance_exec and
#              its result becomes the column.
# With neither, an object-typed template of the first column is used.
#
# Appending under an existing name replaces that column's data without
# listing the name twice.
#
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
def append(name, new_column = nil, &block)
  # Names are looked up as Strings everywhere else (see #column), so a
  # Symbol stored verbatim would be unreachable.
  name = name.to_s
  if new_column
    # use the given data
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 || new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  @column_names.push(name) unless @column_names.include?(name)
  @columns[name] = new_column
  new_column
end
|
452
|
+
|
453
|
+
# Adds a column at the front of the frame and returns it.
#
# name       - column name (String or Symbol; stored as a String).
# new_column - the column data; non-CArray values are converted with to_ca.
# block      - used when new_column is nil: evaluated with instance_exec and
#              its result becomes the column.
# With neither, an object-typed template of the first column is used.
#
# Leading with an existing name moves that name to the front instead of
# listing it twice.
#
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
def lead(name, new_column = nil, &block)
  # Names are looked up as Strings everywhere else (see #column), so a
  # Symbol stored verbatim would be unreachable.
  name = name.to_s
  if new_column
    # use the given data
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 || new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  @column_names.delete(name)
  @column_names.unshift(name)
  @columns[name] = new_column
  new_column
end
|
471
|
+
|
472
|
+
# Returns a new frame with the same column names, each column being
# CArray.object(0).
def vacant_copy
  empties = {}
  each_column_name { |key| empties[key] = CArray.object(0) }
  CADataFrame.new(empties)
end
|
479
|
+
|
480
|
+
# Instance shortcut for CADataFrame.merge(self, *args).
def merge(*args)
  CADataFrame.merge(self, *args)
end
|
483
|
+
|
484
|
+
# Evaluates +block+ with this frame as self and returns the block's value.
def execute(&block)
  instance_exec(&block)
end
|
487
|
+
|
488
|
+
# Reduces every column to one value and returns a one-row frame whose row
# index is [label].
#
# With a block, the value is block(name, column). Without one, +label+ is
# sent to the column as a method name. Any StandardError raised while
# computing a column's value is swallowed and UNDEF is stored instead.
def calculate(label, &block)
  summary = {}
  each_column_name do |name|
    value =
      begin
        if block
          yield(name, column(name))
        else
          column(name).send(label.intern)
        end
      rescue
        UNDEF
      end
    summary[name] = [value]
  end
  CADataFrame.new(summary, row_index: [label])
end
|
503
|
+
|
504
|
+
# Builds a new frame whose columns are block(name, column) for each
# column. A column whose block call raises a StandardError is left out of
# the result (best effort; the error is discarded).
def resample(&block)
  resampled = {}
  each_column_name do |name|
    begin
      resampled[name] = yield(name, column(name))
    rescue
      # Deliberately ignored: the column is simply omitted.
    end
  end
  CADataFrame.new(resampled)
end
|
514
|
+
|
515
|
+
# Returns a new frame restricted to +names+ (all columns when none are
# given) and to the rows selected by the block's value. The block is
# evaluated with instance_exec; without a block the row selector is nil.
def select(*names, &block)
  names = @column_names if names.empty?
  row = block ? instance_exec(&block) : nil

  chosen = {}
  names.map(&:to_s).each { |name| chosen[name] = column(name)[row] }
  CADataFrame.new(chosen, row_index: @row_index ? @row_index[row] : nil)
end
|
530
|
+
|
531
|
+
#
|
532
|
+
# Maintenance
|
533
|
+
#
|
534
|
+
|
535
|
+
# Calls unmask(value) on every column of this frame. Returns self.
def unmask!(value = nil)
  each_column_name { |name| column(name).unmask(value) }
  self
end
|
541
|
+
|
542
|
+
# Non-destructive variant of #unmask!: works on the frame returned by #to_df.
def unmask(value = nil)
  to_df.unmask!(value)
end
|
545
|
+
|
546
|
+
# Detaches this frame from data it may share with others: the columns
# Hash is cloned, every column is replaced by its to_ca result, and the
# row index (if any) is cloned. Returns self.
def detouch!
  @columns = @columns.clone
  each_column_name { |name| @columns[name] = @columns[name].to_ca }
  @row_index = @row_index.clone if @row_index
  self
end
|
556
|
+
|
557
|
+
#
|
558
|
+
# Transformation
|
559
|
+
#
|
560
|
+
|
561
|
+
# Returns a new frame without the named columns (names are stringified).
# With no names, returns self unchanged.
def eliminate_columns(*names)
  return self if names.empty?

  dropped = names.map(&:to_s)
  kept = {}
  each_column_name do |name|
    kept[name] = column(name) unless dropped.include?(name)
  end
  CADataFrame.new(kept, row_index: @row_index)
end
|
574
|
+
|
575
|
+
# Evaluates +block+ with instance_exec to obtain a row selector, then
# returns a new frame with every column (and the row index, if any)
# indexed by it.
def reorder(&block)
  order = instance_exec(&block)
  rearranged = {}
  each_column_name { |name| rearranged[name] = column(name)[order] }
  CADataFrame.new(rearranged, row_index: @row_index ? @row_index[order] : nil)
end
|
583
|
+
|
584
|
+
# Returns a frame reordered by CA.sort_addr over the sort keys.
#
# The keys are the named columns when names are given. Otherwise the block
# is evaluated with instance_exec: a CArray result is the single key, an
# Array result is the list of keys.
def order_by(*names, &block)
  keys =
    if !names.empty?
      @columns.values_at(*names.map { |s| s.to_s })
    elsif block
      result = instance_exec(&block)
      case result
      when CArray then [result]
      when Array  then result
      end
    end
  reorder { CA.sort_addr(*keys) }
end
|
600
|
+
|
601
|
+
# Returns a new frame built from the reverse of every column and of the
# row index (when present).
def reverse
  flipped = {}
  each_column_name { |name| flipped[name] = column(name).reverse }
  CADataFrame.new(flipped, row_index: @row_index ? @row_index.reverse : nil)
end
|
608
|
+
|
609
|
+
# Returns the transposed frame: the current column names become the row
# index.
#
# column_names: - names for the new columns (stringified). When omitted
#                 they are the stringified row index, or a sequence
#                 produced by seq("a", :succ) when there is no row index.
def transpose(column_names: nil)
  if column_names
    # The supplied names were ignored before: the body called `header`,
    # which is not a method of a plain CADataFrame.
    column_names = column_names.map(&:to_s)
  elsif @row_index
    column_names = @row_index.convert(:object) { |v| v.to_s }
  else
    column_names = CArray.object(@row_number).seq("a", :succ)
  end
  CADataFrame.new(ca.transpose, row_index: @column_names.to_ca, column_names: column_names)
end
|
621
|
+
|
622
|
+
# Returns a new frame (same row index) in which +suf+ is appended to every
# column name.
def add_suffix(suf)
  renamed = {}
  each_column_name { |name| renamed[name.to_s + suf] = column(name) }
  CADataFrame.new(renamed, row_index: @row_index)
end
|
630
|
+
|
631
|
+
#
|
632
|
+
# Conversions
|
633
|
+
#
|
634
|
+
|
635
|
+
# Returns a detached copy of this frame: a new CADataFrame over the same
# columns and row index, followed by #detouch!.
def to_df
  snapshot = {}
  each_column_name { |name| snapshot[name] = column(name) }
  CADataFrame.new(snapshot, row_index: @row_index).detouch!
end
|
642
|
+
|
643
|
+
# Returns a new frame (same row index) whose columns are column.object for
# each column.
def objectify
  objects = {}
  each_column_name { |name| objects[name] = column(name).object }
  CADataFrame.new(objects, row_index: @row_index)
end
|
650
|
+
|
651
|
+
# Returns a CADFArray over the named columns (stringified), or over all
# columns when no names are given.
def ca(*names)
  selected = names.empty? ? @column_names : names.map(&:to_s)
  CADFArray.new(selected, @columns)
end
|
658
|
+
|
659
|
+
# Same selection as #ca, converted with to_ca.
def to_ca(*names)
  ca(*names).to_ca
end
|
662
|
+
|
663
|
+
# Returns a Hash of column name => column.to_a.
def to_hash
  @columns.each_with_object({}) do |(name, column), result|
    result[name] = column.to_a
  end
end
|
670
|
+
|
671
|
+
# Builds a lookup Hash from one key column to one or more value columns.
#
# key_name    - name of the column supplying the keys.
# value_names - a String (values are single elements) or an Array of
#               Strings (values are Arrays, one element per named column).
#
# Later rows overwrite earlier rows with the same key. Raises ArgumentError
# for unknown column names or an unsupported value_names type.
def columns_to_hash(key_name, value_names)
  unless @column_names.include?(key_name)
    raise ArgumentError, "include invalid key column name #{key_name}"
  end

  case value_names
  when String
    unless @column_names.include?(value_names)
      raise ArgumentError, "invalid key column name #{value_names}"
    end
    keys = @columns[key_name]
    values = @columns[value_names]
    (0...@row_number).each_with_object({}) do |i, lookup|
      lookup[keys[i]] = values[i]
    end
  when Array
    unless value_names.all? { |s| @column_names.include?(s) }
      raise ArgumentError, "include invalid column name in #{value_names.join(' ')}"
    end
    keys = @columns[key_name]
    picked = @columns.values_at(*value_names)
    (0...@row_number).each_with_object({}) do |i, lookup|
      lookup[keys[i]] = picked.map { |c| c[i] }
    end
  else
    raise ArgumentError, "invalud argument"
  end
end
|
700
|
+
|
701
|
+
private
|
702
|
+
|
703
|
+
# Cell formatter for #ascii_table: Floats use "%.6g", everything else to_s.
def __obj_to_string__(obj)
  return "%.6g" % obj if obj.is_a?(Float)

  obj.to_s
end
|
711
|
+
|
712
|
+
# Display width used by #ascii_table: every multi-byte character counts as
# two columns, every single-byte character as one.
def __strwidth__(string)
  return string.length if string.ascii_only?

  string.each_char.reduce(0) do |width, ch|
    width + (ch.bytesize > 1 ? 2 : 1)
  end
end
|
719
|
+
|
720
|
+
public
|
721
|
+
|
722
|
+
# Renders the frame as a text table and returns it as a String.
#
# rowmax - :full (default) prints every row. An Integer smaller than the
#          row count prints the leading and trailing rows with a "..." row
#          between them.
#
# The first, unnamed (" ") column shows the row index, or the row position
# when the frame has no row index. Two layouts exist: a plain one when all
# names and cells are ASCII, and one that pads by display width
# (see #__strwidth__) when multi-byte text is present.
def ascii_table(rowmax = :full)
  namelist = [" "] + @column_names
  labels = @row_index ? @row_index : CArray.int(@row_number).seq
  tbl = CADFArray.new(namelist, @columns.clone.update(" " => labels))

  if rowmax.is_a?(Integer) && @row_number > rowmax
    list = tbl[0..(rowmax/2),nil].to_a
    list.push namelist.map { "..." }
    list.push(*(tbl[-rowmax/2+1..-1,nil].to_a))
    tbl = list.to_ca
  end

  datastr = tbl.convert {|c| __obj_to_string__(c) }.unmask("")
  datamb  = datastr.convert(:boolean, &:"ascii_only?").not.sum(0).ne(0)
  namemb  = namelist.to_ca.convert(:boolean) {|c| c.to_s.ascii_only? }.eq(0)
  mb      = datamb.or(namemb)
  namelen = namelist.map(&:length).to_ca
  datalen = datastr.convert(&:length)
  has_rows = datalen.size > 0

  # NOTE(review): hrule joins columns with "--" while header and data rows
  # join with " " as the source shows; confirm the separator width.
  if mb.max == 0
    lengths = has_rows ? datalen.max(0).pmax(namelen).to_a : namelen.to_a
    hrule  = "-" + lengths.map {|n| "-"*n }.join("--") + "-"
    header = " " +
             [namelist, lengths].transpose.map{|name, n|
               "#{name.to_s.ljust(n)}" }.join(" ") + " "
    ary = [hrule, header, hrule]
    if has_rows
      datastr[:i,nil].each_with_index do |blk, i|
        list = blk.flatten.to_a
        ary << " " + [list, lengths].transpose.map{|value, n|
                 "#{value.ljust(n)}"}.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: rows#=#{@row_number}: \n" + ary.join("\n")
  else
    namewidth = namelist.to_ca.convert{|c| __strwidth__(c.to_s) }
    if has_rows
      datawidth = datastr.convert{|c| __strwidth__(c.to_s) }
      maxwidth  = datawidth.max(0).pmax(namewidth)
      # Per-cell ljust width: target display width adjusted by the gap
      # between display width and character count of that cell.
      pad = maxwidth[:*,nil] - datawidth + datalen
    else
      # No data rows: datawidth does not exist, so the per-cell padding
      # must not be computed (it used to be, and failed on nil).
      maxwidth = namewidth
    end
    hrule  = "-" + maxwidth.map {|n| "-"*n }.join("--") + "-"
    header = " " +
             [namelist, maxwidth.to_a].transpose.map{|name, n|
               "#{name.to_s.ljust(n-__strwidth__(name.to_s)+name.to_s.length)}" }.join(" ") + " "
    ary = [hrule, header, hrule]
    if has_rows
      datastr[:i,nil].each_with_addr do |blk, i|
        list = blk.flatten.to_a
        ary << " " + list.map.with_index {|value, j|
                 "#{value.ljust(pad[i,j])}"}.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: row#=#{@row_number}: \n" + ary.join("\n")
  end
end
|
787
|
+
|
788
|
+
|
789
|
+
# Abbreviated table: #ascii_table limited by a rowmax of 10.
def inspect
  ascii_table(10)
end
|
792
|
+
|
793
|
+
# Full table: #ascii_table with its default (no row limit).
def to_s
  ascii_table
end
|
796
|
+
|
797
|
+
# Returns a one-element Array holding #to_s.
# NOTE(review): presumably so that `puts frame` prints the table rather
# than iterating the frame - confirm the intent.
def to_ary
  [to_s]
end
|
800
|
+
|
801
|
+
|
802
|
+
end
|
803
|
+
|
804
|
+
#############################################################
|
805
|
+
#
|
806
|
+
# ARRANGER
|
807
|
+
#
|
808
|
+
#############################################################
|
809
|
+
|
810
|
+
|
811
|
+
class CADataFrame

  # Small DSL context used by CADataFrame#arrange. The block given to
  # #arrange is evaluated with instance_exec on an Arranger, so the private
  # helpers below are available as bare words inside it, and columns of the
  # wrapped frame can be read as zero-argument methods.
  class Arranger

    def initialize(dataframe)
      @dataframe = dataframe
    end

    # Evaluates +block+ in this context and returns the wrapped frame.
    def arrange(&block)
      instance_exec(&block)
      @dataframe
    end

    private

    def column_names
      @dataframe.column_names
    end

    def row_number
      @dataframe.row_number
    end

    # Registers column aliases on the frame (see CADataFrame#method).
    def method(hash)
      @dataframe.method(hash)
    end

    # Replaces column +name+ with column.strptime(fmt).
    def timeseries(name, fmt = "%Y-%m-%d %H:%M:%S")
      key = name.to_s
      @dataframe.columns[key] = @dataframe.columns[key].strptime(fmt)
    end

    # Replaces column +name+ with column.to_type(type). When +mask+ is
    # given, it is then passed to maskout! on the converted column.
    def type(type, name, mask = :novalue)
      key = name.to_s
      @dataframe.columns[key] = @dataframe.columns[key].to_type(type)
      if mask != :novalue
        # Used to read an undefined local `options`, raising NameError.
        @dataframe.columns[key].maskout!(mask)
      end
    end

    # Removes the named columns from the frame in place.
    def eliminate(*names)
      return self if names.empty?

      names = names.map(&:to_s)
      # Collect first, then delete: deleting from column_names while
      # iterating it skipped the element following each deleted one.
      doomed = @dataframe.column_names.select { |name| names.include?(name) }
      doomed.each do |name|
        @dataframe.columns.delete(name)
        @dataframe.column_names.delete(name)
      end
      @dataframe.column_names
    end

    # Delegates to template(...) of the frame's first column.
    def template(*args, &block)
      @dataframe.columns.first[1].template(*args, &block)
    end

    # Converts the named columns to :double; raises for an unknown name.
    def double(*names)
      names.flatten.map(&:to_s).each do |name|
        unless @dataframe.column_names.include?(name)
          raise "Unknown column name '#{name}'"
        end
        type(:double, name)
      end
    end

    # Converts the named columns to :int; raises for an unknown name.
    def int(*names)
      names.flatten.map(&:to_s).each do |name|
        unless @dataframe.column_names.include?(name)
          raise "Unknown column name '#{name}'"
        end
        type(:int, name)
      end
    end

    # Calls maskout!(value) on each named column.
    def maskout(value, *names)
      names.flatten.map(&:to_s).each do |name|
        @dataframe.columns[name].maskout!(value)
      end
    end

    # Calls unmask(value) on each named column.
    def unmask(value, *names)
      names.flatten.map(&:to_s).each do |name|
        @dataframe.columns[name].unmask(value)
      end
    end

    def col(name)
      @dataframe.col(name)
    end

    # Adds a column at the end. A nil/omitted +new_column+ yields an
    # object-typed template of the first column. (The second parameter was
    # mandatory although the body already handled its absence.)
    def append(name, new_column = nil)
      new_column ||= @dataframe.columns.first[1].template(:object)
      new_column = new_column.to_ca unless new_column.is_a?(CArray)
      @dataframe.columns[name.to_s] = new_column
      @dataframe.column_names.push(name.to_s)
    end

    # Adds a column at the front; otherwise like #append.
    def lead(name, new_column = nil)
      new_column ||= @dataframe.columns.first[1].template(:object)
      new_column = new_column.to_ca unless new_column.is_a?(CArray)
      @dataframe.columns[name.to_s] = new_column
      @dataframe.column_names.unshift(name.to_s)
    end

    # Renames a column in place; raises for an unknown name.
    def rename(name1, name2)
      old_key = name1.to_s
      new_key = name2.to_s
      idx = @dataframe.column_names.index(old_key)
      raise "unknown column name #{name1}" unless idx

      @dataframe.column_names[idx] = new_key
      @dataframe.columns[new_key] = @dataframe.columns.delete(old_key)
    end

    def downcase
      @dataframe.downcase
    end

    def classify(name, scale, opt = {})
      @dataframe.classify(name, scale, opt)
    end

    # Maps the values of a column through +mapper+ and returns the result.
    #
    # name_or_column - a column name, a CArray, or an Array.
    # mapper         - Hash (each value v becomes mapper[v]), or a
    #                  CArray/Array (mapper.project(column)).
    # Returns nil for any other mapper type.
    def map(mapper, name_or_column)
      column =
        case name_or_column
        when String, Symbol then @dataframe.columns[name_or_column.to_s]
        when CArray         then name_or_column
        when Array          then name_or_column.to_ca
        else raise "invalid argument"
        end

      case mapper
      when Hash
        # Used to index `hash` (i.e. Object#hash, an Integer), not the mapper.
        column.convert(:object) { |v| mapper[v] }
      when CArray
        mapper.project(column)
      when Array
        mapper.to_ca.project(column)
      end
    end

    # Zero-argument calls resolve to a column of the frame or to one of its
    # registered aliases; everything else goes to the default handler.
    def method_missing(name, *args)
      if args.size == 0
        key = name.to_s
        if @dataframe.column_names.include?(key)
          return @dataframe.columns[key]
        elsif @dataframe.__methods__.include?(key)
          return @dataframe.columns[@dataframe.__methods__[key]]
        end
      end
      super
    end

    # Keeps respond_to? consistent with #method_missing.
    def respond_to_missing?(name, include_private = false)
      key = name.to_s
      @dataframe.column_names.include?(key) ||
        @dataframe.__methods__.include?(key) ||
        super
    end

  end

end
|
983
|
+
|
984
|
+
#############################################################
|
985
|
+
#
|
986
|
+
# Class methods
|
987
|
+
#
|
988
|
+
#############################################################
|
989
|
+
|
990
|
+
class CADataFrame

  # Combines the columns of all given frames into one frame. When a name
  # occurs in several frames the last one wins. The row index is taken
  # from the first frame.
  def self.merge(*args)
    combined = args.each_with_object({}) do |table, acc|
      table.column_names.each { |name| acc[name] = table.col(name) }
    end
    CADataFrame.new(combined, row_index: args.first.row_index)
  end

  # Stacks the given frames. For each column name of the first frame, that
  # column is taken from every frame and passed to
  # CArray.bind(data_type, list, 0), with the data type of the first
  # frame's column. Row indexes are joined with CArray.join only when
  # every frame has one; otherwise the result has no row index.
  def self.concat(*args)
    stacked = {}
    args.first.column_names.each do |name|
      parts = args.map { |t| t.col(name) }
      stacked[name] = CArray.bind(parts.first.data_type, parts, 0)
    end
    indexes = args.map(&:row_index)
    new_row_index = indexes.all? ? CArray.join(*indexes) : nil
    CADataFrame.new(stacked, row_index: new_row_index)
  end

end
|
1022
|
+
|
1023
|
+
#############################################################
|
1024
|
+
#
|
1025
|
+
# CADFArray
|
1026
|
+
#
|
1027
|
+
#############################################################
|
1028
|
+
|
1029
|
+
# Read-only object-typed 2-D view over a set of data frame columns
# (rows x selected columns). The methods other than initialize and to_ca
# supply element and mask data for the CAObject superclass.
class CADFArray < CAObject # :nodoc:

  attr_reader :column_names

  # column_names - names of the columns to expose, in display order.
  # columns      - Hash of name => column; the row count is taken from
  #                the first named column.
  def initialize(column_names, columns)
    @column_names = column_names
    @columns = columns
    row_count = @columns[@column_names.first].size
    extend CA::TableMethods
    super(:object, [row_count, @column_names.size], :read_only => true)
    __create_mask__
  end

  # Element at idx = [row, column position].
  def fetch_index(idx)
    r, c = *idx
    @columns[@column_names[c]][r]
  end

  # Copies each column's value into the matching column of +data+.
  def copy_data(data)
    @column_names.each_with_index do |name, i|
      data[nil, i] = @columns[name].value
    end
  end

  # Intentionally empty.
  def create_mask
  end

  # Mask element at idx = [row, column position]; 0 for an unmasked column.
  def mask_fetch_index(idx)
    r, c = *idx
    source = @columns[@column_names[c]]
    source.has_mask? ? source.mask[r] : 0
  end

  # Copies the mask of every masked column into +data+.
  def mask_copy_data(data)
    @column_names.each_with_index do |name, i|
      next unless @columns[name].has_mask?

      data[nil, i] = @columns[name].mask
    end
  end

  # Converts via the superclass, then extends the result with
  # CA::TableMethods and assigns it this view's column names.
  def to_ca
    table = super
    table.extend CA::TableMethods
    table.column_names = @column_names
    table
  end

end
|
1083
|
+
|
1084
|
+
#############################################################
|
1085
|
+
#
|
1086
|
+
# BASIC Comparison
|
1087
|
+
#
|
1088
|
+
#############################################################
|
1089
|
+
|
1090
|
+
|
1091
|
+
# Operators and predicates that are forwarded to the frame's +ca+ object;
# each result is wrapped in a new CADataFrame with the same column names.
class CADataFrame

  # Unary minus.
  def -@
    cmp(:-@)
  end

  # Binary comparisons against +other+: <, <=, >, >=
  [:<, :<=, :>, :>=].each do |operator|
    define_method(operator) { |other| cmp(operator, other) }
  end

  # Argument-less predicates: is_masked, is_finite
  [:is_masked, :is_finite].each do |predicate|
    define_method(predicate) { cmp(predicate) }
  end

  private

  # Sends +method+ (with +argv+) to +ca+ and rebuilds a data frame from
  # whatever comes back.
  def cmp(method, *argv)
    outcome = ca.send(method, *argv)
    CADataFrame.new(outcome, column_names: @column_names)
  end

end
|
1128
|
+
|
1129
|
+
#############################################################
|
1130
|
+
#
|
1131
|
+
# BASIC Manipulations
|
1132
|
+
#
|
1133
|
+
#############################################################
|
1134
|
+
|
1135
|
+
class CADataFrame

  # Builds a new frame aligned to +reference+.
  #
  # keyname::   name of the key column (converted with to_s for lookup)
  # reference:: object answering matchup(key); its result is used as the
  #             projection index for every other column and, when present,
  #             for the row index
  #
  # The key column of the result is set to +reference+ (also re-assigned
  # inside the constructor block, which covers the case where +keyname+
  # is not equal to the stored column name, e.g. a Symbol).
  def matchup(keyname, reference)
    key = column(keyname.to_s)
    idx = reference.matchup(key)
    new_columns = {}
    each_column_name do |name|
      new_columns[name] = name == keyname ? reference : column(name).project(idx)
    end
    new_row_index = @row_index ? @row_index.project(idx).unmask(nil) : nil
    CADataFrame.new(new_columns, row_index: new_row_index) {
      self.send(keyname)[] = reference
    }
  end

  # Placeholder: the body is empty, so this currently returns nil.
  def join(table, on: nil)
  end

  # Counts the entries of column +name+.
  #
  # Without +scale+ the frame is grouped by +name+ and the valid entries
  # of each group are counted. With +scale+ a CAHistogram.int is built
  # from it (passing +options+ through when given) and the result has the
  # columns <name>, <name>_L, <name>_R and :count.
  def histogram(name, scale = nil, options = nil)
    if scale.nil?
      return group_by(name).table{ { :count => col(name).count_valid } }
    end
    hist = options ? CAHistogram.int(scale, options) : CAHistogram.int(scale)
    hist.increment(@columns[name.to_s])
    hash = {
      name.to_s    => hist.midpoints[0],
      "#{name}_L"  => scale[0..-2],
      "#{name}_R"  => scale.shift(-1)[0..-2],
      :count       => hist[0..-2].to_ca,
    }
    CADataFrame.new(hash)
  end

  # Assigns a class to every entry of column +name+ and returns a frame
  # with the columns <name>_M, <name>_L, <name>_R and <name>_CLASS.
  #
  # Without +scale+ each distinct value is its own class (numbered in the
  # order returned by uniq) and _M/_L/_R all hold the distinct values.
  # With +scale+ the classes come from scale.bin; +opt+ may override
  # :include_upper (false), :include_lowest (true) and :offset (0).
  def classify(name, scale = nil, opt = {})
    column = @columns[name.to_s]
    if not scale
      mids = column.uniq
      mapper = {}
      mids.each_with_index do |v, i|
        mapper[v] = i
      end
      # Convert the column itself; the previous code called +columns+ here,
      # which is not the local holding the data being classified.
      cls = column.convert(:int32) { |v| mapper[v] }
      hash = {
        "#{name}_M" => mids,
        "#{name}_L" => mids,
        "#{name}_R" => mids,
        "#{name}_CLASS" => cls
      }
    else
      option = {
        :include_upper  => false,
        :include_lowest => true,
        :offset         => 0,
      }.update(opt)
      cls = scale.bin(column,
                      option[:include_upper],
                      option[:include_lowest],
                      option[:offset])
      mids  = ((scale + scale.shifted(-1))/2)[0..-2].to_ca
      left  = scale[0..-2]
      right = scale.shift(-1)[0..-2]
      hash = {
        "#{name}_M" => mids.project(cls).to_ca,
        "#{name}_L" => left.project(cls).to_ca,
        "#{name}_R" => right.project(cls).to_ca,
        "#{name}_CLASS" => cls
      }
    end
    CADataFrame.new(hash)
  end

  # Cross-tabulates two columns: the result has one row per sorted
  # distinct value of +name1+, one column per sorted distinct value of
  # +name2+, and each cell holds the number of rows with that pair.
  def cross(name1, name2)
    col1 = column(name1)
    col2 = column(name2)
    var1 = col1.uniq.sort
    var2 = col2.uniq.sort
    hash = {}
    # Seed every combination with zero so absent pairs still get a cell.
    var1.each do |v1|
      var2.each do |v2|
        hash[[v1,v2]] = 0
      end
    end
    CArray.join([col1, col2]).to_a.each do |item|
      hash[item] += 1
    end
    out = CArray.object(var1.size, var2.size) { 0 }
    var1.each_with_index do |v1, i|
      var2.each_with_index do |v2, j|
        out[i,j] = hash[[v1,v2]]
      end
    end
    CADataFrame.new(out, row_index: var1, column_names: var2)
  end

end
|
1246
|
+
|
1247
|
+
|
1248
|
+
#############################################################
|
1249
|
+
#
|
1250
|
+
# GROUPING
|
1251
|
+
#
|
1252
|
+
#############################################################
|
1253
|
+
|
1254
|
+
class CADataFrame

  # Groups the frame by one or more keys. A single key yields a
  # CADataFrameGroup; any other number of keys yields a
  # CADataFrameGroupMulti.
  def group_by(*names)
    return CADataFrameGroup.new(self, names[0]) if names.size == 1
    CADataFrameGroupMulti.new(self, *names)
  end

end
|
1265
|
+
|
1266
|
+
# Rows of a data frame grouped by the values of a single column.
class CADataFrameGroup

  include Enumerable

  # dataframe:: the frame to group
  # name::      either a column name (groups are the sorted distinct
  #             values of that column) or a one-entry Hash
  #             { column name => list of group keys }
  #
  # For every key the addresses of the matching rows are precomputed.
  def initialize(dataframe, name)
    @dataframe = dataframe
    if Hash === name
      name, list = name.first
      @column = @dataframe.col(name)
      @keys   = list.to_ca
    else
      @column = @dataframe.col(name)
      @keys   = @column.uniq.sort
    end
    @name  = name.to_s
    @addrs = {}
    @keys.each { |key| @addrs[key] = @column.eq(key).where }
  end

  # Runs +block+ (via execute) on the sub-frame of each group and
  # assembles the returned key/value pairs into a frame with one row per
  # group, led by the grouping column.
  def table(&block)
    per_group = []
    @keys.each { |key| per_group << @dataframe[@addrs[key]].execute(&block) }
    columns = { @name => @keys }
    per_group.each_with_index do |result, row|
      result.each do |label, value|
        (columns[label] ||= [])[row] = value
      end
    end
    CADataFrame.new(columns)
  end

  # Reduces every non-grouping column to one value per group.
  #
  # With a block, the block receives (column name, group members) and its
  # result is stored; otherwise +label+ is sent as a method to the group
  # members. A group whose computation raises keeps the UNDEF fill value.
  def calculate(label, &block)
    new_columns = { @name => @keys }
    @dataframe.each_column do |name, clmn|
      next if name == @name
      summary = CArray.object(@keys.size) { UNDEF }
      new_columns[name] = summary
      @keys.each_with_index do |key, slot|
        begin
          if block
            summary[slot] = yield(name, clmn[@addrs[key]])
          else
            summary[slot] = clmn[@addrs[key]].send(label.intern)
          end
        rescue
          # best effort: the slot stays UNDEF
        end
      end
    end
    CADataFrame.new(new_columns)
  end

  # Sub-frame for +group_value+, or a vacant copy of the frame when the
  # value is not a known group key.
  def [](group_value)
    map = @addrs[group_value]
    map ? @dataframe[map] : @dataframe.vacant_copy
  end

  # Yields the sub-frame of each group.
  def each
    @addrs.each_value { |map| yield @dataframe[map] }
  end

  # Yields each group's sub-frame together with its group key (note: the
  # second value is the key, not a positional index).
  def each_with_index
    @addrs.each { |key, map| yield @dataframe[map], key }
  end

end
|
1345
|
+
|
1346
|
+
# Rows of a data frame grouped by the combination of several columns.
class CADataFrameGroupMulti

  # dataframe:: the frame to group
  # names::     one entry per grouping column; each is either a column
  #             name (keys are the sorted distinct values) or a one-entry
  #             Hash { column name => list of keys }
  #
  # Row addresses are precomputed for every combination of keys.
  def initialize(dataframe, *names)
    @rank      = names.size
    @dataframe = dataframe
    @names     = []
    @column    = []
    @keys      = []
    names.each_with_index do |name, axis|
      if Hash === name
        name, list = name.first
        @column[axis] = @dataframe.col(name)
        @keys[axis]   = list.to_ca
      else
        @column[axis] = @dataframe.col(name)
        @keys[axis]   = @column[axis].to_ca.uniq.sort
      end
      @names[axis] = name
    end
    @addrs = {}
    each_with_keys do |combo|
      flag = @column[0].eq(combo[0])
      (1...@rank).each { |axis| flag &= @column[axis].eq(combo[axis]) }
      @addrs[combo] = flag.where
    end
  end

  # Iterates over every combination of group keys (first axis varying
  # slowest). Without a block an enumerator is returned.
  def each_with_keys(&block)
    leading  = @keys[0].to_a
    trailing = @keys[1..-1].map(&:to_a)
    leading.product(*trailing).each(&block)
  end

  # Runs +block+ (via execute) on the sub-frame of each key combination
  # and assembles a frame with one row per combination: the grouping
  # columns first, followed by the key/value pairs the block returned.
  def table(&block)
    per_group = []
    each_with_keys { |combo| per_group << @dataframe[@addrs[combo]].execute(&block) }
    columns = {}
    @names.each { |name| columns[name] = [] }
    each_with_keys.with_index do |combo, row|
      @names.each_with_index { |name, axis| columns[name][row] = combo[axis] }
    end
    per_group.each_with_index do |result, row|
      result.each do |label, value|
        (columns[label] ||= [])[row] = value
      end
    end
    CADataFrame.new(columns)
  end

  # Sub-frame for the key combination +group_value+, or a vacant copy of
  # the frame when that combination is unknown.
  def [](group_value)
    map = @addrs[group_value]
    map ? @dataframe[map] : @dataframe.vacant_copy
  end

  # Yields (key combination, sub-frame) for every combination.
  def each
    each_with_keys { |combo| yield combo, @dataframe[@addrs[combo]] }
  end

end
|
1418
|
+
|
1419
|
+
#############################################################
|
1420
|
+
#
|
1421
|
+
# PIVOT TABLE
|
1422
|
+
#
|
1423
|
+
#############################################################
|
1424
|
+
|
1425
|
+
class CADataFrame

  # Returns a CADataFramePivot over this frame for the two given keys.
  def pivot(name1, name2)
    CADataFramePivot.new(self, name1, name2)
  end

end
|
1432
|
+
|
1433
|
+
# Two-key grouping of a data frame: the first key provides the rows of
# the resulting table, the second key its columns.
class CADataFramePivot

  # dataframe::    the frame to pivot
  # name1, name2:: each either a column name (keys are the sorted distinct
  #                values) or a one-entry Hash { column name => key list }
  #
  # NOTE(review): an explicit key list is converted with to_ca for the
  # first key but stored as given for the second -- confirm whether the
  # asymmetry is intended.
  def initialize(dataframe, name1, name2)
    @dataframe = dataframe
    if Hash === name1
      name1, list = name1.first
      @column1 = @dataframe.col(name1)
      @keys1   = list.to_ca
    else
      @column1 = @dataframe.col(name1)
      @keys1   = @column1.uniq.sort
    end
    if Hash === name2
      name2, list = name2.first
      @column2 = @dataframe.col(name2)
      @keys2   = list
    else
      @column2 = @dataframe.col(name2)
      @keys2   = @column2.uniq.sort
    end
    @addrs = {}
    @keys1.each do |row_key|
      @keys2.each do |col_key|
        hits = @column1.eq(row_key) & @column2.eq(col_key)
        @addrs[[row_key, col_key]] = hits.where
      end
    end
  end

  # Runs +block+ (via execute) on the sub-frame of every key pair and
  # returns a frame indexed by the first key with one column per second
  # key. Cells start out as UNDEF.
  def table(&block)
    columns = {}
    @keys2.each { |col_key| columns[col_key] = CArray.object(@keys1.size) { UNDEF } }
    @keys1.each_with_index do |row_key, row|
      @keys2.each do |col_key|
        cell = @dataframe[@addrs[[row_key, col_key]]].execute(&block)
        columns[col_key][row] = cell
      end
    end
    CADataFrame.new(columns, row_index: @keys1)
  end

end
|
1477
|
+
|
1478
|
+
|
1479
|
+
#############################################################
|
1480
|
+
#
|
1481
|
+
# CArray
|
1482
|
+
#
|
1483
|
+
#############################################################
|
1484
|
+
|
1485
|
+
|
1486
|
+
class CArray

  # Returns a data frame with a 'value' column holding the distinct
  # elements (as returned by uniq) and a 'count' column holding how many
  # times each of them occurs.
  def value_counts
    values = uniq
    tally  = {}
    values.each { |v| tally[v] = 0 }
    each { |v| tally[v] += 1 }
    counts = values.convert { |v| tally[v] }
    CADataFrame.new({'value' => values, 'count' => counts})
  end

end
|
1502
|
+
|
1503
|
+
|
1504
|
+
|
1505
|
+
class CADataFrame

  # Forwards to to_sqlite3 of the frame's +ca+ object.
  def to_sqlite3(*args)
    ca.to_sqlite3(*args)
  end

  # Writes the frame to table +tablename+ of an in-memory SQLite3
  # database. When any column name contains '.', ' ' or '-', a copy of the
  # frame with those characters replaced by '_' is written instead.
  def to_sql(tablename)
    source = self
    if @column_names.any? { |s| s =~ /[\. \-]/ }
      renamed = {}
      each_column_name do |name|
        renamed[name.gsub(/[\. \-]/, '_')] = column(name)
      end
      source = CADataFrame.new(renamed)
    end
    source.to_sqlite3(database: ":memory:", table: tablename)
  end

end
|
1526
|
+
|
1527
|
+
module SQLite3

  class Database

    # Loads the result of +expr+ on this database as a CADataFrame
    # (delegates to CADataFrame.load_sqlite3).
    def to_df(expr)
      CADataFrame.load_sqlite3(self, expr)
    end

  end

end
|
1538
|
+
|
1539
|
+
######################################
|
1540
|
+
#
|
1541
|
+
# IO methods
|
1542
|
+
#
|
1543
|
+
######################################
|
1544
|
+
|
1545
|
+
require "spreadsheet"
|
1546
|
+
|
1547
|
+
class CArray

  # Writes the array to +filename+ as a single-worksheet workbook, one
  # worksheet row per index along the first dimension. When a block is
  # given it receives the worksheet before the file is written.
  #
  # Raises RuntimeError when the rank is 3 or more.
  def save_excel(filename, &block)
    raise "too large rank (>2) to write excel file" if rank >= 3
    book      = Spreadsheet::Workbook.new
    worksheet = book.create_worksheet
    dim0.times { |i| worksheet.row(i).push(*self[i, nil]) }
    block.call(worksheet) if block
    book.write(filename)
  end

  # Reads worksheet +sheet+ (default 0) of +filename+ and returns its rows
  # converted with to_ca.
  def self.load_excel(filename, sheet = 0)
    book = Spreadsheet.open(filename)
    rows = book.worksheet(sheet).map(&:to_a)
    rows.to_ca
  end

end
|
1571
|
+
|
1572
|
+
class CADataFrame

  # Loads via CArray.load_sqlite3 and converts the result to a data frame;
  # nil is then passed to maskout for every column.
  def self.load_sqlite3(*args)
    table = CArray.load_sqlite3(*args)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Same as load_sqlite3, but reading through CArray.load_csv.
  def self.load_csv(*args, &block)
    table = CArray.load_csv(*args, &block)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Same as load_sqlite3, but reading through CArray.from_csv.
  def self.from_csv(*args, &block)
    table = CArray.from_csv(*args, &block)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Serializes the frame as CSV by delegating to the table's to_csv.
  # When the frame has a row index and +with_row_index+ is true, the row
  # index is emitted as a leading column with an empty name.
  def to_csv(io = "", option = {}, rs: $/, sep: ",", fill: "", with_row_index: true, &block)
    tbl =
      if @row_index && with_row_index
        namelist = [""] + @column_names
        CADFArray.new(namelist, @columns.clone.update("" => @row_index))
      else
        ca.to_ca
      end
    tbl.to_csv(io, option, rs: rs, sep: sep, fill: fill, &block)
  end

  # Converts the frame to a Daru::DataFrame (daru is loaded on demand),
  # keeping the column order and, when present, the row index.
  def to_daru
    require "daru"
    columns = {}
    each_column_name { |name| columns[name] = column(name).to_a }
    unless @row_index
      return Daru::DataFrame.new(columns, order: @column_names)
    end
    Daru::DataFrame.new(columns, index: @row_index.to_a, order: @column_names)
  end

  # Writes the frame to an xlsx file through axlsx (loaded on demand).
  #
  # filename::       output path
  # sheet_name::     worksheet name
  # with_row_index:: when true, each row is preceded by its row index and
  #                  the header row by an empty cell
  #
  # Masked entries are written as the string "=NA()". A given block
  # receives the worksheet before serialization.
  def to_xlsx(filename, sheet_name: 'Sheet1', with_row_index: false, &block)
    require "axlsx"
    xl = Axlsx::Package.new
    xl.use_shared_strings = true
    sheet = xl.workbook.add_worksheet(name: sheet_name)
    df = self.to_df.objectify.unmask("=NA()")
    if with_row_index
      sheet.add_row([""] + column_names)
      df.each_row_with_row_index(with: Array) { |list, i| sheet.add_row([i] + list) }
    else
      sheet.add_row(column_names)
      df.each_row(with: Array) { |list| sheet.add_row(list) }
    end
    yield sheet if block_given?
    xl.serialize(filename)
  end

end
|
1634
|
+
|
1635
|
+
|
1636
|
+
|
1637
|
+
|
1638
|
+
|
1639
|
+
|
1640
|
+
|