carray-dataframe 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/API.txt +83 -0
- data/README.md +5 -0
- data/carray-dataframe.gemspec +25 -0
- data/examples/R/fit.rb +24 -0
- data/examples/R/iris.rb +9 -0
- data/examples/R/japan_area.rb +30 -0
- data/examples/R/kyaku.rb +22 -0
- data/examples/group_by.rb +78 -0
- data/examples/hist.rb +27 -0
- data/examples/iris.rb +29 -0
- data/examples/map.rb +23 -0
- data/examples/match.rb +21 -0
- data/examples/test.xlsx +0 -0
- data/examples/test1.rb +44 -0
- data/examples/test2.rb +14 -0
- data/examples/test3.db +0 -0
- data/examples/test3.rb +11 -0
- data/examples/test3.xlsx +0 -0
- data/examples/to_excel.rb +27 -0
- data/lib/R.rb +365 -0
- data/lib/carray/autoload/autoload_dataframe_dataframe.rb +26 -0
- data/lib/carray/dataframe/dataframe.rb +1640 -0
- metadata +106 -0
@@ -0,0 +1,1640 @@
|
|
1
|
+
require "carray"
|
2
|
+
require "carray/io/table"
|
3
|
+
|
4
|
+
module CA::TableMethods

  # Wraps this table in a CADataFrame (the block, if any, is handed to
  # CADataFrame.new).
  #
  # When the table carries @header or @note, both are copied onto the new
  # frame and the frame gains two singleton readers: +note+, and
  # +header(name)+ which looks +name+ up in the copied @header (or returns
  # the frame's column names when called without an argument).
  def to_dataframe(&block)
    frame = CADataFrame.new(self, &block)
    return frame unless @header || @note

    frame.instance_variable_set(:@header, @header)
    frame.instance_variable_set(:@note, @note)
    frame.singleton_class.class_eval do
      attr_reader :note

      def header(name = nil)
        name ? @header[name.to_s] : @column_names
      end
    end
    frame
  end

  alias to_df to_dataframe

end
|
28
|
+
|
29
|
+
class CADataFrame
|
30
|
+
|
31
|
+
#
|
32
|
+
# Constructor
|
33
|
+
#
|
34
|
+
|
35
|
+
# Builds a data frame from a Hash of columns or from a CArray table.
#
# columns_or_table:: a Hash (column name => column data, normalized by
#                    normalize_columns) or a CArray table whose columns are
#                    split out by table_to_columns.
# row_index::        optional row labels, stored as +row_index.to_ca.object+.
# column_names::     column names for a CArray table; when omitted the table
#                    must respond to +column_names+.
# block::            optional block forwarded to #arrange.
#
# An empty Hash produces a frame with no columns and zero rows.
#
# Raises RuntimeError when the Hash columns differ in size, when a CArray
# table has no column names available, or for any other input type.
def initialize(columns_or_table, row_index: nil, column_names: nil, &block)
  case columns_or_table
  when Hash
    @column_names = columns_or_table.keys.map(&:to_s)
    @columns = normalize_columns(columns_or_table)
    # Guard against an empty Hash: there is no first column to measure.
    first_column = @columns.values.first
    @row_number = first_column ? first_column.size : 0
    if @column_names.any? { |key| @columns[key].size != @row_number }
      raise "column sizes mismatch"
    end
  when CArray
    table = columns_or_table
    if column_names
      @column_names = column_names.map(&:to_s)
    elsif table.respond_to?(:column_names)
      @column_names = table.column_names.map(&:to_s)
    else
      raise "data table (CArray) has no method 'column_names'."
    end
    @columns = table_to_columns(table)
    @row_number = table.dim0
  else
    raise "unknown data"
  end
  @row_index = row_index ? row_index.to_ca.object : nil
  # Alias table consulted by method_missing (see #method).
  @__methods__ = {}
  arrange(&block) if block_given?
end
|
71
|
+
|
72
|
+
def __methods__
|
73
|
+
return @__methods__
|
74
|
+
end
|
75
|
+
|
76
|
+
# Makes this frame share the state of +other+: column names, columns,
# row index, row count and the alias table registered through #method.
# The objects are assigned as-is (no copies are made). Returns self.
def replace(other)
  @column_names = other.column_names
  @columns      = other.columns
  @row_index    = other.row_index
  @row_number   = other.row_number
  # Previously assigned to a misspelt ivar, so aliases were never taken over.
  @__methods__  = other.__methods__
  return self
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
# Splits +table+ into a Hash of columns: the i-th entry of @column_names
# is mapped to +table[nil, i]+.
def table_to_columns(table)
  @column_names.each_with_index.each_with_object({}) do |(name, position), result|
    result[name] = table[nil, position]
  end
end
|
94
|
+
|
95
|
+
# Returns a new Hash with String keys whose values are the given columns
# converted for storage:
# * a CArray is kept as it is;
# * an Array is converted with +to_ca+; if the result is not rank 1, an
#   object CArray of the Array's length is filled element by element;
# * anything else is converted with +to_ca+.
def normalize_columns(columns)
  columns.each_with_object({}) do |(key, source), normalized|
    normalized[key.to_s] =
      case source
      when CArray
        source
      when Array
        converted = source.to_ca
        if converted.rank != 1
          queue = source.clone
          converted = CArray.object(queue.size).convert { queue.shift }
        end
        converted
      else
        source.to_ca
      end
  end
end
|
114
|
+
|
115
|
+
public
|
116
|
+
|
117
|
+
#
|
118
|
+
# Attributes
|
119
|
+
#
|
120
|
+
|
121
|
+
attr_reader :columns, :column_names, :row_index, :column_number, :row_number
|
122
|
+
|
123
|
+
# True when +name+ is one of the frame's column names. The comparison is
# made against the stored names as they are (+name+ is not converted).
def has_column?(name)
  !@column_names.index(name).nil?
end
|
126
|
+
|
127
|
+
# Returns the +data_type_name+ of every column, in column-name order.
def column_types
  # The original read the misspelt @columns_names (always nil).
  return @column_names.map { |name| @columns[name].data_type_name }
end
|
130
|
+
|
131
|
+
#
|
132
|
+
# Column, Row Access
|
133
|
+
#
|
134
|
+
|
135
|
+
# Looks up one column.
#
# name_or_index:: an Integer position into the column names, or a
#                 String/Symbol column name.
#
# Returns the stored column, or nil when nothing matches (including an
# argument of any other type).
def column(name_or_index)
  if name_or_index.is_a?(Integer)
    @columns[@column_names[name_or_index]]
  elsif name_or_index.is_a?(String) || name_or_index.is_a?(Symbol)
    @columns[name_or_index.to_s]
  end
end
|
143
|
+
|
144
|
+
alias col column
|
145
|
+
|
146
|
+
# Returns one row as a CArray (via +to_ca+) holding the value of every
# column in column-name order. With a row index, +idx+ is first
# translated to a position through +@row_index.search+; otherwise it is
# used directly as the position.
def row(idx)
  position = @row_index ? @row_index.search(idx) : idx
  @column_names.map { |name| @columns[name][position] }.to_ca
end
|
154
|
+
|
155
|
+
def index
|
156
|
+
return CArray.int(@row_number).seq
|
157
|
+
end
|
158
|
+
|
159
|
+
# Registers accessor aliases: every key/value of +hash+ is stored (both
# converted to String) in the alias table, so that method_missing can
# resolve the key to the column named by the value.
# Returns the updated alias table.
def method(hash)
  aliases = hash.each_with_object({}) do |(key, value), acc|
    acc[key.to_s] = value.to_s
  end
  @__methods__.update(aliases)
end
|
166
|
+
|
167
|
+
# Column access by method name. For a call without arguments the name is
# tried, in order, as: a column name, a column name with every "_"
# replaced by "." (R-style names), and an alias registered via #method.
# Anything else raises RuntimeError.
def method_missing(name, *args)
  if args.empty?
    name = name.to_s
    dotted = name.gsub(/_/, '.')   ### For R
    return @columns[name] if has_column?(name)
    return @columns[dotted] if has_column?(dotted)
    return @columns[@__methods__[name]] if @__methods__.include?(name)
  end
  raise "no method '#{name}' for CADataFrame"
end
|
180
|
+
|
181
|
+
|
182
|
+
#
|
183
|
+
# Iterators
|
184
|
+
#
|
185
|
+
|
186
|
+
def each_column (&block)
|
187
|
+
return @columns.each(&block)
|
188
|
+
end
|
189
|
+
|
190
|
+
def each_column_name (&block)
|
191
|
+
return @column_names.each(&block)
|
192
|
+
end
|
193
|
+
|
194
|
+
# Yields every row label: the entries of the row index when one is set,
# otherwise the positions 0...row_number.
def each_row_index(&block)
  return @row_number.times(&block) unless @row_index
  @row_index.each(&block)
end
|
201
|
+
|
202
|
+
# Yields every row in turn.
#
# with:: Array (default) to receive each row as an Array of values in
#        column-name order, or Hash to receive a column name => value Hash.
#
# Values are ordered by @column_names, which is the order used by #row;
# the insertion order of the @columns Hash can differ from it after
# #lead or #rename. Each Hash row is a fresh object, so rows collected by
# the caller stay distinct.
#
# Raises RuntimeError for any other +with+.
def each_row(with: Array, &block)
  if with == Array
    @row_number.times do |i|
      yield @column_names.map { |name| @columns[name][i] }
    end
  elsif with == Hash
    @row_number.times do |i|
      row = {}
      @column_names.each { |name| row[name] = @columns[name][i] }
      yield row
    end
  else
    raise "invalid data type for loop variable"
  end
end
|
219
|
+
|
220
|
+
# Yields every row together with its row label.
#
# with:: Array (default) to receive each row as an Array of values in
#        column-name order, or Hash to receive a column name => value Hash.
#
# The second block argument is the matching entry of the row index, or
# the row position when the frame has no row index. Each Hash row is a
# fresh object.
#
# Raises RuntimeError for any other +with+.
def each_row_with_row_index(with: Array, &block)
  build_row =
    if with == Array
      ->(i) { @column_names.map { |name| @columns[name][i] } }
    elsif with == Hash
      ->(i) { @column_names.each_with_object({}) { |name, row| row[name] = @columns[name][i] } }
    else
      raise "invalid data type for loop variable"
    end
  if @row_index
    @row_index.each_with_index { |label, i| yield build_row.call(i), label }
  else
    # The original Hash branch iterated with |idx, i| (i always nil) and
    # indexed the missing @row_index; the position itself is the label.
    @row_number.times { |i| yield build_row.call(i), i }
  end
end
|
252
|
+
|
253
|
+
#
|
254
|
+
# Referencing
|
255
|
+
#
|
256
|
+
|
257
|
+
# Referencing: +frame[row]+ or +frame[row, col]+.
#
# With a single argument:
# * a CADataFrame: every column is copied; columns that also exist in the
#   argument are masked out with the argument's column. The row index of
#   the argument is used for the result.
# * a String, or an Array made only of Strings: treated as a column
#   selection (same as +frame[nil, arg]+).
# * anything else: a row selector applied to every column (an Integer is
#   wrapped in an Array first).
#
# With two arguments, +row+ selects rows (an Integer is wrapped in an
# Array) and +col+ selects columns:
# * String/Symbol: returns that column indexed by +row+ (not a frame);
# * Array of Strings: the named columns;
# * anything else: a selector applied to the list of column names
#   (an Integer is wrapped in an Array).
#
# Raises RuntimeError for an unknown column name.
def [](*argv)
  row, col = *argv
  picked = {}
  sliced_index = ->(selector) { @row_index ? @row_index[selector] : nil }

  if col.nil?
    case row
    when CADataFrame
      each_column_name do |key|
        picked[key] =
          if row.has_column?(key)
            column(key).maskout(row.column(key))
          else
            column(key).to_ca
          end
      end
      return CADataFrame.new(picked, row_index: row.row_index ? row.row_index : nil)
    when String
      return self[nil, row]
    end
    return self[nil, row] if row.is_a?(Array) && row.all? { |s| s.is_a?(String) }

    row = [row] if row.is_a?(Integer)
    @column_names.each { |key| picked[key] = @columns[key][row] }
    return CADataFrame.new(picked, row_index: sliced_index.call(row))
  end

  row = [row] if row.is_a?(Integer)

  if col.is_a?(String) || col.is_a?(Symbol)
    key = col.to_s
    raise "unknow column name '#{key}'" unless has_column?(key)
    return column(key)[row]
  end

  if col.is_a?(Array) && col.all? { |s| s.is_a?(String) }
    col.each do |name|
      key = name.to_s
      raise "unknow column name '#{key}'" unless has_column?(key)
      picked[key] = column(key)[row]
    end
  else
    col = [col] if col.is_a?(Integer)
    @column_names.to_ca[col].to_a.each { |key| picked[key] = column(key)[row] }
  end
  CADataFrame.new(picked, row_index: sliced_index.call(row))
end
|
332
|
+
|
333
|
+
#
|
334
|
+
# Setting Values
|
335
|
+
#
|
336
|
+
|
337
|
+
# Setting values: +frame[row] = value+ or +frame[row, col] = value+.
#
# With a single index argument:
# * a CADataFrame: for every column that also exists in the argument,
#   the argument's column is used as the selector for the assignment;
# * a String: treated as a column name (same as +frame[nil, arg] = value+);
# * anything else: a row selector applied to all columns.
#
# With two index arguments, +col+ may be:
# * String/Symbol: assigns into that column, or appends +value+ as a new
#   column (through #arrange) when the name is unknown;
# * Array: a list of column names, each of which must exist;
# * anything else: a selector applied to the list of column names
#   (an Integer is wrapped in an Array).
#
# Returns +value+. Raises RuntimeError for an unknown name in an Array.
def []=(*argv)
  value = argv.pop
  row, col = *argv
  if col.nil?
    case row
    when CADataFrame
      each_column_name do |key|
        column(key)[row.column(key)] = value if row.has_column?(key)
      end
    when String
      self[nil, row] = value
    else
      self[row, @column_names.to_a] = value
    end
  elsif col.is_a?(String) || col.is_a?(Symbol)
    key = col.to_s
    if has_column?(key)
      column(key)[row] = value
    else
      arrange { append key, value }
    end
  elsif col.is_a?(Array)
    col.each do |name|
      key = name.to_s
      raise "unknow column name '#{key}'" unless has_column?(key)
      column(key)[row] = value
    end
  else
    col = [col] if col.is_a?(Integer)
    @column_names.to_ca[col].to_a.each { |key| column(key)[row] = value }
  end
  return value
end
|
384
|
+
|
385
|
+
# For every column of +mask+ that also exists in this frame, assigns
# +value+ to the elements selected by +mask.column(key).boolean.not+.
# Returns +value+.
def where(mask, value)
  mask.column_names.each do |key|
    next unless has_column?(key)
    column(key)[mask.column(key).boolean.not] = value
  end
  value
end
|
393
|
+
|
394
|
+
# Fills the named columns with +value+ (calls +fill+ on each column).
#
# names:: column names as Strings or Symbols; unknown names are skipped.
# value:: the fill value (last argument).
#
# Returns self.
def fill(*names, value)
  names.each do |name|
    # Column names are stored as Strings, so normalize before the lookup;
    # a Symbol name used to be skipped silently.
    key = name.to_s
    column(key).fill(value) if has_column?(key)
  end
  return self
end
|
402
|
+
|
403
|
+
#
|
404
|
+
# Arrange
|
405
|
+
#
|
406
|
+
|
407
|
+
def arrange (&block)
|
408
|
+
return Arranger.new(self).arrange(&block)
|
409
|
+
end
|
410
|
+
|
411
|
+
# Renames column +name1+ to +name2+ (both converted to String). The
# name keeps its position in the column-name list; in the columns Hash
# the entry is removed and re-inserted under the new key.
# Raises RuntimeError when +name1+ is not a column.
def rename(name1, name2)
  old_key = name1.to_s
  new_key = name2.to_s
  position = @column_names.index(old_key)
  raise "unknown column name #{name1}" unless position

  @column_names[position] = new_key
  @columns[new_key] = @columns.delete(old_key)
end
|
421
|
+
|
422
|
+
# Lower-cases every column name in place (both the name list and the
# keys of the columns Hash). Returns self.
def downcase
  renamed = @column_names.map { |name| [name.downcase, @columns[name.to_s]] }
  @column_names = renamed.map(&:first)
  @columns = Hash[renamed]
  self
end
|
433
|
+
|
434
|
+
# Adds a column at the end of the frame.
#
# name::       column name (String or Symbol; stored as a String).
# new_column:: the column data. When omitted, the block's result
#              (evaluated with instance_exec) is used; with neither, an
#              object-typed template of the first column is created.
#
# Non-CArray data is converted with +to_ca+. When a column of that name
# already exists its data is replaced and the name is not listed twice.
#
# Returns the stored column. Raises RuntimeError unless the column is
# rank 1 and has exactly row_number elements.
def append(name, new_column = nil, &block)
  if new_column
    # use the given column as it is
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 or new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  # Every other accessor looks columns up by String name.
  key = name.to_s
  @column_names.push(key) unless @column_names.include?(key)
  @columns[key] = new_column
  return new_column
end
|
452
|
+
|
453
|
+
# Adds a column at the front of the frame.
#
# name::       column name (String or Symbol; stored as a String).
# new_column:: the column data. When omitted, the block's result
#              (evaluated with instance_exec) is used; with neither, an
#              object-typed template of the first column is created.
#
# Non-CArray data is converted with +to_ca+. When a column of that name
# already exists its data is replaced and the name is moved to the front
# instead of being listed twice.
#
# Returns the stored column. Raises RuntimeError unless the column is
# rank 1 and has exactly row_number elements.
def lead(name, new_column = nil, &block)
  if new_column
    # use the given column as it is
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 or new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  # Every other accessor looks columns up by String name.
  key = name.to_s
  @column_names.delete(key)
  @column_names.unshift(key)
  @columns[key] = new_column
  return new_column
end
|
471
|
+
|
472
|
+
def vacant_copy
|
473
|
+
new_columns = {}
|
474
|
+
each_column_name do |key|
|
475
|
+
new_columns[key] = CArray.object(0)
|
476
|
+
end
|
477
|
+
return CADataFrame.new(new_columns)
|
478
|
+
end
|
479
|
+
|
480
|
+
def merge (*args)
|
481
|
+
return CADataFrame.merge(self, *args)
|
482
|
+
end
|
483
|
+
|
484
|
+
def execute (&block)
|
485
|
+
return instance_exec(&block)
|
486
|
+
end
|
487
|
+
|
488
|
+
def calculate (label, &block)
|
489
|
+
hash = {}
|
490
|
+
each_column_name do |name|
|
491
|
+
begin
|
492
|
+
if block
|
493
|
+
hash[name] = [yield(name, column(name))]
|
494
|
+
else
|
495
|
+
hash[name] = [column(name).send(label.intern)]
|
496
|
+
end
|
497
|
+
rescue
|
498
|
+
hash[name] = [UNDEF]
|
499
|
+
end
|
500
|
+
end
|
501
|
+
return CADataFrame.new(hash, row_index: [label])
|
502
|
+
end
|
503
|
+
|
504
|
+
def resample (&block)
|
505
|
+
new_columns = {}
|
506
|
+
each_column_name do |name|
|
507
|
+
begin
|
508
|
+
new_columns[name] = yield(name, column(name))
|
509
|
+
rescue
|
510
|
+
end
|
511
|
+
end
|
512
|
+
return CADataFrame.new(new_columns)
|
513
|
+
end
|
514
|
+
|
515
|
+
def select (*names, &block)
|
516
|
+
if names.empty?
|
517
|
+
names = @column_names
|
518
|
+
end
|
519
|
+
if block
|
520
|
+
row = instance_exec(&block)
|
521
|
+
else
|
522
|
+
row = nil
|
523
|
+
end
|
524
|
+
new_columns = {}
|
525
|
+
names.map(&:to_s).each do |name|
|
526
|
+
new_columns[name] = column(name)[row]
|
527
|
+
end
|
528
|
+
return CADataFrame.new(new_columns, row_index: @row_index ? @row_index[row] : nil)
|
529
|
+
end
|
530
|
+
|
531
|
+
#
|
532
|
+
# Maintenance
|
533
|
+
#
|
534
|
+
|
535
|
+
def unmask! (value = nil)
|
536
|
+
each_column_name do |name|
|
537
|
+
column(name).unmask(value)
|
538
|
+
end
|
539
|
+
return self
|
540
|
+
end
|
541
|
+
|
542
|
+
def unmask (value = nil)
|
543
|
+
return to_df.unmask!(value)
|
544
|
+
end
|
545
|
+
|
546
|
+
def detouch!
|
547
|
+
@columns = @columns.clone
|
548
|
+
each_column_name do |name|
|
549
|
+
@columns[name] = @columns[name].to_ca
|
550
|
+
end
|
551
|
+
if @row_index
|
552
|
+
@row_index = @row_index.clone
|
553
|
+
end
|
554
|
+
return self
|
555
|
+
end
|
556
|
+
|
557
|
+
#
|
558
|
+
# Transformation
|
559
|
+
#
|
560
|
+
|
561
|
+
def eliminate_columns (*names)
|
562
|
+
if names.empty?
|
563
|
+
return self
|
564
|
+
end
|
565
|
+
names = names.map(&:to_s)
|
566
|
+
new_columns = {}
|
567
|
+
each_column_name do |name|
|
568
|
+
unless names.include?(name)
|
569
|
+
new_columns[name] = column(name)
|
570
|
+
end
|
571
|
+
end
|
572
|
+
return CADataFrame.new(new_columns, row_index: @row_index)
|
573
|
+
end
|
574
|
+
|
575
|
+
def reorder (&block)
|
576
|
+
index = instance_exec(&block)
|
577
|
+
new_columns = {}
|
578
|
+
each_column_name do |name|
|
579
|
+
new_columns[name] = column(name)[index]
|
580
|
+
end
|
581
|
+
return CADataFrame.new(new_columns, row_index: @row_index ? @row_index[index] : nil)
|
582
|
+
end
|
583
|
+
|
584
|
+
def order_by (*names, &block)
|
585
|
+
if names.empty?
|
586
|
+
if block
|
587
|
+
ret = instance_exec(&block)
|
588
|
+
case ret
|
589
|
+
when CArray
|
590
|
+
list = [ret]
|
591
|
+
when Array
|
592
|
+
list = ret
|
593
|
+
end
|
594
|
+
end
|
595
|
+
else
|
596
|
+
list = @columns.values_at(*names.map{|s| s.to_s})
|
597
|
+
end
|
598
|
+
return reorder { CA.sort_addr(*list) }
|
599
|
+
end
|
600
|
+
|
601
|
+
def reverse
|
602
|
+
new_columns = {}
|
603
|
+
each_column_name do |name|
|
604
|
+
new_columns[name] = column(name).reverse
|
605
|
+
end
|
606
|
+
return CADataFrame.new(new_columns, row_index: @row_index ? @row_index.reverse : nil)
|
607
|
+
end
|
608
|
+
|
609
|
+
# Returns a new frame built from the transposed table (+ca.transpose+).
# The current column names become the row index of the result.
#
# column_names:: names for the new columns (converted to Strings).
#                When omitted, the row index entries converted to String
#                are used, or, without a row index, the sequence
#                "a", "b", ... generated with +succ+.
def transpose(column_names: nil)
  if column_names
    # The original ignored the argument and called an undefined +header+.
    column_names = column_names.map(&:to_s)
  elsif @row_index
    column_names = @row_index.convert(:object) { |v| v.to_s }
  else
    column_names = CArray.object(@row_number).seq("a", :succ)
  end
  return CADataFrame.new(ca.transpose, row_index: @column_names.to_ca, column_names: column_names)
end
|
621
|
+
|
622
|
+
def add_suffix (suf)
|
623
|
+
new_columns = {}
|
624
|
+
each_column_name do |name|
|
625
|
+
new_name = (name.to_s + suf).to_s
|
626
|
+
new_columns[new_name] = column(name)
|
627
|
+
end
|
628
|
+
return CADataFrame.new(new_columns, row_index: @row_index)
|
629
|
+
end
|
630
|
+
|
631
|
+
#
|
632
|
+
# Conversions
|
633
|
+
#
|
634
|
+
|
635
|
+
def to_df
|
636
|
+
new_columns = {}
|
637
|
+
each_column_name do |name|
|
638
|
+
new_columns[name] = column(name)
|
639
|
+
end
|
640
|
+
return CADataFrame.new(new_columns, row_index: @row_index).detouch!
|
641
|
+
end
|
642
|
+
|
643
|
+
def objectify
|
644
|
+
new_columns = {}
|
645
|
+
each_column_name do |name|
|
646
|
+
new_columns[name] = column(name).object
|
647
|
+
end
|
648
|
+
return CADataFrame.new(new_columns, row_index: @row_index)
|
649
|
+
end
|
650
|
+
|
651
|
+
def ca (*names)
|
652
|
+
if names.empty?
|
653
|
+
return CADFArray.new(@column_names, @columns)
|
654
|
+
else
|
655
|
+
return CADFArray.new(names.map(&:to_s), @columns)
|
656
|
+
end
|
657
|
+
end
|
658
|
+
|
659
|
+
def to_ca (*names)
|
660
|
+
return ca(*names).to_ca
|
661
|
+
end
|
662
|
+
|
663
|
+
# Returns a Hash mapping every column name to the column's values as a
# plain Array (+to_a+), in the order of the columns Hash.
def to_hash
  @columns.each_with_object({}) do |(name, column), result|
    result[name] = column.to_a
  end
end
|
670
|
+
|
671
|
+
# Builds a lookup Hash from two or more columns.
#
# key_name::    name of the column providing the Hash keys.
# value_names:: a single column name (each Hash value is that column's
#               element) or an Array of column names (each Hash value is
#               an Array with one element per named column).
#
# Names may be Strings or Symbols. When a key occurs more than once the
# last row wins.
#
# Raises ArgumentError for an unknown column name or for a +value_names+
# that is neither a name nor an Array.
def columns_to_hash(key_name, value_names)
  key_name = key_name.to_s
  unless @column_names.include?(key_name)
    raise ArgumentError, "include invalid key column name #{key_name}"
  end
  key_column = @columns[key_name]
  hash = {}
  case value_names
  when String, Symbol
    value_name = value_names.to_s
    unless @column_names.include?(value_name)
      raise ArgumentError, "invalid value column name #{value_name}"
    end
    value_column = @columns[value_name]
    @row_number.times do |i|
      hash[key_column[i]] = value_column[i]
    end
  when Array
    names = value_names.map(&:to_s)
    unless names.all? { |s| @column_names.include?(s) }
      raise ArgumentError, "include invalid column name in #{names.join(' ')}"
    end
    value_columns = @columns.values_at(*names)
    @row_number.times do |i|
      hash[key_column[i]] = value_columns.map { |c| c[i] }
    end
  else
    raise ArgumentError, "invalid argument"
  end
  return hash
end
|
700
|
+
|
701
|
+
private
|
702
|
+
|
703
|
+
# Text used for one table cell: Floats are formatted with "%.6g",
# everything else with +to_s+.
def __obj_to_string__(obj)
  obj.is_a?(Float) ? format("%.6g", obj) : obj.to_s
end
|
711
|
+
|
712
|
+
# Display width of +string+: its length when it is ASCII only,
# otherwise the sum over its characters of 2 for a multi-byte character
# and 1 for a single-byte one.
def __strwidth__(string)
  return string.length if string.ascii_only?

  string.each_char.reduce(0) do |width, char|
    width + (char.bytesize > 1 ? 2 : 1)
  end
end
|
719
|
+
|
720
|
+
public
|
721
|
+
|
722
|
+
# Renders the frame as a text table.
#
# rowmax:: when an Integer smaller than the row count, only the leading
#          and trailing rows are printed, separated by a row of "...".
#          Any other value (default :full) prints every row.
#
# The first column (headed " ") shows the row index, or the row
# positions when the frame has none. Cells are produced by
# __obj_to_string__ and masked cells are shown as empty strings. When any
# column name or cell contains non-ASCII text, padding is computed from
# __strwidth__ instead of the plain string length.
#
# Returns the table as a String.
def ascii_table(rowmax = :full)
  namelist = [" "] + @column_names
  if @row_index
    tbl = CADFArray.new(namelist, @columns.clone.update(" " => @row_index))
  else
    tbl = CADFArray.new(namelist, @columns.clone.update(" " => CArray.int(@row_number).seq))
  end
  if rowmax.is_a?(Integer) and @row_number > rowmax
    list = tbl[0..(rowmax/2),nil].to_a
    list.push namelist.map { "..." }
    list.push(*tbl[-rowmax/2+1..-1,nil].to_a)
    tbl = list.to_ca
  end
  datastr = tbl.convert {|c| __obj_to_string__(c) }.unmask("")
  datamb  = datastr.convert(:boolean, &:"ascii_only?").not.sum(0).ne(0)
  namemb  = namelist.to_ca.convert(:boolean) {|c| c.to_s.ascii_only? }.eq(0)
  mb      = datamb.or(namemb)
  namelen = namelist.map(&:length).to_ca
  datalen = datastr.convert(&:length)
  if mb.max == 0
    # ASCII-only table: string length equals display width.
    if datalen.size == 0
      lengths = namelen.to_a
    else
      lengths = datalen.max(0).pmax(namelen).to_a
    end
    hrule  = "-" + lengths.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, lengths].transpose.map{|name, len|
                                  "#{name.to_s.ljust(len)}" }.join("  ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_index do |blk, i|
        list = blk.flatten.to_a
        ary << " " + [list, lengths].transpose.map{|value, len|
                                  "#{value.ljust(len)}"}.join("  ") + " "
      end
    end
    ary << hrule
    return "DataFrame: rows#=#{@row_number}: \n" + ary.join("\n")
  else
    # Table with multi-byte text: pad by display width.
    namewidth = namelist.to_ca.convert{|c| __strwidth__(c.to_s) }
    if datalen.size == 0
      maxwidth  = namewidth
    else
      datawidth = datastr.convert{|c| __strwidth__(c.to_s) }
      maxwidth  = datawidth.max(0).pmax(namewidth)
    end
    hrule  = "-" + maxwidth.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, maxwidth.to_a].transpose.map{|name, len|
                                  "#{name.to_s.ljust(len-__strwidth__(name.to_s)+name.to_s.length)}" }.join("  ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      # Per-cell ljust width. Computed only when data rows exist, because
      # datawidth is not set for an empty table.
      pad = maxwidth[:*,nil] - datawidth + datalen
      datastr[:i,nil].each_with_addr do |blk, i|
        list = blk.flatten.to_a
        ary << " " + list.map.with_index {|value, j|
                                  "#{value.ljust(pad[i,j])}"}.join("  ") + " "
      end
    end
    ary << hrule
    return "DataFrame: row#=#{@row_number}: \n" + ary.join("\n")
  end
end
|
787
|
+
|
788
|
+
|
789
|
+
def inspect
|
790
|
+
return ascii_table(10)
|
791
|
+
end
|
792
|
+
|
793
|
+
def to_s
|
794
|
+
return ascii_table
|
795
|
+
end
|
796
|
+
|
797
|
+
def to_ary
|
798
|
+
return [to_s]
|
799
|
+
end
|
800
|
+
|
801
|
+
|
802
|
+
end
|
803
|
+
|
804
|
+
#############################################################
|
805
|
+
#
|
806
|
+
# ARRANGER
|
807
|
+
#
|
808
|
+
#############################################################
|
809
|
+
|
810
|
+
|
811
|
+
class CADataFrame
|
812
|
+
|
813
|
+
class Arranger
|
814
|
+
|
815
|
+
def initialize (dataframe)
|
816
|
+
@dataframe = dataframe
|
817
|
+
end
|
818
|
+
|
819
|
+
def arrange (&block)
|
820
|
+
instance_exec(&block)
|
821
|
+
return @dataframe
|
822
|
+
end
|
823
|
+
|
824
|
+
private
|
825
|
+
|
826
|
+
def column_names
|
827
|
+
return @dataframe.column_names
|
828
|
+
end
|
829
|
+
|
830
|
+
def row_number
|
831
|
+
return @dataframe.row_number
|
832
|
+
end
|
833
|
+
|
834
|
+
def method (hash)
|
835
|
+
@dataframe.method(hash)
|
836
|
+
end
|
837
|
+
|
838
|
+
def timeseries (name, fmt = "%Y-%m-%d %H:%M:%S")
|
839
|
+
@dataframe.columns[name.to_s] = @dataframe.columns[name.to_s].strptime(fmt)
|
840
|
+
end
|
841
|
+
|
842
|
+
# Converts column +name+ to +type+ (via +to_type+) and stores the result
# back into the frame.
#
# mask:: when given, the converted column is additionally masked with
#        +maskout!(mask)+. The default :novalue means "no masking".
def type(type, name, mask = :novalue)
  key = name.to_s
  @dataframe.columns[key] = @dataframe.columns[key].to_type(type)
  if mask != :novalue
    # The original referenced an undefined +options+ Hash here.
    @dataframe.columns[key].maskout!(mask)
  end
end
|
848
|
+
|
849
|
+
# Removes the named columns from the frame in place (both from the
# columns Hash and from the column-name list). Names may be Strings or
# Symbols; unknown names are ignored.
#
# Returns self when called without names, otherwise the frame's
# column-name list.
def eliminate(*names)
  return self if names.empty?

  doomed = names.map(&:to_s)
  # delete_if removes safely during traversal; the original deleted from
  # the list inside +each+, which skipped the element after every hit.
  @dataframe.column_names.delete_if do |name|
    next false unless doomed.include?(name)
    @dataframe.columns.delete(name)
    true
  end
end
|
861
|
+
|
862
|
+
def template (*args, &block)
|
863
|
+
return @dataframe.columns.first[1].template(*args, &block)
|
864
|
+
end
|
865
|
+
|
866
|
+
def double (*names)
|
867
|
+
names.flatten.map(&:to_s).each do |name|
|
868
|
+
if @dataframe.column_names.include?(name)
|
869
|
+
type(:double, name)
|
870
|
+
else
|
871
|
+
raise "Unknown column name '#{name}'"
|
872
|
+
end
|
873
|
+
end
|
874
|
+
end
|
875
|
+
|
876
|
+
def int (*names)
|
877
|
+
names.flatten.map(&:to_s).each do |name|
|
878
|
+
if @dataframe.column_names.include?(name)
|
879
|
+
type(:int, name)
|
880
|
+
else
|
881
|
+
raise "Unknown column name '#{name}'"
|
882
|
+
end
|
883
|
+
end
|
884
|
+
end
|
885
|
+
|
886
|
+
def maskout (value, *names)
|
887
|
+
names.flatten.map(&:to_s).each do |name|
|
888
|
+
@dataframe.columns[name].maskout!(value)
|
889
|
+
end
|
890
|
+
end
|
891
|
+
|
892
|
+
def unmask (value, *names)
|
893
|
+
names.flatten.map(&:to_s).each do |name|
|
894
|
+
@dataframe.columns[name].unmask(value)
|
895
|
+
end
|
896
|
+
end
|
897
|
+
|
898
|
+
def col (name)
|
899
|
+
return @dataframe.col(name)
|
900
|
+
end
|
901
|
+
|
902
|
+
def append (name, new_column)
|
903
|
+
if new_column
|
904
|
+
# do nothing
|
905
|
+
else
|
906
|
+
new_column = @dataframe.columns.first[1].template(:object)
|
907
|
+
end
|
908
|
+
unless new_column.is_a?(CArray)
|
909
|
+
new_column = new_column.to_ca
|
910
|
+
end
|
911
|
+
@dataframe.columns[name.to_s] = new_column
|
912
|
+
@dataframe.column_names.push(name.to_s)
|
913
|
+
end
|
914
|
+
|
915
|
+
def lead (name, new_column)
|
916
|
+
if new_column
|
917
|
+
# do nothing
|
918
|
+
else
|
919
|
+
new_column = @dataframe.columns.first[1].template(:object)
|
920
|
+
end
|
921
|
+
unless new_column.is_a?(CArray)
|
922
|
+
new_column = new_column.to_ca
|
923
|
+
end
|
924
|
+
@dataframe.columns[name.to_s] = new_column
|
925
|
+
@dataframe.column_names.unshift(name.to_s)
|
926
|
+
end
|
927
|
+
|
928
|
+
def rename (name1, name2)
|
929
|
+
if idx = @dataframe.column_names.index(name1.to_s)
|
930
|
+
@dataframe.column_names[idx] = name2.to_s
|
931
|
+
column = @dataframe.columns[name1.to_s]
|
932
|
+
@dataframe.columns.delete(name1.to_s)
|
933
|
+
@dataframe.columns[name2.to_s] = column
|
934
|
+
else
|
935
|
+
raise "unknown column name #{name1}"
|
936
|
+
end
|
937
|
+
end
|
938
|
+
|
939
|
+
def downcase
|
940
|
+
@dataframe.downcase
|
941
|
+
end
|
942
|
+
|
943
|
+
def classify (name, scale, opt = {})
|
944
|
+
return @dataframe.classify(name, scale, opt)
|
945
|
+
end
|
946
|
+
|
947
|
+
def map (mapper, name_or_column)
|
948
|
+
case name_or_column
|
949
|
+
when String, Symbol
|
950
|
+
name = name_or_column
|
951
|
+
column = @dataframe.columns[name.to_s]
|
952
|
+
when CArray
|
953
|
+
column = name_or_column
|
954
|
+
when Array
|
955
|
+
column = name_or_column.to_ca
|
956
|
+
else
|
957
|
+
raise "invalid argument"
|
958
|
+
end
|
959
|
+
case mapper
|
960
|
+
when Hash
|
961
|
+
return column.convert(:object) {|v| hash[v] }
|
962
|
+
when CArray
|
963
|
+
return mapper.project(column)
|
964
|
+
when Array
|
965
|
+
return mapper.to_ca.project(column)
|
966
|
+
end
|
967
|
+
end
|
968
|
+
|
969
|
+
def method_missing (name, *args)
|
970
|
+
if args.size == 0
|
971
|
+
if @dataframe.column_names.include?(name.to_s)
|
972
|
+
return @dataframe.columns[name.to_s]
|
973
|
+
elsif @dataframe.__methods__.include?(name.to_s)
|
974
|
+
return @dataframe.columns[@dataframe.__methods__[name.to_s]]
|
975
|
+
end
|
976
|
+
end
|
977
|
+
super
|
978
|
+
end
|
979
|
+
|
980
|
+
end
|
981
|
+
|
982
|
+
end
|
983
|
+
|
984
|
+
#############################################################
|
985
|
+
#
|
986
|
+
# Class methods
|
987
|
+
#
|
988
|
+
#############################################################
|
989
|
+
|
990
|
+
class CADataFrame
|
991
|
+
|
992
|
+
# Combines the columns of all given frames into one new frame. When a
# column name occurs more than once, the column of the later frame
# replaces the earlier one. The row index of the first frame is used.
def self.merge(*args)
  combined = args.each_with_object({}) do |table, acc|
    table.column_names.each { |name| acc[name] = table.col(name) }
  end
  CADataFrame.new(combined, row_index: args.first.row_index)
end
|
1002
|
+
|
1003
|
+
# Concatenates the given frames into one new frame. For every column
# name of the first frame, the same-named columns of all frames are
# bound together with +CArray.bind+ (using the data type of the first
# frame's column). The row indices are joined with +CArray.join+ when
# every frame has one; otherwise the result has no row index.
def self.concat(*args)
  ref = args.first
  new_columns = ref.column_names.each_with_object({}) do |name, acc|
    pieces = args.map { |t| t.col(name) }
    acc[name] = CArray.bind(pieces.first.data_type, pieces, 0)
  end
  indices = args.map(&:row_index)
  new_row_index = indices.all? ? CArray.join(*indices) : nil
  CADataFrame.new(new_columns, row_index: new_row_index)
end
|
1019
|
+
|
1020
|
+
|
1021
|
+
end
|
1022
|
+
|
1023
|
+
#############################################################
|
1024
|
+
#
|
1025
|
+
# CADFArray
|
1026
|
+
#
|
1027
|
+
#############################################################
|
1028
|
+
|
1029
|
+
class CADFArray < CAObject # :nodoc:

  attr_reader :column_names

  # Read-only object-typed 2-D view over a set of named columns:
  # one row per element of the first named column, one column per name.
  def initialize(column_names, columns)
    @column_names = column_names
    @columns      = columns
    row_count     = @columns[@column_names.first].size
    extend CA::TableMethods
    super(:object, [row_count, @column_names.size], :read_only=>true)
    __create_mask__
  end

  # Element at idx = [row, column]: looked up in the column named by the
  # column position.
  def fetch_index(idx)
    row, col = *idx
    @columns[@column_names[col]][row]
  end

  # Fills +data+ column by column with the +value+ of each named column.
  def copy_data(data)
    @column_names.each_with_index do |name, position|
      data[nil, position] = @columns[name].value
    end
  end

  # Intentionally empty.
  def create_mask
  end

  # Mask element at idx = [row, column]; 0 when the underlying column
  # has no mask.
  def mask_fetch_index(idx)
    row, col = *idx
    source = @columns[@column_names[col]]
    return 0 unless source.has_mask?
    source.mask[row]
  end

  # Fills +data+ with the mask of every column that has one; positions of
  # columns without a mask are not written.
  def mask_copy_data(data)
    @column_names.each_with_index do |name, position|
      source = @columns[name]
      data[nil, position] = source.mask if source.has_mask?
    end
  end

  # Converts to a plain CArray that is extended with CA::TableMethods and
  # carries over the column names.
  def to_ca
    converted = super
    converted.extend CA::TableMethods
    converted.column_names = @column_names
    converted
  end

end
|
1083
|
+
|
1084
|
+
#############################################################
|
1085
|
+
#
|
1086
|
+
# BASIC Comparison
|
1087
|
+
#
|
1088
|
+
#############################################################
|
1089
|
+
|
1090
|
+
|
1091
|
+
class CADataFrame

  # Each public method below forwards one operator or predicate to the
  # array returned by +ca+ and wraps the result in a new data frame that
  # keeps the current column names (see #cmp).

  # Unary minus.
  def -@
    cmp(:-@)
  end

  # Element-wise "less than" against +other+.
  def <(other)
    cmp(:<, other)
  end

  # Element-wise "less than or equal" against +other+.
  def <=(other)
    cmp(:<=, other)
  end

  # Element-wise "greater than" against +other+.
  def >(other)
    cmp(:>, other)
  end

  # Element-wise "greater than or equal" against +other+.
  def >=(other)
    cmp(:>=, other)
  end

  # Forwards +is_masked+ to the underlying array.
  def is_masked
    cmp(:is_masked)
  end

  # Forwards +is_finite+ to the underlying array.
  def is_finite
    cmp(:is_finite)
  end

  private

  # Sends +method+ (with +argv+) to +ca+ and builds a data frame from the
  # result, reusing this frame's column names.
  def cmp(method, *argv)
    result = ca.send(method, *argv)
    CADataFrame.new(result, column_names: @column_names)
  end

end
|
1128
|
+
|
1129
|
+
#############################################################
|
1130
|
+
#
|
1131
|
+
# BASIC Manipulations
|
1132
|
+
#
|
1133
|
+
#############################################################
|
1134
|
+
|
1135
|
+
class CADataFrame

  # Builds a new data frame aligned to +reference+.
  #
  # keyname::   name of the key column in this frame
  # reference:: array the key column is matched against
  #
  # +idx = reference.matchup(key)+ is used to project every non-key column
  # (and the row index, if any); the key column of the result is set to
  # +reference+.
  #
  # NOTE(review): +name == keyname+ only holds when +keyname+ has the same
  # type as the column names; otherwise the key column is projected first
  # and then overwritten with +reference+ by the constructor block.
  def matchup(keyname, reference)
    key = column(keyname.to_s)
    idx = reference.matchup(key)
    new_columns = {}
    each_column_name do |name|
      new_columns[name] = (name == keyname) ? reference : column(name).project(idx)
    end
    new_row_index = @row_index ? @row_index.project(idx).unmask(nil) : nil
    CADataFrame.new(new_columns, row_index: new_row_index) {
      self.send(keyname)[] = reference
    }
  end

  # Placeholder: the body is empty in this version, so the call returns nil.
  def join(table, on: nil)
  end

  # Histogram of column +name+.
  #
  # Without +scale+, groups by +name+ and counts the valid entries of each
  # group. With +scale+, fills a CAHistogram.int(scale[, options]) from the
  # column and returns a frame with the midpoints, the left/right bin edges
  # ("<name>_L", "<name>_R") and the :count column.
  def histogram(name, scale = nil, options = nil)
    if scale.nil?
      return group_by(name).table{ { :count => col(name).count_valid } }
    end
    hist = options ? CAHistogram.int(scale, options) : CAHistogram.int(scale)
    hist.increment(@columns[name.to_s])
    hash = {
      name.to_s   => hist.midpoints[0],
      "#{name}_L" => scale[0..-2],
      "#{name}_R" => scale.shift(-1)[0..-2],
      :count      => hist[0..-2].to_ca,
    }
    CADataFrame.new(hash)
  end

  # Classifies column +name+ and returns a frame with the columns
  # "<name>_M", "<name>_L", "<name>_R" and "<name>_CLASS".
  #
  # Without +scale+, every distinct value is its own class and the class
  # number is the position of the value in +column.uniq+.
  # With +scale+, classes come from +scale.bin+; +opt+ may override
  # :include_upper (false), :include_lowest (true) and :offset (0).
  def classify(name, scale = nil, opt = {})
    if not scale
      column = @columns[name.to_s]
      mids   = column.uniq
      mapper = {}
      mids.each_with_index do |v, i|
        mapper[v] = i
      end
      # Convert the column itself; the original called +columns.convert+,
      # which addressed the frame's column collection rather than this column.
      cls = column.convert(:int32) { |v| mapper[v] }
      # NOTE(review): here _M/_L/_R hold one entry per distinct value while
      # _CLASS holds one entry per row -- confirm whether mids.project(cls)
      # was intended, as in the scale branch below.
      hash = {
        "#{name}_M" => mids,
        "#{name}_L" => mids,
        "#{name}_R" => mids,
        "#{name}_CLASS" => cls
      }
    else
      option = {
        :include_upper  => false,
        :include_lowest => true,
        :offset         => 0,
      }.update(opt)
      column = @columns[name.to_s]
      cls = scale.bin(column,
                      option[:include_upper],
                      option[:include_lowest],
                      option[:offset])
      mids  = ((scale + scale.shifted(-1))/2)[0..-2].to_ca
      left  = scale[0..-2]
      right = scale.shift(-1)[0..-2]
      hash = {
        "#{name}_M" => mids.project(cls).to_ca,
        "#{name}_L" => left.project(cls).to_ca,
        "#{name}_R" => right.project(cls).to_ca,
        "#{name}_CLASS" => cls
      }
    end
    return CADataFrame.new(hash)
  end

  # Cross tabulation of two columns: counts how often each pair of values
  # (one from +name1+, one from +name2+) occurs in the same row. Rows of the
  # result are indexed by the sorted distinct values of +name1+, columns are
  # named after the sorted distinct values of +name2+.
  def cross(name1, name2)
    col1 = column(name1)
    col2 = column(name2)
    var1 = col1.uniq.sort
    var2 = col2.uniq.sort
    # Seed every combination with 0 so pairs that never occur still appear.
    hash = {}
    var1.each do |v1|
      var2.each do |v2|
        hash[[v1,v2]] = 0
      end
    end
    CArray.join([col1, col2]).to_a.each do |item|
      hash[item] += 1
    end
    out = CArray.object(var1.size, var2.size) { 0 }
    var1.each_with_index do |v1, i|
      var2.each_with_index do |v2, j|
        out[i,j] = hash[[v1,v2]]
      end
    end
    return CADataFrame.new(out, row_index: var1, column_names: var2)
  end

end
|
1246
|
+
|
1247
|
+
|
1248
|
+
#############################################################
|
1249
|
+
#
|
1250
|
+
# GROUPING
|
1251
|
+
#
|
1252
|
+
#############################################################
|
1253
|
+
|
1254
|
+
class CADataFrame

  # Groups the rows of this frame by one or more columns.
  #
  # A single name yields a CADataFrameGroup; any other number of names
  # yields a CADataFrameGroupMulti.
  def group_by(*names)
    return CADataFrameGroup.new(self, names[0]) if names.size == 1
    CADataFrameGroupMulti.new(self, *names)
  end

end
|
1265
|
+
|
1266
|
+
# Rows of a data frame grouped by the values of a single column.
class CADataFrameGroup

  include Enumerable

  # dataframe:: the frame to group
  # name::      a column name, or a one-entry Hash {name => keys} that
  #             supplies the group keys explicitly
  #
  # Without explicit keys the sorted distinct values of the column are
  # used. For every key the addresses of the matching rows are stored.
  def initialize(dataframe, name)
    @dataframe = dataframe
    case name
    when Hash
      name, list = name.first
      @column = @dataframe.col(name)
      @keys   = list.to_ca
    else
      @column = @dataframe.col(name)
      @keys   = @column.uniq.sort
    end
    @name  = name.to_s
    @addrs = {}
    @keys.each do |k|
      @addrs[k] = @column.eq(k).where
    end
  end

  # Evaluates +block+ on each group (via +execute+ on the group's sub-frame)
  # and collects the returned hashes into one frame: the first column holds
  # the group keys, the others hold the per-group values under the hash keys.
  def table(&block)
    hashpool = @keys.map { |k| @dataframe[@addrs[k]].execute(&block) }
    columns = {@name=>@keys}
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    return CADataFrame.new(columns)
  end

  # Reduces every column other than the grouping column, group by group.
  #
  # With a block, the cell value is +yield(name, group_values)+; without
  # one, +label+ is sent as a method to the group's values.
  # A cell whose computation raises is left as UNDEF (best effort).
  def calculate(label, &block)
    new_columns = {@name=>@keys}
    @dataframe.each_column do |name, clmn|
      next if name == @name
      new_columns[name] = CArray.object(@keys.size) { UNDEF }
      @keys.each_with_index do |k, i|
        begin
          if block
            new_columns[name][i] = yield(name, clmn[@addrs[k]])
          else
            new_columns[name][i] = clmn[@addrs[k]].send(label.intern)
          end
        rescue
          # deliberately ignored: the cell keeps its UNDEF placeholder
        end
      end
    end
    return CADataFrame.new(new_columns)
  end

  # Returns the sub-frame for +group_value+, or a vacant copy of the frame
  # when there is no address entry for that value.
  def [](group_value)
    addr = @addrs[group_value]
    addr ? @dataframe[addr] : @dataframe.vacant_copy
  end

  # Yields the sub-frame of every group. Returns an Enumerator when no
  # block is given instead of failing on +yield+.
  def each
    return enum_for(:each) unless block_given?
    @addrs.each do |_key, addr|
      yield @dataframe[addr]
    end
  end

  # Yields each group's sub-frame together with its group key (the key
  # takes the place of the usual integer index). Returns an Enumerator
  # when no block is given.
  def each_with_index
    return enum_for(:each_with_index) unless block_given?
    @addrs.each do |key, addr|
      yield @dataframe[addr], key
    end
  end

end
|
1345
|
+
|
1346
|
+
# Rows of a data frame grouped by the combination of several columns.
class CADataFrameGroupMulti

  # dataframe:: the frame to group
  # names::     column names; each may instead be a one-entry Hash
  #             {name => keys} that supplies the keys for that column
  #
  # Row addresses are stored for every combination of keys (one key per
  # column), including combinations that match no row.
  def initialize(dataframe, *names)
    @rank = names.size
    @dataframe = dataframe
    @names  = []
    @column = []
    @keys   = []
    names.each_with_index do |name, i|
      case name
      when Hash
        name, list = name.first
        @column[i] = @dataframe.col(name)
        @keys[i]   = list.to_ca
      else
        @column[i] = @dataframe.col(name)
        @keys[i]   = @column[i].to_ca.uniq.sort
      end
      @names[i] = name
    end
    @addrs = {}
    each_with_keys do |list|
      flag = @column[0].eq(list[0])
      (1...@rank).each do |i|
        flag &= @column[i].eq(list[i])
      end
      @addrs[list] = flag.where
    end
  end

  # Iterates over every combination of group keys (the product of the
  # per-column key lists). Without a block an Enumerator is returned.
  def each_with_keys(&block)
    first, *rest = @keys
    first.to_a.product(*rest.map(&:to_a)).each(&block)
  end

  # Evaluates +block+ on each group (via +execute+ on its sub-frame) and
  # collects the results into one frame: one column per grouping name with
  # the key combination of each group, followed by the columns named by the
  # keys of the returned hashes.
  def table(&block)
    hashpool = []
    each_with_keys do |list|
      hashpool << @dataframe[@addrs[list]].execute(&block)
    end
    columns = {}
    @names.each do |name|
      columns[name] = []
    end
    each_with_keys.with_index do |list, j|
      @names.each_with_index do |name, i|
        columns[name][j] = list[i]
      end
    end
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    return CADataFrame.new(columns)
  end

  # Returns the sub-frame for the key combination +group_value+, or a
  # vacant copy of the frame when there is no address entry for it.
  def [](group_value)
    addr = @addrs[group_value]
    addr ? @dataframe[addr] : @dataframe.vacant_copy
  end

  # Yields the key combination and the sub-frame of every group. Returns an
  # Enumerator when no block is given instead of failing on +yield+.
  def each
    return enum_for(:each) unless block_given?
    each_with_keys do |key|
      yield key, @dataframe[@addrs[key]]
    end
  end

end
|
1418
|
+
|
1419
|
+
#############################################################
|
1420
|
+
#
|
1421
|
+
# PIVOT TABLE
|
1422
|
+
#
|
1423
|
+
#############################################################
|
1424
|
+
|
1425
|
+
class CADataFrame

  # Creates a CADataFramePivot over this frame for the two given columns
  # (+name1+ and +name2+ are passed through unchanged).
  def pivot(name1, name2)
    CADataFramePivot.new(self, name1, name2)
  end

end
|
1432
|
+
|
1433
|
+
# Pivot of a data frame over two columns: the keys of the first column
# become the row index, the keys of the second become the column names.
class CADataFramePivot

  # dataframe::    the frame to pivot
  # name1, name2:: column names; each may instead be a one-entry Hash
  #                {name => keys} that supplies the keys explicitly
  #
  # Without explicit keys the sorted distinct values of the column are
  # used. Row addresses are stored for every (key1, key2) pair.
  def initialize(dataframe, name1, name2)
    @dataframe = dataframe

    case name1
    when Hash
      name1, list = name1.first
      @column1 = @dataframe.col(name1)
      @keys1   = list.to_ca
    else
      @column1 = @dataframe.col(name1)
      @keys1   = @column1.uniq.sort
    end

    case name2
    when Hash
      name2, list = name2.first
      @column2 = @dataframe.col(name2)
      @keys2   = list
    else
      @column2 = @dataframe.col(name2)
      @keys2   = @column2.uniq.sort
    end

    @addrs = {}
    @keys1.each do |k1|
      @keys2.each do |k2|
        both = @column1.eq(k1) & @column2.eq(k2)
        @addrs[[k1,k2]] = both.where
      end
    end
  end

  # Evaluates +block+ (via +execute+) on the sub-frame of every
  # (key1, key2) pair and returns a frame with one column per key2 and one
  # row per key1. Cells start out as UNDEF.
  def table(&block)
    columns = {}
    @keys2.each { |k2| columns[k2] = CArray.object(@keys1.size) { UNDEF } }
    @keys1.each_with_index do |k1, i|
      @keys2.each do |k2|
        columns[k2][i] = @dataframe[@addrs[[k1,k2]]].execute(&block)
      end
    end
    CADataFrame.new(columns, row_index: @keys1)
  end

end
|
1477
|
+
|
1478
|
+
|
1479
|
+
#############################################################
|
1480
|
+
#
|
1481
|
+
# CArray
|
1482
|
+
#
|
1483
|
+
#############################################################
|
1484
|
+
|
1485
|
+
|
1486
|
+
class CArray

  # Counts how often each distinct value occurs in this array and returns
  # a data frame with the columns 'value' (the result of +uniq+) and
  # 'count' (the number of occurrences of each value).
  def value_counts
    values = uniq
    tally  = {}
    values.each { |v| tally[v] = 0 }
    each { |v| tally[v] += 1 }
    counts = values.convert { |v| tally[v] }
    CADataFrame.new({'value' => values, 'count' => counts})
  end

end
|
1502
|
+
|
1503
|
+
|
1504
|
+
|
1505
|
+
class CADataFrame

  # Forwards all arguments to +to_sqlite3+ of the array returned by +ca+.
  def to_sqlite3(*args)
    ca.to_sqlite3(*args)
  end

  # Stores the frame as table +tablename+ in an in-memory SQLite3 database
  # (database ":memory:") and returns what +to_sqlite3+ returns.
  #
  # If any column name contains '.', ' ' or '-', a frame with those
  # characters replaced by '_' in the column names is stored instead.
  def to_sql(tablename)
    unless @column_names.any? { |s| s =~ /[\. \-]/ }
      return to_sqlite3(database: ":memory:", table: tablename)
    end
    renamed = {}
    each_column_name do |name|
      renamed[name.gsub(/[\. \-]/, '_')] = column(name)
    end
    CADataFrame.new(renamed).to_sqlite3(database: ":memory:", table: tablename)
  end

end
|
1526
|
+
|
1527
|
+
module SQLite3

  class Database

    # Loads a data frame from this database by passing the database and
    # +expr+ to CADataFrame.load_sqlite3.
    def to_df(expr)
      CADataFrame.load_sqlite3(self, expr)
    end

  end

end
|
1538
|
+
|
1539
|
+
######################################
|
1540
|
+
#
|
1541
|
+
# IO methods
|
1542
|
+
#
|
1543
|
+
######################################
|
1544
|
+
|
1545
|
+
require "spreadsheet"
|
1546
|
+
|
1547
|
+
class CArray

  # Writes the array to +filename+ through a Spreadsheet workbook, one
  # worksheet row per index along the first dimension. If a block is
  # given it is called with the worksheet before the book is written.
  #
  # Raises a RuntimeError for arrays of rank 3 or higher.
  def save_excel(filename, &block)
    raise "too large rank (>2) to write excel file" if rank >= 3
    book  = Spreadsheet::Workbook.new
    sheet = book.create_worksheet
    dim0.times do |i|
      sheet.row(i).push(*self[i, nil])
    end
    block.call(sheet) if block
    book.write(filename)
  end

  # Opens +filename+ with Spreadsheet and returns the rows of worksheet
  # +sheet+ (default 0) converted with +to_ca+.
  def self.load_excel(filename, sheet = 0)
    worksheet = Spreadsheet.open(filename).worksheet(sheet)
    worksheet.map(&:to_a).to_ca
  end

end
|
1571
|
+
|
1572
|
+
class CADataFrame

  # Loads a table with CArray.load_sqlite3, converts it to a data frame
  # and masks out nil in all columns.
  def self.load_sqlite3(*args)
    table = CArray.load_sqlite3(*args)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Loads a table with CArray.load_csv, converts it to a data frame and
  # masks out nil in all columns.
  def self.load_csv(*args, &block)
    table = CArray.load_csv(*args, &block)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Reads a table with CArray.from_csv, converts it to a data frame and
  # masks out nil in all columns.
  def self.from_csv(*args, &block)
    table = CArray.from_csv(*args, &block)
    table.to_dataframe.arrange{ maskout nil, *column_names }
  end

  # Serializes the frame with the +to_csv+ of the underlying table.
  #
  # When the frame has a row index and +with_row_index+ is true, the row
  # index is emitted as an extra leading column named "".
  def to_csv(io = "", option = {}, rs: $/, sep: ",", fill: "", with_row_index: true, &block)
    tbl =
      if @row_index and with_row_index
        CADFArray.new([""] + @column_names, @columns.clone.update("" => @row_index))
      else
        ca.to_ca
      end
    tbl.to_csv(io, option, rs: rs, sep: sep, fill: fill, &block)
  end

  # Converts the frame to a Daru::DataFrame (daru is required on first
  # use), keeping the column order and, if present, the row index.
  def to_daru
    require "daru"
    data = {}
    each_column_name { |name| data[name] = column(name).to_a }
    if @row_index
      Daru::DataFrame.new(data, index: @row_index.to_a, order: @column_names)
    else
      Daru::DataFrame.new(data, order: @column_names)
    end
  end

  # Writes the frame to +filename+ with axlsx (required on first use).
  #
  # sheet_name::     name of the worksheet
  # with_row_index:: when true, each row is prefixed with its row index
  #                  and the header row with ""
  #
  # Masked elements are written as "=NA()". If a block is given it
  # receives the worksheet before the package is serialized.
  def to_xlsx(filename, sheet_name: 'Sheet1', with_row_index: false, &block)
    require "axlsx"
    xl = Axlsx::Package.new
    xl.use_shared_strings = true
    sheet = xl.workbook.add_worksheet(name: sheet_name)
    df = self.to_df.objectify.unmask("=NA()")
    if with_row_index
      sheet.add_row([""] + column_names)
      df.each_row_with_row_index(with: Array) { |list, i| sheet.add_row([i] + list) }
    else
      sheet.add_row(column_names)
      df.each_row(with: Array) { |list| sheet.add_row(list) }
    end
    yield sheet if block_given?
    xl.serialize(filename)
  end

end
|
1634
|
+
|
1635
|
+
|
1636
|
+
|
1637
|
+
|
1638
|
+
|
1639
|
+
|
1640
|
+
|