carray-dataframe 1.0.0 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,26 +0,0 @@
1
-
2
# Lazy-loading manifest for carray-dataframe. Every entry below names the same
# file, "carray/dataframe/dataframe", as the place where the real definition lives.
# NOTE(review): `autoload_method` is not defined in this file; presumably it is a
# carray helper that loads the given file the first time the named method is
# called -- confirm against carray.
- module CA::TableMethods
3
- autoload_method "to_df", "carray/dataframe/dataframe"
4
- end
5
-
6
# Constants resolved through Ruby's own autoload on first reference.
- autoload :CADataFrame, "carray/dataframe/dataframe"
7
-
8
- autoload :DataFrame, "carray/dataframe/dataframe"
9
- autoload :RSReceiver, "carray/dataframe/dataframe"
10
-
11
- class RSRuby
12
- autoload_method "setup", "carray/dataframe/dataframe"
# NOTE(review): spelled "recieve"; confirm it matches the method name actually
# defined in carray/dataframe/dataframe before correcting the spelling here.
13
- autoload_method "recieve", "carray/dataframe/dataframe"
14
- end
15
-
16
# Conversion hooks (`to_df`) registered on Daru::DataFrame and SQLite3::Database.
- module Daru
17
- class DataFrame
18
- autoload_method "to_df", "carray/dataframe/dataframe"
19
- end
20
- end
21
-
22
- module SQLite3
23
- class Database
24
- autoload_method "to_df", "carray/dataframe/dataframe"
25
- end
26
- end
@@ -1,1640 +0,0 @@
1
- require "carray"
2
- require "carray/io/table"
3
-
4
module CA::TableMethods

  # Wraps the receiver in a CADataFrame. The optional block is handed to
  # CADataFrame.new.
  #
  # When the receiver carries @header or @note, both are copied onto the new
  # frame, which additionally gains two singleton methods:
  #   note          -> the copied @note
  #   header(name)  -> @header[name.to_s]; without an argument, the frame's
  #                    column names
  def to_dataframe(&block)
    frame = CADataFrame.new(self, &block)
    return frame unless @header || @note

    frame.instance_variable_set(:@header, @header)
    frame.instance_variable_set(:@note, @note)
    frame.singleton_class.class_eval do
      attr_reader :note

      def header(name = nil)
        name ? @header[name.to_s] : @column_names
      end
    end
    frame
  end

  alias to_df to_dataframe

end
28
-
29
- class CADataFrame
30
-
31
- #
32
- # Constructor
33
- #
34
-
35
# Builds a frame from either a Hash of columns or a CArray table.
#
# columns_or_table - Hash (name => column data) or CArray. For a CArray the
#                    column names come from +column_names:+ or, failing that,
#                    from the table's own #column_names.
# row_index:       - optional row labels; stored as row_index.to_ca.object.
# column_names:    - names to use for a CArray table.
# block            - optional; forwarded to #arrange after construction.
#
# Raises RuntimeError on mismatched column sizes, on a CArray without
# column names, or on any other input type.
def initialize(columns_or_table, row_index: nil, column_names: nil, &block)
  case columns_or_table
  when Hash
    @column_names = columns_or_table.keys.map(&:to_s)
    @columns      = normalize_columns(columns_or_table)
    @row_number   = @columns.first[1].size
    unless @column_names.all? { |key| @columns[key].size == @row_number }
      raise "column sizes mismatch"
    end
  when CArray
    @column_names =
      if column_names
        column_names.map(&:to_s)
      elsif columns_or_table.respond_to?(:column_names)
        columns_or_table.column_names.map(&:to_s)
      else
        raise "data table (CArray) has no method 'column_names'."
      end
    @columns    = table_to_columns(columns_or_table)
    @row_number = columns_or_table.dim0
  else
    raise "unknown data"
  end
  @row_index   = row_index ? row_index.to_ca.object : nil
  @__methods__ = {}
  arrange(&block) if block_given?
end
71
-
72
# Alias table (alias name => column name) consulted by #method_missing.
# Returns the live Hash, not a copy.
def __methods__
  @__methods__
end
75
-
76
# Makes this frame take over the state of +other+ (column names, columns,
# row index, row count and the alias table). The objects are shared with
# +other+, not copied.
#
# other - object responding to column_names, columns, row_index, row_number
#         and __methods__ (normally another CADataFrame).
#
# Returns self.
def replace(other)
  @column_names = other.column_names
  @columns      = other.columns
  @row_index    = other.row_index
  @row_number   = other.row_number
  # The original assigned to the misspelled @__methors__, so the alias table
  # consulted by #method_missing was never replaced.
  @__methods__  = other.__methods__
  self
end
84
-
85
- private
86
-
87
# Splits a 2-D table into a Hash of columns: the i-th name in @column_names
# maps to table[nil, i].
def table_to_columns(table)
  @column_names.each_with_index.each_with_object({}) do |(name, i), cols|
    cols[name] = table[nil, i]
  end
end
94
-
95
# Converts every value of +columns+ into a CArray and every key into a String.
#
# CArray values are used as they are. An Array is converted with #to_ca; if
# that result is not rank 1 (e.g. an Array of Arrays) it is stored instead as
# a 1-D object CArray holding the original elements. Anything else goes
# through #to_ca.
def normalize_columns(columns)
  columns.each_with_object({}) do |(key, raw), normalized|
    column =
      case raw
      when CArray
        raw
      when Array
        converted = raw.to_ca
        if converted.rank != 1
          items = raw.clone
          converted = CArray.object(items.size).convert { items.shift }
        end
        converted
      else
        raw.to_ca
      end
    normalized[key.to_s] = column
  end
end
114
-
115
- public
116
-
117
- #
118
- # Attributes
119
- #
120
-
121
- attr_reader :columns, :column_names, :row_index, :column_number, :row_number
122
-
123
# True when +name+ is one of the frame's column names. The comparison is
# exact: names are stored as Strings, so a Symbol never matches.
def has_column?(name)
  @column_names.include?(name)
end
126
-
127
# Returns the data_type_name of every column, in column-name order.
#
# The original read the misspelled @columns_names (always nil), so every
# call raised NoMethodError.
def column_types
  @column_names.map { |name| @columns[name].data_type_name }
end
130
-
131
- #
132
- # Column, Row Access
133
- #
134
-
135
# Looks up one column.
#
# name_or_index - Integer position in the column-name list, or a String or
#                 Symbol name.
#
# Returns the column, or nil when nothing matches or the argument is of
# another type.
def column(name_or_index)
  key =
    if name_or_index.is_a?(Integer)
      @column_names[name_or_index]
    elsif name_or_index.is_a?(String) || name_or_index.is_a?(Symbol)
      name_or_index.to_s
    else
      return nil
    end
  @columns[key]
end
143
-
144
- alias col column
145
-
146
# Returns one row as a CArray of the values from every column.
#
# idx - with a row index, the label to look up via @row_index.search;
#       otherwise the position handed straight to each column's #[].
def row(idx)
  addr = @row_index ? @row_index.search(idx) : idx
  @column_names.map { |name| @columns[name][addr] }.to_ca
end
154
-
155
# Returns CArray.int(@row_number).seq -- presumably the positional row
# numbers 0...row_number (confirm against CArray#seq).
def index
  CArray.int(@row_number).seq
end
158
-
159
# Registers column aliases: each key of +hash+ becomes a zero-argument
# pseudo-method (resolved in #method_missing) that returns the column named
# by the value. Keys and values are stored as Strings.
#
# Note that this shadows Object#method on data frames.
#
# Returns the updated alias table.
def method(hash)
  aliases = hash.each_with_object({}) do |(key, value), table|
    table[key.to_s] = value.to_s
  end
  @__methods__.update(aliases)
end
166
-
167
# Lets columns be read as zero-argument methods. Tried in order:
#   1. a column with exactly that name,
#   2. a column whose name is the method name with "_" replaced by "."
#      (R-style names),
#   3. an alias registered through #method.
#
# Raises RuntimeError for anything else, including any call with arguments.
def method_missing(name, *args)
  if args.empty?
    name   = name.to_s
    dotted = name.gsub(/_/, '.') ### For R
    return @columns[name]               if has_column?(name)
    return @columns[dotted]             if has_column?(dotted)
    return @columns[@__methods__[name]] if @__methods__.include?(name)
  end
  raise "no method '#{name}' for CADataFrame"
end
180
-
181
-
182
- #
183
- # Iterators
184
- #
185
-
186
# Iterates over the column Hash, yielding (name, column) pairs.
def each_column(&block)
  @columns.each(&block)
end
189
-
190
# Yields every column name in order.
def each_column_name(&block)
  @column_names.each(&block)
end
193
-
194
# Yields each row label: the entries of @row_index when there is one,
# otherwise the positions 0...@row_number.
def each_row_index(&block)
  return @row_number.times(&block) unless @row_index
  @row_index.each(&block)
end
201
-
202
# Yields every row in turn.
#
# with: - Array (default): each row is a fresh Array of values in the order
#         of the @columns Hash.
#         Hash: each row is a Hash keyed by column name. The SAME Hash
#         object is reused for every row, so dup it if you keep it.
#
# Raises RuntimeError for any other +with+.
def each_row(with: Array, &block)
  unless with == Array || with == Hash
    raise "invalid data type for loop variable"
  end

  if with == Array
    @row_number.times do |i|
      yield @columns.map { |_name, col| col[i] }
    end
  else
    row = {}
    @row_number.times do |i|
      @column_names.each { |name| row[name] = @columns[name][i] }
      yield row
    end
  end
end
219
-
220
# Yields every row together with its row label.
#
# with: - Array (default): yields (Array of values, label).
#         Hash: yields (Hash keyed by column name, label). The SAME Hash
#         object is reused for every row, so dup it if you keep it.
#
# The label is the @row_index entry when the frame has a row index,
# otherwise the row position.
#
# Raises RuntimeError for any other +with+.
def each_row_with_row_index(with: Array, &block)
  if with == Array
    if @row_index
      @row_index.each_with_index do |idx, i|
        yield @columns.map { |_name, col| col[i] }, idx
      end
    else
      @row_number.times do |i|
        yield @columns.map { |_name, col| col[i] }, i
      end
    end
  elsif with == Hash
    row = {}
    if @row_index
      @row_index.each_with_index do |idx, i|
        @column_names.each { |name| row[name] = @columns[name][i] }
        yield row, idx
      end
    else
      # The original iterated with `times do |idx, i|`, leaving i nil, and
      # then read @row_index[i] although @row_index is nil in this branch.
      @row_number.times do |i|
        @column_names.each { |name| row[name] = @columns[name][i] }
        yield row, i
      end
    end
  else
    raise "invalid data type for loop variable"
  end
end
252
-
253
- #
254
- # Referencing
255
- #
256
-
257
# Extracts part of the frame: df[row] or df[row, col].
#
# With only +row+:
#   CADataFrame      - used as a mask frame: columns it shares with self are
#                      masked out by the matching mask column, the rest are
#                      copied; its row index is used for the result.
#   String           - treated as a column name (same as df[nil, name]).
#   Array of Strings - treated as column names (same as df[nil, names]).
#   anything else    - a row selector applied to every column (an Integer
#                      is wrapped in an Array first).
#
# With +row+ and +col+:
#   String/Symbol col - returns that column indexed by row (not a frame).
#   Array of Strings  - a sub-frame of the named columns.
#   anything else     - a selector into the column-name list (an Integer is
#                       wrapped in an Array first).
#
# Raises RuntimeError for an unknown column name.
def [](*argv)
  row, col = *argv
  picked = {}
  # Called only after +row+ has reached its final value (an Integer is
  # wrapped below), so the row index is sliced with the same selector.
  subframe = lambda do
    CADataFrame.new(picked, row_index: @row_index ? @row_index[row] : nil)
  end

  if col.nil?
    case row
    when CADataFrame
      each_column_name do |key|
        picked[key] =
          if row.has_column?(key)
            column(key).maskout(row.column(key))
          else
            column(key).to_ca
          end
      end
      return CADataFrame.new(picked, row_index: row.row_index ? row.row_index : nil)
    when String
      return self[nil, row]
    when Array
      return self[nil, row] if row.all? { |s| s.is_a?(String) }
    when Integer
      row = [row]
    end
    @column_names.each { |key| picked[key] = @columns[key][row] }
    return subframe.call
  end

  row = [row] if row.is_a?(Integer)
  case col
  when String, Symbol
    key = col.to_s
    raise "unknow column name '#{key}'" unless has_column?(key)
    return column(key)[row]
  when Array
    if col.all? { |s| s.is_a?(String) }
      col.each do |name|
        key = name.to_s
        raise "unknow column name '#{key}'" unless has_column?(key)
        picked[key] = column(key)[row]
      end
    else
      @column_names.to_ca[col].to_a.each { |key| picked[key] = column(key)[row] }
    end
  else
    col = [col] if col.is_a?(Integer)
    @column_names.to_ca[col].to_a.each { |key| picked[key] = column(key)[row] }
  end
  subframe.call
end
332
-
333
- #
334
- # Setting Values
335
- #
336
-
337
# Assigns +value+ into part of the frame: df[row] = v or df[row, col] = v.
#
# With only +row+:
#   CADataFrame   - used as a mask frame: for every shared column, value is
#                   stored at the positions given by the mask column.
#   String        - treated as a column name (same as df[nil, name] = v).
#   anything else - a row selector applied to all columns.
#
# With +col+:
#   String/Symbol - assigns into that column; an unknown name appends
#                   +value+ as a new column instead.
#   Array         - assigns into each named column; an unknown name raises
#                   RuntimeError.
#   anything else - a selector into the column-name list (an Integer is
#                   wrapped in an Array first).
#
# Returns value.
def []=(*argv)
  value = argv.pop
  row, col = *argv

  if col.nil?
    if row.is_a?(CADataFrame)
      each_column_name do |key|
        column(key)[row.column(key)] = value if row.has_column?(key)
      end
    elsif row.is_a?(String)
      self[nil, row] = value
    else
      self[row, @column_names.to_a] = value
    end
  elsif col.is_a?(String) || col.is_a?(Symbol)
    key = col.to_s
    if has_column?(key)
      column(key)[row] = value
    else
      arrange {
        append key, value
      }
    end
  elsif col.is_a?(Array)
    col.each do |name|
      key = name.to_s
      raise "unknow column name '#{key}'" unless has_column?(key)
      column(key)[row] = value
    end
  else
    col = [col] if col.is_a?(Integer)
    @column_names.to_ca[col].to_a.each { |key| column(key)[row] = value }
  end
  value
end
384
-
385
# For every column that +mask+ shares with this frame, stores +value+ at the
# positions selected by mask.column(key).boolean.not. Columns of +mask+
# that this frame lacks are ignored.
#
# Returns value.
def where(mask, value)
  mask.column_names.each do |key|
    next unless has_column?(key)
    column(key)[mask.column(key).boolean.not] = value
  end
  value
end
393
-
394
# Calls fill(value) on each named column. The last argument is the fill
# value; everything before it is a column name. Unknown names are skipped
# silently (names are matched exactly, so pass Strings).
#
# Returns self.
def fill(*names, value)
  names.each do |name|
    next unless has_column?(name)
    column(name).fill(value)
  end
  self
end
402
-
403
- #
404
- # Arrange
405
- #
406
-
407
# Runs the block through an Arranger wrapping this frame and returns
# whatever Arranger#arrange returns.
def arrange(&block)
  arranger = Arranger.new(self)
  arranger.arrange(&block)
end
410
-
411
# Renames column +name1+ to +name2+ in place. Both are converted with to_s.
# The column keeps its position in the name list.
#
# Returns the renamed column.
# Raises RuntimeError when name1 is not a column.
def rename(name1, name2)
  old_key  = name1.to_s
  position = @column_names.index(old_key)
  raise "unknown column name #{name1}" unless position

  new_key = name2.to_s
  @column_names[position] = new_key
  @columns[new_key] = @columns.delete(old_key)
end
421
-
422
# Lower-cases every column name in place. When two names collapse into the
# same lower-case name, the later column wins in the column Hash.
#
# Returns self.
def downcase
  lowered_names = []
  lowered_cols  = {}
  each_column_name do |name|
    lower = name.downcase
    lowered_names << lower
    lowered_cols[lower] = column(name)
  end
  @column_names = lowered_names
  @columns      = lowered_cols
  self
end
433
-
434
# Adds a column at the end of the frame.
#
# name       - column name; stored as a String (name.to_s), as
#              Arranger#append does, so that Symbol names stay reachable
#              through #column and #has_column?.
# new_column - column data; when omitted the block's value is used
#              (evaluated with the frame as self), and failing that an
#              object-typed template of the first column.
#
# If a column of that name already exists its data is replaced and the name
# keeps its current position instead of being listed twice.
#
# Returns the column that was stored (converted with to_ca when needed).
# Raises RuntimeError unless the column is rank 1 with one value per row.
def append(name, new_column = nil, &block)
  if new_column
    # use the data as given
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 || new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.push(key) unless @column_names.include?(key)
  @columns[key] = new_column
  new_column
end
452
-
453
# Adds a column at the front of the frame.
#
# name       - column name; stored as a String (name.to_s), as
#              Arranger#lead does, so that Symbol names stay reachable
#              through #column and #has_column?.
# new_column - column data; when omitted the block's value is used
#              (evaluated with the frame as self), and failing that an
#              object-typed template of the first column.
#
# If a column of that name already exists its data is replaced and the name
# is moved to the front instead of being listed twice.
#
# Returns the column that was stored (converted with to_ca when needed).
# Raises RuntimeError unless the column is rank 1 with one value per row.
def lead(name, new_column = nil, &block)
  if new_column
    # use the data as given
  elsif block
    new_column = instance_exec(&block)
  else
    new_column = @columns.first[1].template(:object)
  end
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  if new_column.rank != 1 || new_column.size != @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.delete(key)
  @column_names.unshift(key)
  @columns[key] = new_column
  new_column
end
471
-
472
# Returns a new frame with the same column names, each column being
# CArray.object(0) (a zero-length object array).
def vacant_copy
  blanks = {}
  each_column_name { |key| blanks[key] = CArray.object(0) }
  CADataFrame.new(blanks)
end
479
-
480
# Instance-level shorthand for CADataFrame.merge(self, *args).
def merge(*args)
  CADataFrame.merge(self, *args)
end
483
-
484
# Evaluates the block with this frame as self (so column names resolve
# through #method_missing) and returns the block's value.
def execute(&block)
  instance_exec(&block)
end
487
-
488
# Reduces every column to a single value and returns a one-row frame whose
# row index is [label].
#
# label - used as the row label and, without a block, as the name of the
#         method sent to each column (label.intern).
# block - optional; called with (name, column) to produce the value.
#
# A column whose computation raises gets UNDEF instead of aborting.
def calculate(label, &block)
  summary = {}
  each_column_name do |name|
    summary[name] =
      begin
        [block ? yield(name, column(name)) : column(name).send(label.intern)]
      rescue
        [UNDEF]
      end
  end
  CADataFrame.new(summary, row_index: [label])
end
503
-
504
# Builds a new frame from the block's result for each column; the block is
# called with (name, column).
#
# Best effort: a column for which the block raises is left out of the result
# rather than aborting the whole call.
def resample(&block)
  kept = {}
  each_column_name do |name|
    begin
      sampled = yield(name, column(name))
    rescue
      next
    end
    kept[name] = sampled
  end
  CADataFrame.new(kept)
end
514
-
515
# Returns a new frame restricted to the given columns and, optionally, rows.
#
# names - column names to keep (all columns when none are given).
# block - optional; evaluated with the frame as self, its value is used as
#         the row selector for every column and for the row index.
def select(*names, &block)
  names = @column_names if names.empty?
  row   = block ? instance_exec(&block) : nil
  picked = {}
  names.map(&:to_s).each { |name| picked[name] = column(name)[row] }
  CADataFrame.new(picked, row_index: @row_index ? @row_index[row] : nil)
end
530
-
531
- #
532
- # Maintenance
533
- #
534
-
535
# Calls unmask(value) on every column.
# NOTE(review): this relies on CArray#unmask modifying its receiver, since
# the return value is discarded -- confirm against the carray docs.
#
# Returns self.
def unmask!(value = nil)
  each_column_name { |name| column(name).unmask(value) }
  self
end
541
-
542
# Non-destructive counterpart of #unmask!: works on the detached copy made
# by #to_df and returns it.
def unmask(value = nil)
  to_df.unmask!(value)
end
545
-
546
# Detaches the frame from data it may share with others: the column Hash is
# cloned, every column is replaced by its to_ca result, and the row index
# (when there is one) is cloned.
#
# Returns self.
def detouch!
  @columns = @columns.clone
  each_column_name { |name| @columns[name] = @columns[name].to_ca }
  @row_index = @row_index.clone if @row_index
  self
end
556
-
557
- #
558
- # Transformation
559
- #
560
-
561
# Returns a new frame without the named columns (names are compared after
# to_s). The row index is carried over. With no names, returns self.
def eliminate_columns(*names)
  return self if names.empty?

  dropped = names.map(&:to_s)
  kept = {}
  each_column_name do |name|
    kept[name] = column(name) unless dropped.include?(name)
  end
  CADataFrame.new(kept, row_index: @row_index)
end
574
-
575
# Returns a new frame whose rows are selected and ordered by the block's
# value. The block is evaluated with the frame as self; its result indexes
# every column and the row index.
def reorder(&block)
  order = instance_exec(&block)
  rearranged = {}
  each_column_name { |name| rearranged[name] = column(name)[order] }
  CADataFrame.new(rearranged, row_index: @row_index ? @row_index[order] : nil)
end
583
-
584
# Returns a new frame with rows ordered by CA.sort_addr applied to the sort
# keys.
#
# names - columns to sort by. When empty, the block (evaluated with the
#         frame as self) supplies the keys: a single CArray or an Array of
#         them.
#
# With neither names nor a usable block result, CA.sort_addr is called
# without arguments.
def order_by(*names, &block)
  keys =
    if !names.empty?
      @columns.values_at(*names.map { |s| s.to_s })
    elsif block
      result = instance_exec(&block)
      case result
      when CArray then [result]
      when Array  then result
      end
    end
  reorder { CA.sort_addr(*keys) }
end
600
-
601
# Returns a new frame with every column, and the row index if present,
# reversed.
def reverse
  flipped = {}
  each_column_name { |name| flipped[name] = column(name).reverse }
  CADataFrame.new(flipped, row_index: @row_index ? @row_index.reverse : nil)
end
608
-
609
# Returns the transposed frame: the current column names become the row
# index and rows become columns.
#
# column_names: - names for the new columns. Defaults to the row index
#                 values converted to Strings, or, without a row index, to
#                 the sequence generated by
#                 CArray.object(@row_number).seq("a", :succ).
def transpose(column_names: nil)
  if column_names
    # The original called `header` here, which is undefined on an ordinary
    # frame (it raised through #method_missing) and ignored the argument.
    column_names = column_names.map(&:to_s)
  elsif @row_index
    column_names = @row_index.convert(:object) { |v| v.to_s }
  else
    column_names = CArray.object(@row_number).seq("a", :succ)
  end
  CADataFrame.new(ca.transpose, row_index: @column_names.to_ca, column_names: column_names)
end
621
-
622
# Returns a new frame in which +suf+ is appended to every column name.
# Columns and row index are shared with the receiver.
def add_suffix(suf)
  suffixed = {}
  each_column_name do |name|
    suffixed[name.to_s + suf] = column(name)
  end
  CADataFrame.new(suffixed, row_index: @row_index)
end
630
-
631
- #
632
- # Conversions
633
- #
634
-
635
# Returns an independent copy of the frame: a new CADataFrame over the same
# columns and row index, detached with #detouch!.
def to_df
  snapshot = {}
  each_column_name { |name| snapshot[name] = column(name) }
  CADataFrame.new(snapshot, row_index: @row_index).detouch!
end
642
-
643
# Returns a new frame in which every column is replaced by column.object
# (presumably its object-typed version -- confirm against CArray#object).
# The row index is shared.
def objectify
  converted = {}
  each_column_name { |name| converted[name] = column(name).object }
  CADataFrame.new(converted, row_index: @row_index)
end
650
-
651
# Returns a CADFArray over the named columns (all columns when no names are
# given). Names are converted with to_s.
def ca(*names)
  keys = names.empty? ? @column_names : names.map(&:to_s)
  CADFArray.new(keys, @columns)
end
658
-
659
# Same as #ca followed by to_ca on the resulting CADFArray.
def to_ca(*names)
  ca(*names).to_ca
end
662
-
663
# Returns a plain Hash mapping each column name to the column's to_a.
def to_hash
  @columns.each_with_object({}) do |(name, col), out|
    out[name] = col.to_a
  end
end
670
-
671
# Builds a lookup Hash from one key column to one or more value columns.
#
# key_name    - String name of the column whose values become the keys.
# value_names - String: values are the entries of that column.
#               Array of Strings: values are Arrays with one entry per
#               named column.
#
# For duplicate keys the last row wins.
#
# Raises ArgumentError for an unknown column name or for a value_names that
# is neither a String nor an Array.
def columns_to_hash(key_name, value_names)
  unless @column_names.include?(key_name)
    raise ArgumentError, "include invalid key column name #{key_name}"
  end
  hash = {}
  key_column = @columns[key_name]
  case value_names
  when String
    unless @column_names.include?(value_names)
      # Was "invalid key column name", which blamed the wrong argument.
      raise ArgumentError, "invalid value column name #{value_names}"
    end
    value_column = @columns[value_names]
    @row_number.times do |i|
      hash[key_column[i]] = value_column[i]
    end
  when Array
    unless value_names.all? { |s| @column_names.include?(s) }
      raise ArgumentError, "include invalid column name in #{value_names.join(' ')}"
    end
    value_columns = @columns.values_at(*value_names)
    @row_number.times do |i|
      hash[key_column[i]] = value_columns.map { |c| c[i] }
    end
  else
    raise ArgumentError, "invalid argument"
  end
  hash
end
700
-
701
- private
702
-
703
# Formats a table cell: Floats with "%.6g", everything else with to_s.
def __obj_to_string__(obj)
  obj.is_a?(Float) ? "%.6g" % obj : obj.to_s
end
711
-
712
# Display width of +string+ in terminal cells: every character encoded in
# more than one byte counts as 2, every other character as 1.
def __strwidth__(string)
  return string.length if string.ascii_only?
  string.each_char.inject(0) do |width, char|
    width + (char.bytesize > 1 ? 2 : 1)
  end
end
719
-
720
- public
721
-
722
# Renders the frame as a text table with a leading row-label column.
#
# rowmax - when an Integer smaller than the row count, only the first and
#          last rows are shown with a "..." row in between; :full (default)
#          shows every row.
#
# Two layouts are used: a plain one when all cells and names are ASCII, and
# a width-aware one (see #__strwidth__) when any multibyte text is present.
#
# Returns the table as a String.
def ascii_table(rowmax = :full)
  namelist = [" "] + @column_names
  labels   = @row_index ? @row_index : CArray.int(@row_number).seq
  tbl = CADFArray.new(namelist, @columns.clone.update(" " => labels))
  if rowmax.is_a?(Integer) && @row_number > rowmax
    list = tbl[0..(rowmax/2),nil].to_a
    list.push(namelist.map { "..." })
    list.push(*(tbl[-rowmax/2+1..-1,nil].to_a))
    tbl = list.to_ca
  end
  datastr = tbl.convert { |c| __obj_to_string__(c) }.unmask("")
  datamb  = datastr.convert(:boolean, &:"ascii_only?").not.sum(0).ne(0)
  namemb  = namelist.to_ca.convert(:boolean) { |c| c.to_s.ascii_only? }.eq(0)
  mb      = datamb.or(namemb)
  namelen = namelist.map(&:length).to_ca
  datalen = datastr.convert(&:length)
  if mb.max == 0
    # ASCII only: character count equals display width.
    if datalen.size == 0
      lengths = namelen.to_a
    else
      lengths = datalen.max(0).pmax(namelen).to_a
    end
    hrule  = "-" + lengths.map { |len| "-"*len }.join("--") + "-"
    header = " " +
             [namelist, lengths].transpose.map { |name, len|
               name.to_s.ljust(len) }.join(" ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_index do |blk, i|
        list = blk.flatten.to_a
        ary << " " + [list, lengths].transpose.map { |value, len|
                       value.ljust(len) }.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: rows#=#{@row_number}: \n" + ary.join("\n")
  else
    # Multibyte text: pad by display width rather than character count.
    namewidth = namelist.to_ca.convert { |c| __strwidth__(c.to_s) }
    if datalen.size == 0
      maxwidth = namewidth
    else
      datawidth = datastr.convert { |c| __strwidth__(c.to_s) }
      maxwidth  = datawidth.max(0).pmax(namewidth)
      # ljust target per cell. Computed only when there is data: the
      # original evaluated this unconditionally and failed on a frame with
      # zero rows, where datawidth was never assigned.
      padding = maxwidth[:*,nil] - datawidth + datalen
    end
    hrule  = "-" + maxwidth.map { |len| "-"*len }.join("--") + "-"
    header = " " +
             [namelist, maxwidth.to_a].transpose.map { |name, len|
               name.to_s.ljust(len-__strwidth__(name.to_s)+name.to_s.length) }.join(" ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_addr do |blk, i|
        list = blk.flatten.to_a
        ary << " " + list.map.with_index { |value, j|
                       value.ljust(padding[i,j]) }.join(" ") + " "
      end
    end
    ary << hrule
    return "DataFrame: row#=#{@row_number}: \n" + ary.join("\n")
  end
end
787
-
788
-
789
# Abbreviated table for consoles: ascii_table limited with rowmax = 10.
def inspect
  ascii_table(10)
end
792
-
793
# Full table as text (ascii_table with its default, unlimited row count).
def to_s
  ascii_table
end
796
-
797
# Implicit Array conversion: a one-element Array holding the rendered
# table.
# NOTE(review): presumably present so that `puts df` prints the table
# instead of going through #method_missing -- confirm.
def to_ary
  [to_s]
end
800
-
801
-
802
- end
803
-
804
- #############################################################
805
- #
806
- # ARRANGER
807
- #
808
- #############################################################
809
-
810
-
811
class CADataFrame

  # Receiver for the block given to CADataFrame#arrange (and to
  # CADataFrame.new). The block is run with an Arranger as self, so the
  # private methods below act as a small DSL that edits the wrapped frame
  # in place; column names and registered aliases can be read as bare
  # identifiers.
  class Arranger

    def initialize(dataframe)
      @dataframe = dataframe
    end

    # Runs the block with this Arranger as self. Returns the wrapped frame.
    def arrange(&block)
      instance_exec(&block)
      @dataframe
    end

    private

    def column_names
      @dataframe.column_names
    end

    def row_number
      @dataframe.row_number
    end

    # Registers column aliases on the frame (see CADataFrame#method).
    def method(hash)
      @dataframe.method(hash)
    end

    # Replaces column +name+ by the result of its strptime(fmt).
    def timeseries(name, fmt = "%Y-%m-%d %H:%M:%S")
      key = name.to_s
      @dataframe.columns[key] = @dataframe.columns[key].strptime(fmt)
    end

    # Converts column +name+ with to_type(type). When +mask+ is given, that
    # value is masked out of the converted column.
    def type(type, name, mask = :novalue)
      key = name.to_s
      @dataframe.columns[key] = @dataframe.columns[key].to_type(type)
      # The original passed the undefined `options[:maskout]` here, so any
      # call that supplied a mask raised instead of masking.
      @dataframe.columns[key].maskout!(mask) if mask != :novalue
    end

    # Removes the named columns from the frame. Names that are not columns
    # are ignored.
    def eliminate(*names)
      return self if names.empty?
      # Iterate over the requested names rather than over the frame's own
      # name list: the original deleted from the list it was iterating, so
      # the name following each deleted one was skipped.
      names.map(&:to_s).each do |name|
        next unless @dataframe.column_names.include?(name)
        @dataframe.columns.delete(name)
        @dataframe.column_names.delete(name)
      end
      @dataframe.column_names
    end

    # Template of the frame's first column (arguments go to its #template).
    def template(*args, &block)
      @dataframe.columns.first[1].template(*args, &block)
    end

    # Converts the named columns to :double. Raises on an unknown name.
    def double(*names)
      names.flatten.map(&:to_s).each do |name|
        unless @dataframe.column_names.include?(name)
          raise "Unknown column name '#{name}'"
        end
        type(:double, name)
      end
    end

    # Converts the named columns to :int. Raises on an unknown name.
    def int(*names)
      names.flatten.map(&:to_s).each do |name|
        unless @dataframe.column_names.include?(name)
          raise "Unknown column name '#{name}'"
        end
        type(:int, name)
      end
    end

    # Calls maskout!(value) on each named column.
    def maskout(value, *names)
      names.flatten.map(&:to_s).each do |name|
        @dataframe.columns[name].maskout!(value)
      end
    end

    # Calls unmask(value) on each named column.
    def unmask(value, *names)
      names.flatten.map(&:to_s).each do |name|
        @dataframe.columns[name].unmask(value)
      end
    end

    def col(name)
      @dataframe.col(name)
    end

    # Adds a column at the end. Without data, an object-typed template of
    # the first column is used. No shape check is done here.
    def append(name, new_column = nil)
      new_column ||= @dataframe.columns.first[1].template(:object)
      new_column = new_column.to_ca unless new_column.is_a?(CArray)
      @dataframe.columns[name.to_s] = new_column
      @dataframe.column_names.push(name.to_s)
    end

    # Adds a column at the front. Without data, an object-typed template of
    # the first column is used. No shape check is done here.
    def lead(name, new_column = nil)
      new_column ||= @dataframe.columns.first[1].template(:object)
      new_column = new_column.to_ca unless new_column.is_a?(CArray)
      @dataframe.columns[name.to_s] = new_column
      @dataframe.column_names.unshift(name.to_s)
    end

    # Renames column name1 to name2, keeping its position.
    # Raises RuntimeError when name1 is not a column.
    def rename(name1, name2)
      old_key = name1.to_s
      idx = @dataframe.column_names.index(old_key)
      raise "unknown column name #{name1}" unless idx
      @dataframe.column_names[idx] = name2.to_s
      column = @dataframe.columns[old_key]
      @dataframe.columns.delete(old_key)
      @dataframe.columns[name2.to_s] = column
    end

    def downcase
      @dataframe.downcase
    end

    def classify(name, scale, opt = {})
      @dataframe.classify(name, scale, opt)
    end

    # Translates the values of a column through +mapper+.
    #
    # mapper         - Hash (value => replacement), or a CArray/Array that
    #                  is projected with the column.
    # name_or_column - column name, CArray, or Array.
    #
    # Returns the mapped column; nil for an unsupported mapper type.
    def map(mapper, name_or_column)
      column =
        case name_or_column
        when String, Symbol
          @dataframe.columns[name_or_column.to_s]
        when CArray
          name_or_column
        when Array
          name_or_column.to_ca
        else
          raise "invalid argument"
        end
      case mapper
      when Hash
        # The original looked values up in `hash`, which here is
        # Object#hash (an Integer), not the mapper.
        column.convert(:object) { |v| mapper[v] }
      when CArray
        mapper.project(column)
      when Array
        mapper.to_ca.project(column)
      end
    end

    # Bare identifiers inside the arrange block resolve to columns or to
    # registered aliases; everything else goes to the default handling.
    def method_missing(name, *args)
      if args.size == 0
        key = name.to_s
        if @dataframe.column_names.include?(key)
          return @dataframe.columns[key]
        elsif @dataframe.__methods__.include?(key)
          return @dataframe.columns[@dataframe.__methods__[key]]
        end
      end
      super
    end

  end

end
983
-
984
- #############################################################
985
- #
986
- # Class methods
987
- #
988
- #############################################################
989
-
990
class CADataFrame

  class << self

    # Combines frames side by side: the result holds every column of every
    # argument (a later frame overrides an earlier column of the same name)
    # and takes the row index of the first argument.
    def merge(*args)
      combined = {}
      args.each do |table|
        table.column_names.each do |name|
          combined[name] = table.col(name)
        end
      end
      CADataFrame.new(combined, row_index: args.first.row_index)
    end

    # Stacks frames on top of each other. The columns are those of the
    # first argument; each is bound along dimension 0 using the first
    # frame's data type. The row indexes are joined only when every
    # argument has one; otherwise the result has none.
    def concat(*args)
      stacked = {}
      args.first.column_names.each do |name|
        parts = args.map { |t| t.col(name) }
        stacked[name] = CArray.bind(parts.first.data_type, parts, 0)
      end
      indexes = args.map(&:row_index)
      joined  = indexes.all? ? CArray.join(*args.map(&:row_index)) : nil
      CADataFrame.new(stacked, row_index: joined)
    end

  end

end
1022
-
1023
- #############################################################
1024
- #
1025
- # CADFArray
1026
- #
1027
- #############################################################
1028
-
1029
# Read-only 2-D object-array view over a set of data-frame columns
# (rows x selected columns). The fetch/copy methods below are the hooks
# CAObject calls to obtain data and mask -- presumably per the CAObject
# protocol; confirm against the carray docs.
class CADFArray < CAObject # :nodoc:

  attr_reader :column_names

  # column_names - names (in order) of the columns exposed by the view.
  # columns      - Hash of name => column; the row count is taken from the
  #                first named column.
  def initialize(column_names, columns)
    @column_names = column_names
    @columns      = columns
    row_count     = @columns[@column_names.first].size
    extend CA::TableMethods
    super(:object, [row_count, @column_names.size], :read_only=>true)
    __create_mask__
  end

  # Element at [row, column].
  def fetch_index(idx)
    r, c = *idx
    @columns[@column_names[c]][r]
  end

  # Fills +data+ column by column with each source column's value.
  def copy_data(data)
    @column_names.each_with_index do |name, i|
      data[nil,i] = @columns[name].value
    end
  end

  # Intentionally empty.
  def create_mask
  end

  # Mask flag at [row, column]; 0 when the source column has no mask.
  def mask_fetch_index(idx)
    r, c = *idx
    source = @columns[@column_names[c]]
    source.has_mask? ? source.mask[r] : 0
  end

  # Copies the masks of the columns that have one into +data+.
  def mask_copy_data(data)
    @column_names.each_with_index do |name, i|
      next unless @columns[name].has_mask?
      data[nil,i] = @columns[name].mask
    end
  end

  # Materialises the view; the result is extended with CA::TableMethods and
  # given the same column names.
  def to_ca
    obj = super
    obj.extend CA::TableMethods
    obj.column_names = @column_names
    obj
  end

end
1083
-
1084
- #############################################################
1085
- #
1086
- # BASIC Comparison
1087
- #
1088
- #############################################################
1089
-
1090
-
1091
class CADataFrame

  # Element-wise operators and predicates. Each one is applied to the whole
  # table (see #ca) and returns a new frame with the same column names.

  # Operators taking one operand: df < other, df <= other, ...
  [:<, :<=, :>, :>=].each do |op|
    define_method(op) { |other| cmp(op, other) }
  end

  # Operations without operand: -df, df.is_masked, df.is_finite
  [:-@, :is_masked, :is_finite].each do |op|
    define_method(op) { cmp(op) }
  end

  private

  # Sends +method+ with +argv+ to the table view and wraps the result in a
  # frame that reuses this frame's column names.
  def cmp(method, *argv)
    CADataFrame.new(ca.send(method, *argv), column_names: @column_names)
  end

end
1128
-
1129
- #############################################################
1130
- #
1131
- # BASIC Manipulations
1132
- #
1133
- #############################################################
1134
-
1135
class CADataFrame

  # Re-aligns the frame to +reference+: rows are rearranged so that column
  # +keyname+ lines up with the values of reference (via reference.matchup
  # and project on every other column and on the row index).
  def matchup(keyname, reference)
    key = column(keyname.to_s)
    idx = reference.matchup(key)
    new_columns = {}
    each_column_name do |name|
      if name == keyname
        new_columns[name] = reference
      else
        new_columns[name] = column(name).project(idx)
      end
    end
    if @row_index
      new_row_index = @row_index.project(idx).unmask(nil)
    else
      new_row_index = nil
    end
    # The block runs in the arranger of the new frame and makes sure the key
    # column carries the reference values.
    return CADataFrame.new(new_columns, row_index: new_row_index) {
      self.send(keyname)[] = reference
    }
  end

  # Not implemented: accepts the arguments and returns nil.
  def join(table, on: nil)
  end

  # Counts values of column +name+.
  #
  # Without +scale+: one row per distinct value with a :count column
  # (count_valid of each group). With +scale+ (bin edges): a frame holding
  # the bin midpoints, left edges ("<name>_L"), right edges ("<name>_R")
  # and counts from a CAHistogram.
  def histogram(name, scale = nil, options = nil)
    if scale.nil?
      return group_by(name).table{ { :count => col(name).count_valid } }
    else
      if options
        hist = CAHistogram.int(scale, options)
      else
        hist = CAHistogram.int(scale)
      end
      hist.increment(@columns[name.to_s])
      hash = {
        name.to_s => hist.midpoints[0],
        "#{name}_L".to_s => scale[0..-2],
        "#{name}_R".to_s => scale.shift(-1)[0..-2],
        :count => hist[0..-2].to_ca,
      }
      return CADataFrame.new(hash)
    end
  end

  # Assigns each value of column +name+ to a class and returns a frame with
  # the columns "<name>_M", "<name>_L", "<name>_R" and "<name>_CLASS".
  #
  # Without +scale+ every distinct value is its own class (class number =
  # position in column.uniq). With +scale+ (bin edges) the class comes from
  # scale.bin, and _M/_L/_R hold the bin midpoint and edges projected per
  # row.
  #
  # opt - overrides for :include_upper (false), :include_lowest (true) and
  #       :offset (0) passed to scale.bin.
  def classify(name, scale = nil, opt = {})
    if not scale
      column = @columns[name.to_s]
      mids   = column.uniq
      mapper = {}
      mids.each_with_index do |v,i|
        mapper[v] = i
      end
      # The original called `columns.convert`, i.e. on the Hash of all
      # columns, which has no #convert; the selected column was intended.
      cls = column.convert(:int32) {|v| mapper[v] }
      hash = {
        "#{name}_M" => mids,
        "#{name}_L" => mids,
        "#{name}_R" => mids,
        "#{name}_CLASS" => cls
      }
    else
      option = {
        :include_upper  => false,
        :include_lowest => true,
        :offset         => 0,
      }.update(opt)
      column = @columns[name.to_s]
      cls = scale.bin(column,
                      option[:include_upper],
                      option[:include_lowest],
                      option[:offset])
      mids = ((scale + scale.shifted(-1))/2)[0..-2].to_ca
      left = scale[0..-2]
      right = scale.shift(-1)[0..-2]
      hash = {
        "#{name}_M" => mids.project(cls).to_ca,
        "#{name}_L" => left.project(cls).to_ca,
        "#{name}_R" => right.project(cls).to_ca,
        "#{name}_CLASS" => cls
      }
    end
    return CADataFrame.new(hash)
  end

  # Cross tabulation of two columns: rows are the sorted distinct values of
  # name1, columns those of name2, cells the number of rows with that pair.
  def cross(name1, name2)
    col1 = column(name1)
    col2 = column(name2)
    var1 = col1.uniq.sort
    var2 = col2.uniq.sort
    hash = {}
    # Seed every combination with 0 so that pairs that never occur show up.
    var1.each do |v1|
      var2.each do |v2|
        hash[[v1,v2]] = 0
      end
    end
    list = CArray.join([col1, col2]).to_a
    list.each do |item|
      hash[item] += 1
    end
    out = CArray.object(var1.size, var2.size) { 0 }
    var1.each_with_index do |v1, i|
      var2.each_with_index do |v2, j|
        out[i,j] = hash[[v1,v2]]
      end
    end
    return CADataFrame.new(out, row_index: var1, column_names: var2)
  end

end
1246
-
1247
-
1248
- #############################################################
1249
- #
1250
- # GROUPING
1251
- #
1252
- #############################################################
1253
-
1254
class CADataFrame

  # Groups the rows by one or more columns.
  #
  # Exactly one name gives a CADataFrameGroup; any other number of names
  # gives a CADataFrameGroupMulti.
  def group_by(*names)
    return CADataFrameGroup.new(self, names[0]) if names.size == 1
    CADataFrameGroupMulti.new(self, *names)
  end

end
1265
-
1266
# Rows of a data frame partitioned by the values of a single column.
#
# For every key the row addresses where the grouping column equals that
# key are computed once at construction time and reused by all methods.
class CADataFrameGroup

  include Enumerable

  # @param dataframe the data frame to partition (must respond to #col,
  #   #[], #each_column and #vacant_copy)
  # @param name the grouping column name, or a one-entry Hash
  #   { name => keys } that supplies the keys explicitly instead of
  #   deriving them from the column's sorted unique values
  def initialize(dataframe, name)
    @dataframe = dataframe
    case name
    when Hash
      name, list = name.first
      @column = @dataframe.col(name)
      @keys = list.to_ca
    else
      @column = @dataframe.col(name)
      @keys = @column.uniq.sort
    end
    @name = name.to_s
    @addrs = {}
    @keys.each do |k|
      @addrs[k] = @column.eq(k).where
    end
  end

  # Runs +block+ through #execute on each group's sub-frame and collects
  # the results into a new frame with one row per key.
  #
  # NOTE(review): each #execute result is iterated as key/value pairs, so
  # it is presumably a Hash -- confirm against CADataFrame#execute.
  #
  # @return [CADataFrame] the grouping column plus one column per result key
  def table(&block)
    hashpool = []
    @keys.each do |k|
      hashpool << @dataframe[@addrs[k]].execute(&block)
    end
    columns = { @name => @keys }
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    CADataFrame.new(columns)
  end

  # Reduces every non-grouping column to one value per group.
  #
  # With a block, the block receives (column_name, group_values) and its
  # result is stored. Without a block, the method named by +label+ is sent
  # to the group's values.
  #
  # @param label [String, Symbol] method name used when no block is given
  # @return [CADataFrame] the grouping column plus one reduced column per
  #   remaining column; cells whose computation raised stay UNDEF
  def calculate(label, &block)
    new_columns = { @name => @keys }
    @dataframe.each_column do |name, clmn|
      next if name == @name
      new_columns[name] = CArray.object(@keys.size) { UNDEF }
      @keys.each_with_index do |k, i|
        begin
          if block
            new_columns[name][i] = yield(name, clmn[@addrs[k]])
          else
            # to_sym accepts both String and Symbol; Symbol has no #intern,
            # so a Symbol label used to fail here and be silently swallowed.
            new_columns[name][i] = clmn[@addrs[k]].send(label.to_sym)
          end
        rescue StandardError
          # Deliberate best effort: a failing cell is left as UNDEF.
        end
      end
    end
    CADataFrame.new(new_columns)
  end

  # @param group_value a key of the grouping column
  # @return the sub-frame for that key, or a vacant copy of the frame when
  #   the key is unknown
  def [](group_value)
    if map = @addrs[group_value]
      @dataframe[map]
    else
      @dataframe.vacant_copy
    end
  end

  # Yields the sub-frame of each group in key order.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    @addrs.each do |_key, map|
      yield @dataframe[map]
    end
  end

  # Yields each sub-frame together with its group key (not a numeric index).
  # Returns an Enumerator when called without a block.
  def each_with_index
    return to_enum(:each_with_index) unless block_given?
    @addrs.each do |key, map|
      yield @dataframe[map], key
    end
  end

end
1345
-
1346
# Rows of a data frame partitioned by the combined values of several
# columns.
#
# Every combination from the Cartesian product of the per-column keys is a
# group; its row addresses are computed once at construction time.
class CADataFrameGroupMulti

  # @param dataframe the data frame to partition (must respond to #col,
  #   #[] and #vacant_copy)
  # @param names grouping column names; each may instead be a one-entry
  #   Hash { name => keys } supplying that column's keys explicitly
  def initialize(dataframe, *names)
    @rank = names.size
    @dataframe = dataframe
    @names = []
    @column = []
    @keys = []
    names.each_with_index do |name, i|
      case name
      when Hash
        name, list = name.first
        @column[i] = @dataframe.col(name)
        @keys[i] = list.to_ca
      else
        @column[i] = @dataframe.col(name)
        @keys[i] = @column[i].to_ca.uniq.sort
      end
      @names[i] = name
    end
    @addrs = {}
    each_with_keys do |list|
      # A row belongs to the group when every grouping column matches.
      flag = @column[0].eq(list[0])
      (1...@rank).each do |i|
        flag &= @column[i].eq(list[i])
      end
      @addrs[list] = flag.where
    end
  end

  # Iterates over every key combination (an Array with one key per
  # grouping column). Without a block the underlying Array#each returns an
  # Enumerator, which #table relies on.
  def each_with_keys(&block)
    @keys[0].to_a.product(*@keys[1..-1].map(&:to_a)).each(&block)
  end

  # Runs +block+ through #execute on each group's sub-frame and collects
  # the results into a new frame with one row per key combination.
  #
  # NOTE(review): each #execute result is iterated as key/value pairs, so
  # it is presumably a Hash -- confirm against CADataFrame#execute.
  #
  # @return [CADataFrame] one column per grouping name plus one column per
  #   result key
  def table(&block)
    hashpool = []
    each_with_keys do |list|
      hashpool << @dataframe[@addrs[list]].execute(&block)
    end
    columns = {}
    @names.each do |name|
      columns[name] = []
    end
    each_with_keys.with_index do |list, j|
      @names.each_with_index do |name, i|
        columns[name][j] = list[i]
      end
    end
    hashpool.each_with_index do |hash, i|
      hash.each do |key, value|
        columns[key] ||= []
        columns[key][i] = value
      end
    end
    CADataFrame.new(columns)
  end

  # @param group_value [Array] one key per grouping column
  # @return the sub-frame for that combination, or a vacant copy of the
  #   frame when the combination is unknown
  def [](group_value)
    if map = @addrs[group_value]
      @dataframe[map]
    else
      @dataframe.vacant_copy
    end
  end

  # Yields (key_combination, sub_frame) for every group.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    each_with_keys do |key|
      yield key, @dataframe[@addrs[key]]
    end
  end

end
1418
-
1419
- #############################################################
1420
- #
1421
- # PIVOT TABLE
1422
- #
1423
- #############################################################
1424
-
1425
class CADataFrame

  # Builds a pivot helper over two columns of this data frame.
  #
  # @return [CADataFramePivot]
  def pivot(name1, name2)
    CADataFramePivot.new(self, name1, name2)
  end

end
1432
-
1433
# Two-way partition of a data frame: one set of keys per axis, with the row
# addresses of every (key1, key2) cell computed at construction time.
class CADataFramePivot

  # @param dataframe the data frame to pivot
  # @param name1 row-axis column name, or a one-entry Hash { name => keys }
  # @param name2 column-axis column name, or a one-entry Hash { name => keys }
  def initialize(dataframe, name1, name2)
    @dataframe = dataframe

    if Hash === name1
      name1, row_keys = name1.first
      @column1 = @dataframe.col(name1)
      @keys1 = row_keys.to_ca
    else
      @column1 = @dataframe.col(name1)
      @keys1 = @column1.uniq.sort
    end

    if Hash === name2
      name2, col_keys = name2.first
      @column2 = @dataframe.col(name2)
      # Stored as given (no to_ca conversion, unlike the row-axis keys).
      @keys2 = col_keys
    else
      @column2 = @dataframe.col(name2)
      @keys2 = @column2.uniq.sort
    end

    @addrs = {}
    @keys1.each do |row_key|
      @keys2.each do |col_key|
        matches = @column1.eq(row_key) & @column2.eq(col_key)
        @addrs[[row_key, col_key]] = matches.where
      end
    end
  end

  # Evaluates +block+ through #execute on every cell's sub-frame.
  #
  # @return [CADataFrame] one column per column-axis key, indexed by the
  #   row-axis keys
  def table(&block)
    columns = {}
    @keys2.each do |col_key|
      columns[col_key] = CArray.object(@keys1.size) { UNDEF }
    end
    @keys1.each_with_index do |row_key, i|
      @keys2.each do |col_key|
        cell = @dataframe[@addrs[[row_key, col_key]]]
        columns[col_key][i] = cell.execute(&block)
      end
    end
    CADataFrame.new(columns, row_index: @keys1)
  end

end
1477
-
1478
-
1479
- #############################################################
1480
- #
1481
- # CArray
1482
- #
1483
- #############################################################
1484
-
1485
-
1486
class CArray

  # Counts how often each distinct element occurs.
  #
  # @return [CADataFrame] a 'value' column holding the unique elements and
  #   a 'count' column holding their occurrence counts
  def value_counts
    distinct = uniq
    tally = {}
    distinct.each { |v| tally[v] = 0 }
    each { |v| tally[v] += 1 }
    occurrences = distinct.convert { |v| tally[v] }
    CADataFrame.new({'value' => distinct, 'count' => occurrences})
  end

end
1502
-
1503
-
1504
-
1505
class CADataFrame

  # Forwards to #to_sqlite3 of the underlying table object (+ca+).
  def to_sqlite3(*args)
    ca.to_sqlite3(*args)
  end

  # Writes this frame into an in-memory SQLite3 database table.
  #
  # Column names containing '.', ' ' or '-' have those characters replaced
  # by '_' before the frame is written.
  #
  # @param tablename name of the table to create
  def to_sql(tablename)
    unless @column_names.any? { |s| s =~ /[\. \-]/ }
      return to_sqlite3(database: ":memory:", table: tablename)
    end
    renamed = {}
    each_column_name do |name|
      safe_name = name.gsub(/[\. \-]/, '_')
      renamed[safe_name] = column(name)
    end
    CADataFrame.new(renamed).to_sqlite3(database: ":memory:", table: tablename)
  end

end
1526
-
1527
module SQLite3

  class Database

    # Loads the result of +expr+ on this database as a CADataFrame
    # by delegating to CADataFrame.load_sqlite3.
    def to_df(expr)
      CADataFrame.load_sqlite3(self, expr)
    end

  end

end
1538
-
1539
- ######################################
1540
- #
1541
- # IO methods
1542
- #
1543
- ######################################
1544
-
1545
- require "spreadsheet"
1546
-
1547
class CArray

  # Writes this array to an Excel file, one worksheet row per index
  # along the first dimension.
  #
  # If a block is given it is called with the worksheet before the file
  # is written.
  #
  # @param filename path of the file to write
  # @raise [RuntimeError] if the array's rank is 3 or more
  def save_excel(filename, &block)
    raise "too large rank (>2) to write excel file" if self.rank >= 3
    book = Spreadsheet::Workbook.new
    worksheet = book.create_worksheet
    self.dim0.times do |i|
      worksheet.row(i).push(*self[i, nil])
    end
    block.call(worksheet) if block
    book.write(filename)
  end

  # Reads one worksheet of an Excel file into a CArray.
  #
  # @param filename path of the file to read
  # @param sheet worksheet selector passed to Spreadsheet's #worksheet
  #   (defaults to 0)
  def self.load_excel(filename, sheet = 0)
    worksheet = Spreadsheet.open(filename).worksheet(sheet)
    worksheet.map(&:to_a).to_ca
  end

end
1571
-
1572
class CADataFrame

  # Loads a table via CArray.load_sqlite3, converts it to a data frame and
  # applies `maskout nil` over all columns.
  def self.load_sqlite3(*args)
    frame = CArray.load_sqlite3(*args).to_dataframe
    frame.arrange { maskout nil, *column_names }
  end

  # Loads a table via CArray.load_csv, converts it to a data frame and
  # applies `maskout nil` over all columns.
  def self.load_csv(*args, &block)
    frame = CArray.load_csv(*args, &block).to_dataframe
    frame.arrange { maskout nil, *column_names }
  end

  # Builds a table via CArray.from_csv, converts it to a data frame and
  # applies `maskout nil` over all columns.
  def self.from_csv(*args, &block)
    frame = CArray.from_csv(*args, &block).to_dataframe
    frame.arrange { maskout nil, *column_names }
  end

  # Serializes the frame as CSV by delegating to the table's #to_csv.
  #
  # When the frame has a row index and +with_row_index+ is true, the index
  # is emitted as a leading column with an empty name.
  def to_csv (io = "", option = {}, rs: $/, sep: ",", fill: "", with_row_index: true, &block)
    tbl =
      if @row_index and with_row_index
        namelist = [""] + @column_names
        CADFArray.new(namelist, @columns.clone.update("" => @row_index))
      else
        ca.to_ca
      end
    tbl.to_csv(io, option, rs: rs, sep: sep, fill: fill, &block)
  end

  # Converts the frame to a Daru::DataFrame (daru is required lazily),
  # keeping the column order and, if present, the row index.
  def to_daru
    require "daru"
    data = {}
    each_column_name { |name| data[name] = column(name).to_a }
    unless @row_index
      return Daru::DataFrame.new(data, order: @column_names)
    end
    Daru::DataFrame.new(data, index: @row_index.to_a, order: @column_names)
  end

  # Writes the frame to an .xlsx file using axlsx (required lazily).
  #
  # The header row holds the column names; masked elements are written as
  # "=NA()". If a block is given it receives the worksheet before the
  # package is serialized.
  #
  # @param filename path of the file to write
  # @param sheet_name name of the worksheet
  # @param with_row_index when true, prepend the row index as first column
  def to_xlsx(filename, sheet_name: 'Sheet1', with_row_index: false, &block)
    require "axlsx"
    xl = Axlsx::Package.new
    xl.use_shared_strings = true
    sheet = xl.workbook.add_worksheet(name: sheet_name)
    df = self.to_df.objectify.unmask("=NA()")
    if with_row_index
      sheet.add_row([""] + column_names)
      df.each_row_with_row_index(with: Array) do |list, i|
        sheet.add_row([i] + list)
      end
    else
      sheet.add_row(column_names)
      df.each_row(with: Array) do |list|
        sheet.add_row(list)
      end
    end
    yield sheet if block_given?
    xl.serialize(filename)
  end

end
1634
-
1635
-
1636
-
1637
-
1638
-
1639
-
1640
-