carray-dataframe 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1279 @@
1
+ require "carray"
2
+ require "carray/table"
3
+
4
# Convenience constructor: CADataFrame(...) is shorthand for
# CADataFrame.new(...).
#
# Positional arguments, keyword options and an optional block are all
# forwarded unchanged, so CADataFrame(data, index: ...) { ... } behaves
# exactly like the equivalent CADataFrame.new call.
def CADataFrame(*argv, **opts, &block)
  CADataFrame.new(*argv, **opts, &block)
end
7
+
8
+ class CADataFrame
9
+
10
+ #
11
+ # Constructor
12
+ #
13
# Builds a data frame from one of several input shapes.
#
# data    - Hash (name => column), CArray (2-D table), Array of Hash
#           (one hash per row), Array of CArray columns, or Array of row
#           Arrays.
# index   - nil, a column name (String/Symbol) or any object responding to
#           to_ca; handed to set_index.
# columns - column names for CArray / Array input (rejected for Hash input).
# order   - column names giving the final column order; also used as the
#           column names when columns is not given.
# clone   - not implemented; a truthy value raises NotImplementedError.
# block   - optional block passed to arrange once the frame is built.
#
# Raises RuntimeError on unknown data, missing or mismatched column names,
# an invalid order option, or columns of unequal length.
def initialize(data, index: nil, columns: nil, order: nil, clone: false, &block)
  # @column_names = Array holds column names and its order
  # @column_data  = Hash holds data entities
  # @row_number   = Integer holds number of rows
  # @row_index    = CArray stores row index (any object)
  # @__methods__  = Hash of method-name aliases (see #method)

  # Stores data entity
  case data
  when Hash
    raise "columns option is not needed for hash data" if columns
    @column_data  = columns_to_columns(data)
    @column_names = @column_data.keys
  when CArray
    if columns
      @column_names = columns.map(&:to_s)
    elsif data.respond_to?(:column_names)
      @column_names = data.column_names.map(&:to_s)
    elsif order
      @column_names = order.map(&:to_s)
    else
      raise "can't determin column names use columns or order option"
    end
    if @column_names.size != data.dim1
      raise "mismatch between 'column_names' and table columns"
    end
    @column_data = table_to_columns(data)
  when Array
    case data.first
    when Hash
      # One hash per row: the union of all keys (first-seen order) gives
      # the columns; rows lacking a key get UNDEF in that column.
      all_keys = data.each_with_object({}) { |hash, seen| seen.update(hash) }.keys
      @column_data  = {}
      @column_names = []
      all_keys.each do |k|
        values = data.map do |hash|
          value = hash[k]
          # Only a missing/nil value becomes UNDEF; a literal false is kept.
          value.nil? ? UNDEF : value
        end
        name = k.to_s
        @column_names << name
        @column_data[name] = values.to_ca
      end
    else
      if columns
        @column_names = columns.map(&:to_s)
      elsif order
        @column_names = order.map(&:to_s)
      else
        raise "columns or order option should be given"
      end
      @column_data = array_to_columns(data)
    end
  else
    raise "unknown data"
  end

  # Re-orders the columns as requested by the 'order' option
  if order
    raise 'invalid order option' if @column_names.size != order.size
    new_column_data = {}
    order.each do |key|
      name = key.to_s
      unless @column_data.has_key?(name)
        # Double quotes so the offending name is actually interpolated.
        raise "invalid column name '#{name}' in order option"
      end
      new_column_data[name] = @column_data[name]
    end
    @column_data  = new_column_data
    @column_names = new_column_data.keys
  end

  # Sets @row_number and check column length
  @row_number = @column_data.first[1].size
  if @column_names.any? { |key| @column_data[key].size != @row_number }
    raise "column sizes mismatch"
  end

  # Processing option 'index'
  set_index(index, inplace: true)
  @__methods__ = {}

  if clone
    raise NotImplementedError, "copy option is not implemented"
  end

  arrange(&block) if block_given?
end
108
+
109
+ attr_reader :column_data, :column_names, :row_index, :row_number
110
+
111
# Returns the Hash mapping column names to column data (the same object
# exposed by the column_data reader).
def columns
  @column_data
end
114
+
115
# Returns the Hash of method-name aliases registered through #method.
def __methods__
  @__methods__
end
118
+
119
# Splits a 2-D table into one column per entry of @column_names.
#
# table - object indexed as table[nil, i] to obtain the i-th column.
#
# Returns a Hash of column name (String) => table[nil, i].to_ca.
def table_to_columns(table)
  @column_names.each_with_index.each_with_object({}) do |(name, i), cols|
    cols[name.to_s] = table[nil, i].to_ca
  end
end
126
+
127
+ private :table_to_columns
128
+
129
# Normalises a Hash of columns into a Hash of String name => rank-1 column.
#
# columns - Hash of name => CArray, Array, or any object responding to
#           to_ca. An entry whose key is "" is stored as @row_index instead
#           of becoming a column.
#
# Raises RuntimeError when a non-CArray/non-Array entry cannot be converted.
def columns_to_columns(columns)
  converted = {}
  columns.each_with_index do |(key, col), k|
    column =
      case col
      when CArray
        col.rank == 1 ? col : col.flatten
      when Array
        arr = col.to_ca
        if arr.rank == 1
          arr
        else
          # Nested arrays: keep each element as one object cell.
          rest = col.clone
          CArray.object(rest.size).convert { rest.shift }
        end
      else
        begin
          arr = col.to_ca
          arr.rank == 1 ? arr : arr.flatten
        rescue
          raise "#{k}-th column can't be converted to CArray"
        end
      end
    if key == ""
      @row_index = column
    else
      converted[key.to_s] = column
    end
  end
  converted
end
159
+
160
+ private :columns_to_columns
161
+
162
# Converts Array input into a Hash of column name => column, using the
# names already stored in @column_names.
#
# array - either an Array of CArray columns (one per name) or an Array of
#         row Arrays, which is transposed into columns.
#
# Raises RuntimeError when the number of names does not fit the data or
# the array content is neither CArray nor Array.
def array_to_columns(array)
  new_columns = {}
  case array.first
  when CArray
    # Compare against the argument itself (there is no `data` in scope here).
    if @column_names.size != array.size
      raise "mismatch between 'columns' and table columns"
    end
    @column_names.each_with_index do |key, k|
      column = array[k]
      column = column.flatten unless column.rank == 1
      new_columns[key.to_s] = column
    end
  when Array
    table = array.transpose
    # More names than columns would otherwise fail on nil.to_ca below.
    if @column_names.size > table.size
      raise "mismatch between 'columns' and table columns"
    end
    @column_names.each_with_index do |key, k|
      new_columns[key.to_s] = table[k].to_ca
    end
  else
    raise "invalid array content for CADataFrame"
  end
  new_columns
end
184
+
185
+ private :array_to_columns
186
+
187
# Sets the row index.
#
# index   - nil (leave the index unchanged), a column name (String/Symbol),
#           or an object responding to to_ca used directly as the index.
# drop    - when a column name is given, remove that column from the frame
#           (default true) or keep it.
# inplace - true modifies the receiver; false works on to_df instead.
#
# Returns the modified frame. Raises RuntimeError for an unknown column.
def set_index(index, drop: true, inplace: false)
  return to_df.set_index(index, drop: drop, inplace: true) unless inplace

  case index
  when nil
    # nothing to change
  when String, Symbol
    name = index.to_s
    raise "can't find column named '#{name}'" unless @column_names.include?(name)
    if drop
      @row_index = @column_data.delete(name)
      @column_names.delete(name)
    else
      @row_index = @column_data[name]
    end
  else
    @row_index = index.to_ca
  end
  self
end
208
+
209
# Takes over the state of another frame: column names, column data, row
# index, row number and method aliases are copied by reference.
#
# Returns self.
def replace(other)
  @column_names, @column_data, @row_index, @row_number, @__methods__ =
    other.column_names, other.column_data, other.row_index,
    other.row_number, other.__methods__
  self
end
217
+
218
# Tells whether the frame has a column with the given name.
#
# name - String or Symbol; it is compared by its string form, matching how
#        #column resolves names.
#
# Returns true or false.
def has_column?(name)
  @column_names.include?(name.to_s)
end
221
+
222
# Returns the data_type_name of every column, in column order.
def column_types
  @column_data.values_at(*@column_names).map(&:data_type_name)
end
225
+
226
+ #
227
+ # Column, Row Access
228
+ #
229
+
230
# Looks up a single column.
#
# spec - Integer position in @column_names, or a String/Symbol name.
#
# Returns the column data, or nil when nothing matches.
# Raises RuntimeError for any other kind of specifier.
def column(spec)
  key =
    case spec
    when Integer        then @column_names[spec]
    when String, Symbol then spec.to_s
    else raise "invalid column specifier"
    end
  @column_data[key]
end
240
+ alias col column
241
+
242
# Returns the memoised CADataFrame::LocAccessor bound to this frame.
def loc
  @loc ||= CADataFrame::LocAccessor.new(self)
end
246
+
247
# Returns the memoised CADataFrame::ILocAccessor bound to this frame.
# The block parameter is accepted but not used.
def iloc(&block)
  @iloc ||= CADataFrame::ILocAccessor.new(self)
end
251
+
252
+ # TO BE FIXED
253
# Returns the row index as @row_index.to_ca, or the integer sequence built
# by CArray.int(@row_number).seq when no row index is set.
def index
  return CArray.int(@row_number).seq unless @row_index

  @row_index.to_ca
end
260
+
261
# Returns the leading rows: row[0..k-1], where k is the smaller of n and
# the row count.
def head(n = 10)
  last = [@row_number, n].min - 1
  row[0..last]
end
265
+
266
# Returns the trailing rows: row[-k..-1], where k is the smaller of n and
# the row count.
def tail(n = 10)
  first = -[@row_number, n].min
  row[first..-1]
end
270
+
271
# Registers method-name aliases for columns.
#
# hash - alias => column name; both sides are stored as Strings.
#
# Returns the updated alias Hash.
def method(hash)
  aliases = hash.each_with_object({}) do |(key, value), acc|
    acc[key.to_s] = value.to_s
  end
  @__methods__.update(aliases)
end
278
+
279
+ def method_missing (name, *args)
280
+ if args.size == 0
281
+ name = name.to_s
282
+ if has_column?(name)
283
+ return @column_data[name]
284
+ elsif has_column?(name.gsub(/_/,'.')) ### For R
285
+ return @column_data[name.gsub(/_/,'.')]
286
+ elsif @__methods__.include?(name)
287
+ return @column_data[@__methods__[name]]
288
+ end
289
+ end
290
+ raise "no method '#{name}' for CADataFrame"
291
+ end
292
+
293
+ #
294
+ # Iterators
295
+ #
296
+
297
# Iterates over (name, column) pairs of @column_data.
def each_column(&block)
  @column_data.each(&block)
end
300
+
301
# Iterates over the column names in column order.
def each_column_name(&block)
  @column_names.each(&block)
end
304
+
305
# Iterates over the row labels: the elements of @row_index when one is
# set, otherwise the integers 0...@row_number.
def each_row_index(&block)
  return @row_number.times(&block) unless @row_index

  @row_index.each(&block)
end
312
+
313
+ def each_row (with: Array, columns: nil, &block)
314
+ case columns
315
+ when Array
316
+ column_names = columns
317
+ when Regexp
318
+ column_names = @column_names.grep(columns)
319
+ else
320
+ column_names = @column_names
321
+ end
322
+ if with == Array
323
+ @row_number.times do |i|
324
+ yield column_names.map{|n| @column_data[n][i] }
325
+ end
326
+ elsif with == Hash
327
+ row = {}
328
+ @row_number.times do |i|
329
+ column_names.each do |c|
330
+ row[c] = @column_data[c][i]
331
+ end
332
+ yield row
333
+ end
334
+ elsif with == CArray
335
+ joined = CArray.join(@column_data.values_at(*column_names))
336
+ joined[:i,nil].each do |block|
337
+ yield block.to_ca.compact
338
+ end
339
+ else
340
+ raise "invalid data type for loop variable"
341
+ end
342
+ end
343
+
344
# Yields every row together with its row label.
#
# with - Array (default): yields [values Array, label];
#        Hash: yields [name => value Hash, label]. The same Hash object is
#        reused and overwritten for every row.
#
# The label is the matching element of @row_index, or the row number when
# the frame has no row index.
#
# Raises RuntimeError for any other value of with.
def each_row_with_row_index(with: Array, &block)
  if with == Array
    if @row_index
      @row_index.each_with_index do |idx, i|
        yield @column_data.map { |_name, col| col[i] }, idx
      end
    else
      @row_number.times do |i|
        yield @column_data.map { |_name, col| col[i] }, i
      end
    end
  elsif with == Hash
    row = {}
    load_row = lambda do |i|
      @column_names.each { |c| row[c] = @column_data[c][i] }
    end
    if @row_index
      @row_index.each_with_index do |idx, i|
        load_row.call(i)
        yield row, idx
      end
    else
      # Integer#times yields a single value: the row number doubles as label.
      @row_number.times do |i|
        load_row.call(i)
        yield row, i
      end
    end
  else
    raise "invalid data type for loop variable"
  end
end
376
+
377
# Overwrites cells in place according to a mask frame.
#
# mask - frame whose columns are read through .boolean; only columns that
#        also exist in the receiver are processed.
# args - one value: assigned where the mask is false;
#        two values: first assigned where the mask is false, second where
#        it is true. Any other number of values changes nothing.
def where(mask, *args)
  mask.column_names.each do |key|
    next unless has_column?(key)

    if args.size == 1
      column(key)[mask.column(key).boolean.not] = args[0]
    elsif args.size == 2
      column(key)[mask.column(key).boolean.not] = args[0]
      column(key)[mask.column(key).boolean]     = args[1]
    end
  end
end
390
+
391
# Calls fill(value) on every named column that exists; unknown names are
# skipped silently. The last argument is the fill value.
#
# Returns self.
def fill(*names, value)
  names.each { |name| column(name).fill(value) if has_column?(name) }
  self
end
399
+
400
+ #
401
+ # Arrange
402
+ #
403
+
404
# Hands the block to a new Arranger built around this frame and returns
# whatever Arranger#arrange returns.
def arrange(&block)
  Arranger.new(self).arrange(&block)
end
407
+
408
# Renames a column in place, keeping its position in @column_names.
#
# name1 - current name (String/Symbol).
# name2 - new name (String/Symbol).
#
# Returns the renamed column. Raises RuntimeError when name1 is unknown.
def rename(name1, name2)
  old_name = name1.to_s
  position = @column_names.index(old_name)
  raise "unknown column name #{name1}" unless position

  new_name = name2.to_s
  @column_names[position] = new_name
  @column_data[new_name] = @column_data.delete(old_name)
end
418
+
419
# Lower-cases every column name in place. When two names collapse to the
# same lower-case key, the later column's data is kept under that key.
#
# Returns self.
def downcase
  lowered = @column_names.map(&:downcase)
  @column_data = lowered.zip(@column_names).each_with_object({}) do |(low, original), cols|
    cols[low] = @column_data[original]
  end
  @column_names = lowered
  self
end
430
+
431
# Adds a column at the end of the frame.
#
# name       - column name (stored as String).
# new_column - column data; when omitted, the block's result (evaluated
#              with instance_exec) is used, and without a block an object
#              template of the first column.
#
# Returns the stored column.
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
def append_column(name, new_column = nil, &block)
  new_column ||= block ? instance_exec(&block) : @column_data.first[1].template(:object)
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  unless new_column.rank == 1 && new_column.size == @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.push(key)
  @column_data[key] = new_column
  new_column
end
449
+
450
+ alias append append_column
451
+
452
# Adds a column at the front of the frame.
#
# name       - column name (stored as String).
# new_column - column data; when omitted, the block's result (evaluated
#              with instance_exec) is used, and without a block an object
#              template of the first column.
#
# Returns the stored column.
# Raises RuntimeError unless the column is rank 1 with @row_number elements.
def prepend_column(name, new_column = nil, &block)
  new_column ||= block ? instance_exec(&block) : @column_data.first[1].template(:object)
  new_column = new_column.to_ca unless new_column.is_a?(CArray)
  unless new_column.rank == 1 && new_column.size == @row_number
    raise "invalid shape of appended column"
  end
  key = name.to_s
  @column_names.unshift(key)
  @column_data[key] = new_column
  new_column
end
470
+
471
+ alias lead prepend_column
472
+
473
# Removes columns from the receiver (in place, through #replace).
#
# columns - Strings, Symbols and/or Regexps (matched against the column
#           names). With no arguments the frame is returned untouched.
#
# Returns self. Raises RuntimeError for any other kind of specifier.
def drop_column(*columns)
  return self if columns.empty?

  doomed = columns.flat_map do |c|
    case c
    when String then [c]
    when Symbol then [c.to_s]
    when Regexp then @column_names.grep(c)
    else raise "invalid column specification"
    end
  end
  kept = {}
  each_column_name do |name|
    kept[name] = column(name) unless doomed.include?(name)
  end
  replace CADataFrame.new(kept, index: @row_index)
end
499
+
500
+ alias eliminate_column drop_column
501
+
502
# Returns a new frame with the same column names, each column being
# CArray.object(0).
def vacant_copy
  empty = {}
  each_column_name { |key| empty[key] = CArray.object(0) }
  CADataFrame.new(empty)
end
509
+
510
# Instance form of CADataFrame.merge with the receiver as first frame.
def merge(*args)
  CADataFrame.merge(self, *args)
end
513
+
514
# Evaluates the block in the context of the frame (instance_exec). A block
# declaring exactly one parameter also receives the frame as its argument.
#
# Returns the block's result.
def execute(&block)
  block.arity == 1 ? instance_exec(self, &block) : instance_exec(&block)
end
522
+
523
# Reduces every column to a single value and returns a one-row frame
# indexed by label.
#
# label - row label of the result; without a block it is also the name of
#         the method sent to each column.
# block - optional; called with (name, column) to compute the value.
#
# A column whose computation raises gets UNDEF instead.
def calculate(label, &block)
  results = {}
  each_column_name do |name|
    results[name] =
      begin
        [block ? yield(name, column(name)) : column(name).send(label.intern)]
      rescue
        [UNDEF]
      end
  end
  CADataFrame.new(results, index: [label])
end
538
+
539
# Builds a new frame from the block's result for each (name, column).
# A column for which the block raises is silently left out.
def resample(&block)
  rebuilt = {}
  each_column_name do |name|
    begin
      rebuilt[name] = yield(name, column(name))
    rescue
      # skip columns the block cannot handle
    end
  end
  CADataFrame.new(rebuilt)
end
549
+
550
# Returns a new frame restricted to some columns and/or rows.
#
# columns - Strings, Symbols and/or Regexps; all columns when empty.
# block   - optional; evaluated with instance_exec, its result is used to
#           subscript every selected column and the row index. Without a
#           block the subscript is nil.
#
# Raises RuntimeError for any other kind of column specifier.
def select(*columns, &block)
  names =
    if columns.empty?
      @column_names
    else
      columns.flat_map do |c|
        case c
        when String then [c]
        when Symbol then [c.to_s]
        when Regexp then @column_names.grep(c)
        else raise "invalid column specification"
        end
      end
    end
  row = block ? instance_exec(&block) : nil
  picked = {}
  names.map(&:to_s).each { |name| picked[name] = column(name)[row] }
  CADataFrame.new(picked, index: @row_index ? @row_index[row] : nil)
end
579
+ #
580
+ # Maintenance
581
+ #
582
+
583
# Calls unmask(value) on every column of the receiver and returns self.
def unmask!(value = nil)
  each_column_name { |name| column(name).unmask(value) }
  self
end
589
+
590
# Non-destructive variant of unmask!: works on the copy made by to_df.
def unmask(value = nil)
  to_df.unmask!(value)
end
593
+
594
# Replaces the column Hash with a clone whose values are the to_ca result
# of each column, and clones the row index if there is one.
#
# Returns self.
def detouch
  @column_data = @column_data.clone
  @column_names.each do |name|
    @column_data[name] = @column_data[name].to_ca
  end
  @row_index = @row_index.clone if @row_index
  self
end
604
+
605
# Returns a new frame holding only the rows in which every column reports
# is_not_masked. The row index is not carried over.
def delete_masked_rows
  seed = @column_data.first[1].template(:boolean) { true }
  keep = @column_names.inject(seed) do |flags, name|
    flags & @column_data[name].is_not_masked
  end
  surviving = {}
  @column_names.each do |name|
    surviving[name] = @column_data[name].to_ca[keep]
  end
  CADataFrame.new(surviving)
end
616
+
617
# Returns a new frame built by subscripting every column with the block's
# result (evaluated with instance_eval). The row index is not carried over.
#
# NOTE(review): the subscript selects the rows addressed by the block's
# result, so despite the method name those rows are the ones kept --
# confirm this is the intended meaning before relying on it.
def delete_rows(&block)
  selector = instance_eval(&block)
  remaining = {}
  @column_names.each do |name|
    remaining[name] = @column_data[name].to_ca[selector]
  end
  CADataFrame.new(remaining)
end
625
+ #
626
+ # Transformation
627
+ #
628
+
629
# Returns a new frame whose columns and row index are subscripted with the
# block's result (evaluated with instance_exec).
def reorder(&block)
  addr = instance_exec(&block)
  rearranged = {}
  each_column_name { |name| rearranged[name] = column(name)[addr] }
  CADataFrame.new(rearranged, index: @row_index ? @row_index[addr] : nil)
end
637
+
638
# Returns a new frame reordered by CA.sort_addr over the sort keys.
#
# names - column names used as sort keys, in order.
# block - used only when no names are given; evaluated with instance_exec
#         and expected to return a CArray or an Array of sort keys.
def order_by(*names, &block)
  if !names.empty?
    list = @column_data.values_at(*names.map { |s| s.to_s })
  elsif block
    ret = instance_exec(&block)
    list =
      case ret
      when CArray then [ret]
      when Array  then ret
      end
  end
  reorder { CA.sort_addr(*list) }
end
654
+
655
# Returns a new frame with every column, and the row index if present,
# reversed.
def reverse
  flipped = {}
  each_column_name { |name| flipped[name] = column(name).reverse }
  CADataFrame.new(flipped, index: @row_index ? @row_index.reverse : nil)
end
662
+
663
# Returns the transposed frame: the old column names become the row index.
#
# columns - names for the new columns; defaults to the stringified row
#           index, or to a sequence generated from "a" with :succ when the
#           frame has no row index.
def transpose(columns: nil)
  names =
    if columns
      columns.map(&:to_s)
    elsif @row_index
      @row_index.convert(:object) { |v| v.to_s }
    else
      CArray.object(@row_number).seq("a", :succ)
    end
  CADataFrame.new(ca.transpose, index: @column_names.to_ca, columns: names)
end
675
+
676
# Returns a new frame whose column names all have suf appended; the
# columns and row index are passed on as they are.
def add_suffix(suf)
  renamed = {}
  each_column_name do |name|
    renamed[(name.to_s + suf).to_s] = column(name)
  end
  CADataFrame.new(renamed, index: @row_index)
end
684
+ #
685
+ # Conversions
686
+ #
687
+
688
# Returns a new frame over the same columns and row index, passed through
# #detouch.
def to_df
  same = {}
  each_column_name { |name| same[name] = column(name) }
  CADataFrame.new(same, index: @row_index).detouch
end
695
+
696
# Returns a new frame in which every column is replaced by column.object.
def objectify
  converted = {}
  each_column_name { |name| converted[name] = column(name).object }
  CADataFrame.new(converted, index: @row_index)
end
703
+
704
# Wraps the columns in a CADFArray.
#
# names - columns to include (stringified); all columns when empty.
def ca(*names)
  selected = names.empty? ? @column_names : names.map(&:to_s)
  CADFArray.new(selected, @column_data)
end
711
+
712
# Returns ca(*names).to_ca.
def to_ca(*names)
  ca(*names).to_ca
end
715
+
716
# Returns a Hash of column name => column.to_a. When a row index is set it
# is stored first under the "index" key, as it is (not converted with to_a).
def to_hash
  result = @row_index ? { "index" => @row_index } : {}
  @column_data.each_with_object(result) { |(k, v), acc| acc[k] = v.to_a }
end
726
+
727
+ alias to_h to_hash
728
+
729
# Builds a lookup Hash from one key column to one or more value columns.
#
# key_name    - name of the column supplying the keys.
# value_names - String: maps key => value of that column;
#               Array of names: maps key => Array of values.
#
# Later rows overwrite earlier rows sharing the same key.
# Raises ArgumentError for unknown column names or another argument type.
def columns_to_hash(key_name, value_names)
  unless @column_names.include?(key_name)
    raise ArgumentError, "include invalid key column name #{key_name}"
  end

  case value_names
  when String
    unless @column_names.include?(value_names)
      raise ArgumentError, "invalid key column name #{value_names}"
    end
    keys   = @column_data[key_name]
    values = @column_data[value_names]
    (0...@row_number).each_with_object({}) { |i, acc| acc[keys[i]] = values[i] }
  when Array
    unless value_names.all? { |s| @column_names.include?(s) }
      raise ArgumentError, "include invalid column name in #{value_names.join(' ')}"
    end
    keys   = @column_data[key_name]
    values = @column_data.values_at(*value_names)
    (0...@row_number).each_with_object({}) do |i, acc|
      acc[keys[i]] = values.map { |c| c[i] }
    end
  else
    raise ArgumentError, "invalud argument"
  end
end
758
+
759
+ private
760
+
761
# Formats one table cell: Floats with "%.6g", nil as the text "nil",
# everything else through to_s.
def __obj_to_string__(obj)
  if obj.is_a?(Float)
    format("%.6g", obj)
  elsif obj.equal?(nil)
    "nil"
  else
    obj.to_s
  end
end
771
+
772
# Returns the width used for column alignment: every single-byte character
# counts as 1 and every multi-byte character as 2.
def __strwidth__(string)
  return string.length if string.ascii_only?

  string.each_char.inject(0) { |width, ch| width + (ch.bytesize > 1 ? 2 : 1) }
end
779
+ public
780
+
781
# Renders the frame as a ruled, fixed-width text table.
#
# rowmax      - :full (default) prints every row; an Integer smaller than
#               the row count elides the middle rows with a "..." line.
# time_format - strftime pattern applied to CATimeIndex columns and row
#               index; defaults to "%F %T%:z".
# index       - when true (default) the row index (or the row numbers when
#               no index is set) is printed first, under a " " header.
#
# Returns a String that starts with a "DataFrame: ..." title line.
def ascii_table(rowmax = :full, time_format: nil, index: true)
  stamp   = time_format || "%F %T%:z"
  columns = @column_data.clone
  @column_names.each do |name|
    if columns[name].is_a?(CATimeIndex)
      columns[name] = columns[name].time.time_strftime(stamp)
    end
  end
  if index
    namelist = [" "] + @column_names
    if @row_index
      if @row_index.is_a?(CATimeIndex)
        row_index = @row_index.time.time_strftime(stamp)
      else
        row_index = @row_index
      end
    else
      row_index = CArray.int(@row_number).seq
    end
    tbl = CADFArray.new(namelist, columns.update(" " => row_index))
  else
    namelist = @column_names
    tbl = CADFArray.new(namelist, columns)
  end
  # Too many rows: keep the head and tail, separated by a "..." row.
  if rowmax.is_a?(Integer) and @row_number > rowmax
    list = tbl[0..(rowmax/2),nil].to_a
    list.push namelist.map { "..." }
    list.push *(tbl[-rowmax/2+1..-1,nil].to_a)
    tbl = list.to_ca
  end
  datastr = tbl.convert {|c| __obj_to_string__(c) }.unmask("")
  # mb flags whether any cell or header contains non-ASCII text
  # (NOTE(review): presumably one flag per column -- confirm against CArray).
  datamb  = datastr.convert(:boolean, &:"ascii_only?").not.sum(0).ne(0)
  namemb  = namelist.to_ca.convert(:boolean) {|c| c.to_s.ascii_only? }.eq(0)
  mb      = datamb.or(namemb)
  namelen = namelist.map(&:length).to_ca
  datalen = datastr.convert(&:length)
  if mb.max == 0
    # ASCII only: character count equals display width.
    if datalen.size == 0
      lengths = namelen.to_a
    else
      lengths = datalen.max(0).pmax(namelen).to_a
    end
    hrule  = "-" + lengths.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, lengths].transpose.map{|name, len|
                                  "#{name.to_s.ljust(len)}" }.join("  ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_index do |blk, i|
        list = blk.flatten.to_a
        ary << " " + [list, lengths].transpose.map{|value, len|
                                  "#{value.ljust(len)}"}.join("  ") + " "
      end
    end
    ary << hrule
    return "DataFrame: rows#=#{@row_number}: \n" + ary.join("\n")
  else
    # Multi-byte text: align on the width computed by __strwidth__.
    namewidth = namelist.to_ca.convert{|c| __strwidth__(c.to_s) }
    if datalen.size == 0
      maxwidth = namewidth
    else
      datawidth = datastr.convert{|c| __strwidth__(c.to_s) }
      maxwidth  = datawidth.max(0).pmax(namewidth)
      # ljust pads by character count, so each cell's target length is the
      # column width minus the cell's width plus its character count. Only
      # computed when there is data: datawidth does not exist otherwise.
      pad = maxwidth[:*,nil] - datawidth + datalen
    end
    hrule  = "-" + maxwidth.map {|len| "-"*len}.join("--") + "-"
    header = " " +
             [namelist, maxwidth.to_a].transpose.map{|name, len|
                                  "#{name.to_s.ljust(len-__strwidth__(name.to_s)+name.to_s.length)}" }.join("  ") + " "
    ary = [hrule, header, hrule]
    if datalen.size > 0
      datastr[:i,nil].each_with_addr do |blk, i|
        list = blk.flatten.to_a
        ary << " " + list.map.with_index {|value, j|
                                  "#{value.ljust(pad[i,j])}"}.join("  ") + " "
      end
    end
    ary << hrule
    return "DataFrame: row#=#{@row_number}: \n" + ary.join("\n")
  end
end
870
+
871
# Short textual form: ascii_table called with a row limit of 8.
def inspect
  ascii_table(8)
end
874
+
875
# Full textual form: ascii_table with its default arguments.
def to_s
  ascii_table
end
878
+
879
# Implicit Array conversion: a one-element Array holding to_s.
def to_ary
  [to_s]
end
882
+ end
883
+
884
+ #############################################################
885
+ #
886
+ # Class methods
887
+ #
888
+ #############################################################
889
class CADataFrame

  # Combines the columns of several frames side by side. When a name occurs
  # in more than one frame the last one wins. The row index is taken from
  # the first frame.
  def self.merge(*args)
    ref = args.first
    merged = args.each_with_object({}) do |table, cols|
      table.column_names.each { |name| cols[name] = table.col(name) }
    end
    CADataFrame.new(merged, index: ref.row_index)
  end

  # Stacks several frames on top of each other, using the column names of
  # the first frame. CATimeIndex columns go through CATimeIndex.concat,
  # others through CArray.bind with the first frame's data type. The row
  # index is concatenated only when every frame has one; otherwise it is nil.
  def self.concat(*args)
    ref = args.first
    stacked = {}
    ref.column_names.each do |name|
      parts = args.map { |t| t.column(name) }
      stacked[name] =
        if parts.first.is_a?(CATimeIndex)
          CATimeIndex.concat(*parts)
        else
          CArray.bind(parts.first.data_type, parts, 0)
        end
    end
    indexes = args.map(&:row_index)
    new_row_index =
      if !indexes.all?
        nil
      elsif indexes.first.is_a?(CATimeIndex)
        CATimeIndex.concat(*indexes)
      else
        CArray.join(*indexes).flatten
      end
    CADataFrame.new(stacked, index: new_row_index)
  end
end
928
+
929
+ #############################################################
930
+ #
931
+ # BASIC Comparison
932
+ #
933
+ #############################################################
934
class CADataFrame

  # Unary operations: each sends the same-named message to the CADFArray
  # returned by #ca and wraps the result in a new frame.
  %i[-@ is_masked is_finite].each do |op|
    define_method(op) { cmp(op) }
  end

  # Comparisons against another operand, handled the same way.
  %i[< <= > >=].each do |op|
    define_method(op) { |other| cmp(op, other) }
  end

  private

  # Sends method (with argv) to #ca and builds a frame from the result,
  # reusing the receiver's column names.
  def cmp(method, *argv)
    CADataFrame.new(ca.send(method, *argv), columns: @column_names)
  end

end
971
+
972
+ #############################################################
973
+ #
974
+ # BASIC Manipulations
975
+ #
976
+ #############################################################
977
class CADataFrame

  # Re-aligns the rows of the frame to a reference key column.
  #
  # keyname   - name (String/Symbol) of the key column.
  # reference - key values the result should follow; every other column is
  #             projected through reference.matchup(key)
  #             (NOTE(review): presumably the position of each reference
  #             value in the key column -- confirm against CArray#matchup).
  #
  # Returns a new frame whose key column is reference.
  def matchup (keyname, reference)
    key = column(keyname.to_s)
    idx = reference.matchup(key)
    new_columns = {}
    each_column_name do |name|
      # Compare as Strings so a Symbol keyname matches too.
      if name == keyname.to_s
        new_columns[name] = reference
      else
        new_columns[name] = column(name).project(idx)
      end
    end
    if @row_index
      new_row_index = @row_index.project(idx).unmask(nil)
    else
      new_row_index = nil
    end
    return CADataFrame.new(new_columns, index: new_row_index) {
      self.send(keyname)[] = reference
    }
  end

  # Counts the values of a column.
  #
  # name    - column name.
  # scale   - bin edges; when nil the counts are taken per distinct value
  #           through group_by.
  # options - optional second argument for CAHistogram.int.
  #
  # With a scale, returns a frame with the bin midpoints under name, the
  # edges under "<name>_L" / "<name>_R" and the counts under "count".
  def histogram (name, scale = nil, options = nil)
    if scale.nil?
      return group_by(name).table{ { :count => col(name).count_valid } }
    else
      if options
        hist = CAHistogram.int(scale, options)
      else
        hist = CAHistogram.int(scale)
      end
      hist.increment(@column_data[name.to_s])
      hash = {
        name.to_s => hist.midpoints[0],
        "#{name}_L".to_s => scale[0..-2],
        "#{name}_R".to_s => scale.shift(-1)[0..-2],
        :count => hist[0..-2].to_ca,
      }
      return CADataFrame.new(hash)
    end
  end

  # Assigns every row of a column to a class.
  #
  # name  - column name.
  # scale - bin edges; when nil every distinct value is its own class.
  # opt   - overrides for :include_upper (false), :include_lowest (true)
  #         and :offset (0), passed to scale.bin.
  #
  # Returns a frame with one row per input row and the columns "<name>_M",
  # "<name>_L", "<name>_R" and "<name>_CLASS".
  def classify (name, scale = nil, opt = {})
    if not scale
      column = @column_data[name.to_s]
      mids   = column.uniq
      mapper = {}
      mids.each_with_index do |v,i|
        mapper[v] = i
      end
      # Convert the column itself (not the column Hash) to class numbers.
      cls = column.convert(:int32) {|v| mapper[v] }
      # Project per row, as in the scaled branch, so that all four columns
      # have one entry per row.
      hash = {
        "#{name}_M" => mids.project(cls).to_ca,
        "#{name}_L" => mids.project(cls).to_ca,
        "#{name}_R" => mids.project(cls).to_ca,
        "#{name}_CLASS" => cls
      }
    else
      option = {
        :include_upper  => false,
        :include_lowest => true,
        :offset => 0,
      }.update(opt)
      column = @column_data[name.to_s]
      cls = scale.bin(column,
                      option[:include_upper],
                      option[:include_lowest],
                      option[:offset])
      mids = ((scale + scale.shifted(-1))/2)[0..-2].to_ca
      left = scale[0..-2]
      right = scale.shift(-1)[0..-2]
      hash = {
        "#{name}_M" => mids.project(cls).to_ca,
        "#{name}_L" => left.project(cls).to_ca,
        "#{name}_R" => right.project(cls).to_ca,
        "#{name}_CLASS" => cls
      }
    end
    return CADataFrame.new(hash)
  end

  # Cross-tabulates two columns: counts how often each pair of values
  # occurs.
  #
  # Returns a frame indexed by the sorted distinct values of name1, with
  # one column per sorted distinct value of name2.
  def cross (name1, name2)
    col1 = column(name1)
    col2 = column(name2)
    var1 = col1.uniq.sort
    var2 = col2.uniq.sort
    hash = {}
    # Start every combination at zero so unseen pairs are reported too.
    var1.each do |v1|
      var2.each do |v2|
        hash[[v1,v2]] = 0
      end
    end
    list = CArray.join([col1, col2]).to_a
    list.each do |item|
      hash[item] += 1
    end
    out = CArray.object(var1.size, var2.size) { 0 }
    var1.each_with_index do |v1, i|
      var2.each_with_index do |v2, j|
        out[i,j] = hash[[v1,v2]]
      end
    end
    return CADataFrame.new(out, index: var1, columns: var2)
  end
end
1084
+
1085
+ #############################################################
1086
+ #
1087
+ # CArray
1088
+ #
1089
+ #############################################################
1090
class CADataFrame

  # Returns a one-row frame, indexed "sum", holding each column's sum.
  def sum
    totals = {}
    each_column { |name, col| totals[name] = [col.sum] }
    CADataFrame.new(totals, index: ["sum"])
  end

  # Returns a one-row frame, indexed "mean", holding each column's mean.
  def mean
    averages = {}
    each_column { |name, col| averages[name] = [col.mean] }
    CADataFrame.new(averages, index: ["mean"])
  end
end
1108
+
1109
class CArray

  # Decides how the array should be described: :numeric when numeric?,
  # :categorical when boolean?, otherwise :numeric if dividing by 1 works
  # and :categorical if that raises.
  def describe_type
    case true
    when numeric? then :numeric
    when boolean? then :categorical
    else
      begin
        self / 1
        :numeric
      rescue
        :categorical
      end
    end
  end

  private :describe_type

  # Returns a Hash of summary statistics.
  #
  # as - forces :numeric or :categorical (anything responding to intern);
  #      by default the kind is chosen by describe_type.
  #
  # Raises RuntimeError for any other kind.
  def describe(as: nil)
    kind = as ? as.intern : describe_type
    if kind == :numeric
      describe_numeric
    elsif kind == :categorical
      describe_categorical
    else
      raise "unknown"
    end
  end

  # Returns count, mean, std, max, q75, median, q25 and min (in that key
  # order); the five order statistics are unpacked from quantile.
  def describe_numeric
    lowest, lower_q, middle, upper_q, highest = *quantile
    {
      count: is_masked.count_false,
      mean: mean,
      std: stddev,
      max: highest,
      q75: upper_q,
      median: middle,
      q25: lower_q,
      min: lowest,
    }
  end

  # Returns count, the number of distinct values, the most frequent value
  # (top) and how often it occurs (freq).
  def describe_categorical
    counts = {}
    each { |v| counts[v] = (counts[v] || 0) + 1 }
    top, freq = counts.max_by { |_value, n| n }
    {
      count: is_masked.count_false,
      unique: counts.size,
      top: top,
      freq: freq,
    }
  end

  # Returns the value => occurrence count Hash (see summary_categorical).
  def summary
    summary_categorical
  end

  # Counts how often each value occurs; returns a value => count Hash.
  def summary_categorical
    counts = {}
    each { |v| counts[v] = (counts[v] || 0) + 1 }
    counts
  end

end
1190
+
1191
class CADataFrame

  # Describes every column (column.describe) and returns the transposed
  # frame built from those results, indexed by the column names before
  # transposition.
  def describe
    stats = @column_data.map { |_name, column| column.describe }
    CADataFrame.new(stats, index: @column_names).transpose
  end

  # Same as #describe, but using column.summary and only for the columns
  # named in names (looked up in @column_data exactly as given).
  def summary(*names)
    data = names.map { |name| @column_data[name].summary }
    CADataFrame.new(data, index: names).transpose
  end

end
1210
+
1211
+
1212
class CArray

  # Returns a Hash with one entry per distinct value (from uniq): the value
  # maps to the result of eq(value) on the receiver.
  def get_dummies
    dummies = {}
    uniq.each { |key| dummies[key] = eq(key) }
    dummies
  end

end
1224
+
1225
class CADataFrame

  # Replaces the named columns by indicator columns, one per distinct value
  # (as produced by the column's get_dummies).
  #
  # names      - columns to expand (compared to the column names as given);
  #              all other columns are kept and come first in the result.
  # prefix     - nil: use the column name; String: use it for every column;
  #              Array: use the k-th entry for the k-th expanded column;
  #              Hash: look the prefix up by column name. With any other
  #              type the expanded column contributes no indicator columns.
  # prefix_sep - separator between prefix and value (default "_").
  def get_dummies(*names, prefix: nil, prefix_sep: "_")
    keep_columns = {}
    new_columns  = {}
    k = 0
    @column_names.each do |name|
      unless names.include?(name)
        keep_columns[name] = @column_data[name]
        next
      end
      dummies = @column_data[name].get_dummies
      supported = [NilClass, String, Array, Hash].any? { |type| type === prefix }
      if supported
        dummies.each do |v, dummy|
          stem =
            case prefix
            when nil    then name
            when String then prefix
            when Array  then prefix[k]
            when Hash   then prefix[name]
            end
          new_columns["#{stem}#{prefix_sep}#{v}"] = dummy
        end
      end
      k += 1
    end
    CADataFrame.new(keep_columns.update(new_columns), index: @row_index)
  end

end
1261
+
1262
+
1263
class CADataFrame

  # Serialises the frame with Marshal into the file at filename.
  #
  # File.open is used rather than Kernel#open, so a name starting with "|"
  # is treated as a path and never run as a command; binary mode keeps the
  # Marshal byte stream untranslated.
  def save(filename)
    File.open(filename, "wb") { |io| Marshal.dump(self, io) }
  end

  # Reads back a frame written by #save.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; only load
  # files that come from a trusted source.
  #
  # Raises RuntimeError ("invalid data") when the file does not contain a
  # CADataFrame.
  def self.load(filename)
    out = File.open(filename, "rb") { |io| Marshal.load(io) }
    raise "invalid data" unless out.is_a?(CADataFrame)
    out
  end

end