rust 0.7 → 0.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,127 +1,34 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
4
 
10
- R_ENGINE = RinRuby.new(echo: false)
5
+ ##
6
+ # Mirror of the data-frame type in R.
11
7
 
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
8
+ class DataFrame < RustDatatype
9
+ def self.can_pull?(type, klass)
10
+ return [klass].flatten.include?("data.frame")
40
11
  end
41
12
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
13
+ def self.pull_priority
14
+ 1
108
15
  end
109
16
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
17
+ def self.pull_variable(variable, type, klass)
117
18
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
19
+ colnames = Rust["colnames(#{variable})"]
119
20
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
21
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
22
  end
122
23
  return DataFrame.new(hash)
123
24
  end
124
25
 
26
+ ##
27
+ # Creates a new data-frame.
28
+ # +labels_or_data+ can be either:
29
+ # - an Array of column names (creates an empty data-frame)
30
+ # - a Hash with column names as keys and values as values
31
+
125
32
  def initialize(labels_or_data)
126
33
  @data = {}
127
34
 
@@ -137,6 +44,9 @@ module Rust
137
44
  end
138
45
  end
139
46
 
47
+ ##
48
+ # Returns the +i+-th row of the data-frame
49
+
140
50
  def row(i)
141
51
  if i < 0 || i >= self.rows
142
52
  return nil
@@ -145,6 +55,9 @@ module Rust
145
55
  end
146
56
  end
147
57
 
58
+ ##
59
+ # Returns the +i+-th row of the data-frame. Faster (but harder to interpret) alternative to #row.
60
+
148
61
  def fast_row(i)
149
62
  if i < 0 || i >= self.rows
150
63
  return nil
@@ -153,6 +66,9 @@ module Rust
153
66
  end
154
67
  end
155
68
 
69
+ ##
70
+ # Shuffles the rows in the data-frame. The arguments are passed to the Array#shuffle method.
71
+
156
72
  def shuffle(*args)
157
73
  result = DataFrame.new(@labels)
158
74
 
@@ -167,6 +83,10 @@ module Rust
167
83
  return result
168
84
  end
169
85
 
86
+ ##
87
+ # Returns a copy of the data-frame containing only the specified +rows+ and/or +cols+. If +rows+ and/or +cols+
88
+ # are nil, all the rows/columns are returned.
89
+
170
90
  def [](rows, cols=nil)
171
91
  raise "You must specify either rows or columns to select" if !rows && !cols
172
92
  result = self
@@ -182,11 +102,17 @@ module Rust
182
102
  return result
183
103
  end
184
104
 
105
+ ##
106
+ # Returns the column named +name+.
107
+
185
108
  def column(name)
186
109
  return @data[name]
187
110
  end
188
111
  alias :| :column
189
112
 
113
+ ##
114
+ # Renames the column named +old_name+ to +new_name+.
115
+
190
116
  def rename_column!(old_name, new_name)
191
117
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
192
118
  raise "This DataFrame already contains a column named #{new_name}" if @labels.include?(new_name)
@@ -195,10 +121,24 @@ module Rust
195
121
  @labels[@labels.index(old_name)] = new_name
196
122
  end
197
123
 
124
+ ##
125
+ # Functionally transforms the column named +column+ by applying the function given as a block.
126
+ # Example:
127
+ # df = Rust::DataFrame.new({a: [1,2,3], b: [3,4,5]})
128
+ # df.transform_column!("a") { |v| v + 1 }
129
+ # df|"a" # => [2, 3, 4]
130
+
198
131
  def transform_column!(column)
199
132
  @data[column].map! { |e| yield e }
200
133
  end
201
134
 
135
+ ##
136
+ # Returns a copy of the data-frame with only the rows for which the function given in the block returns true.
137
+ # Example:
138
+ # df = Rust::DataFrame.new({a: [1,2,3], b: ['a','b','c']})
139
+ # df2 = df.select_rows { |r| r['a'].even? }
140
+ # df2|"b" # => ['b']
141
+
202
142
  def select_rows
203
143
  result = DataFrame.new(self.column_names)
204
144
  self.each_with_index do |row, i|
@@ -207,6 +147,9 @@ module Rust
207
147
  return result
208
148
  end
209
149
 
150
+ ##
151
+ # Returns true if the function given in the block returns true for any of the rows in this data-frame.
152
+
210
153
  def has_row?
211
154
  self.each_with_index do |row, i|
212
155
  return true if yield row, i
@@ -214,6 +157,10 @@ module Rust
214
157
  return false
215
158
  end
216
159
 
160
+ ##
161
+ # Returns a copy of the data-frame with only the columns in +cols+. As an alternative, a block can be used
162
+ # (only the columns for which the function returns true are kept).
163
+
217
164
  def select_columns(cols=nil)
218
165
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
219
166
 
@@ -229,23 +176,35 @@ module Rust
229
176
  end
230
177
  alias :select_cols :select_columns
231
178
 
179
+ ##
180
+ # Deletes the column named +column+.
181
+
232
182
  def delete_column(column)
233
183
  @labels.delete(column)
234
184
  @data.delete(column)
235
185
  end
236
186
 
187
+ ##
188
+ # Deletes the +i+-th row.
189
+
237
190
  def delete_row(i)
238
191
  @data.each do |label, column|
239
192
  column.delete_at(i)
240
193
  end
241
194
  end
242
195
 
196
+ ##
197
+ # Returns a data-frame in which the rows are unique in terms of all the given columns named +by+.
198
+
243
199
  def uniq_by(by)
244
200
  result = self.clone
245
201
  result.uniq_by!(by)
246
202
  return result
247
203
  end
248
204
 
205
+ ##
206
+ # Makes sure that in this data-frame the rows are unique in terms of all the given columns named +by+.
207
+
249
208
  def uniq_by!(by)
250
209
  my_keys = {}
251
210
  to_delete = []
@@ -268,19 +227,33 @@ module Rust
268
227
  return self
269
228
  end
270
229
 
230
+ ##
231
+ # Returns the names of the columns.
232
+
271
233
  def column_names
272
234
  return @labels.map { |k| k.to_s }
273
235
  end
274
236
  alias :colnames :column_names
275
237
 
238
+ ##
239
+ # Returns the number of rows.
240
+
276
241
  def rows
277
242
  @data.values[0].size
278
243
  end
279
244
 
245
+ ##
246
+ # Returns the number of columns
247
+
280
248
  def columns
281
249
  @labels.size
282
250
  end
283
251
 
252
+ ##
253
+ # Adds the given +row+ to the data-frame. +row+ can be either:
254
+ # - An Array of values for all the columns (in the order of #column_names);
255
+ # - A Hash containing associations between column names and value to be set.
256
+
284
257
  def add_row(row)
285
258
  if row.is_a?(Array)
286
259
  raise "Expected an array of size #{@data.size}" unless row.size == @data.size
@@ -304,6 +277,11 @@ module Rust
304
277
  end
305
278
  alias :<< :add_row
306
279
 
280
+ ##
281
+ # Adds a column named +name+ with the given +values+ (array). The size of +values+ must match the number of
282
+ # rows of this data-frame. As an alternative, it can be passed a block which returns, for a given row, the
283
+ # value to assign for the new column.
284
+
307
285
  def add_column(name, values=nil)
308
286
  raise "Column already exists" if @labels.include?(name)
309
287
  raise "Values or block required" if !values && !block_given?
@@ -320,6 +298,9 @@ module Rust
320
298
  end
321
299
  end
322
300
 
301
+ ##
302
+ # Yields each row as a Hash containing column names as keys and values as values.
303
+
323
304
  def each
324
305
  self.each_with_index do |element, i|
325
306
  yield element
@@ -328,6 +309,10 @@ module Rust
328
309
  return self
329
310
  end
330
311
 
312
+ ##
313
+ # Yields each row as an Array of values (not a Hash). Faster alternative to
314
+ # #each.
315
+
331
316
  def fast_each
332
317
  self.fast_each_with_index do |element, i|
333
318
  yield element
@@ -336,6 +321,9 @@ module Rust
336
321
  return self
337
322
  end
338
323
 
324
+ ##
325
+ # Yields each row (as a Hash with column names as keys and the row's values as values) together with the row index.
326
+
339
327
  def each_with_index
340
328
  for i in 0...self.rows
341
329
  element = {}
@@ -349,6 +337,10 @@ module Rust
349
337
  return self
350
338
  end
351
339
 
340
+ ##
341
+ # Yields each row as an Array of values (not a Hash), together with the row index. Faster
342
+ # alternative to #each_with_index.
343
+
352
344
  def fast_each_with_index
353
345
  for i in 0...self.rows
354
346
  element = []
@@ -373,6 +365,14 @@ module Rust
373
365
  row_index += 1
374
366
  end
375
367
 
368
+ self.column_names.each do |name|
369
+ column = self.column(name)
370
+
371
+ if column.is_a?(Factor)
372
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
373
+ end
374
+ end
375
+
376
376
  Rust._eval_big(command)
377
377
  end
378
378
 
@@ -397,6 +397,9 @@ module Rust
397
397
  return result
398
398
  end
399
399
 
400
+ ##
401
+ # Returns a copy of the data-frame containing only the first +n+ rows.
402
+
400
403
  def head(n=10)
401
404
  result = DataFrame.new(self.column_names)
402
405
  self.each_with_index do |row, i|
@@ -405,6 +408,11 @@ module Rust
405
408
  return result
406
409
  end
407
410
 
411
+ ##
412
+ # Merges this data-frame with +other+ in terms of the +by+ column(s) (Array or String).
413
+ # +first_alias+ and +second_alias+ allow specifying the prefix that should be used for the columns not in +by+
414
+ # for this and the +other+ data-frame, respectively.
415
+
408
416
  def merge(other, by, first_alias = "x", second_alias = "y")
409
417
  raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
410
418
  raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
@@ -471,6 +479,14 @@ module Rust
471
479
  return result
472
480
  end
473
481
 
482
+ ##
483
+ # Aggregates the values in groups depending on the +by+ column (String).
484
+ # A block must be passed to specify how to aggregate the columns. Aggregators for specific columns can be
485
+ # specified as optional arguments in which the name of the argument represents the column name and the value
486
+ # contains a block for aggregating the specific column.
487
+ # Both the default and the specialized blocks must take as argument an array of values and must return a
488
+ # scalar value.
489
+
474
490
  def aggregate(by, **aggregators)
475
491
  raise TypeError, "Expected a string" unless by.is_a?(String)
476
492
  raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
@@ -511,12 +527,18 @@ module Rust
511
527
  return result
512
528
  end
513
529
 
530
+ ##
531
+ # Returns a copy of this data-frame in which the rows are sorted by the values of the column named +column+.
532
+
514
533
  def sort_by(column)
515
534
  result = self.clone
516
535
  result.sort_by!(column)
517
536
  return result
518
537
  end
519
538
 
539
+ ##
540
+ # Sorts the rows of this data-frame by the values of the +by+ column.
541
+
520
542
  def sort_by!(by)
521
543
  copy = @data[by].clone
522
544
  copy.sort!
@@ -542,6 +564,9 @@ module Rust
542
564
  @data[by].sort!
543
565
  end
544
566
 
567
+ ##
568
+ # Adds all the rows in +dataframe+ to this data-frame. The column names must match.
569
+
545
570
  def bind_rows!(dataframe)
546
571
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
547
572
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -554,6 +579,9 @@ module Rust
554
579
  end
555
580
  alias :rbind! :bind_rows!
556
581
 
582
+ ##
583
+ # Adds all the columns in +dataframe+ to this data-frame. The number of rows must match.
584
+
557
585
  def bind_columns!(dataframe)
558
586
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
559
587
  raise "The number of rows are not compatible" if self.rows != dataframe.rows
@@ -567,6 +595,9 @@ module Rust
567
595
  end
568
596
  alias :cbind! :bind_columns!
569
597
 
598
+ ##
599
+ # Returns a copy of this dataframe and adds all the rows in +dataframe+ to it. The column names must match.
600
+
570
601
  def bind_rows(dataframe)
571
602
  result = self.clone
572
603
  result.bind_rows!(dataframe)
@@ -574,6 +605,9 @@ module Rust
574
605
  end
575
606
  alias :rbind :bind_rows
576
607
 
608
+ ##
609
+ # Returns a copy of this dataframe and adds all the columns in +dataframe+ to it. The number of rows must match.
610
+
577
611
  def bind_columns(dataframe)
578
612
  result = self.clone
579
613
  result.bind_columns!(dataframe)
@@ -581,88 +615,22 @@ module Rust
581
615
  end
582
616
  alias :cbind :bind_columns
583
617
 
618
+ ##
619
+ # Returns a copy of this data-frame.
620
+
584
621
  def clone
585
622
  DataFrame.new(@data)
586
623
  end
587
624
  end
588
625
 
589
- class Matrix < RustDatatype
590
- def self.pull_variable(variable)
591
- return Rust._pull(variable)
592
- end
593
-
594
- def initialize(data)
595
- if data.flatten.size == 0
596
- raise "Empty matrices are not allowed"
597
- else
598
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
599
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
600
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
601
- @data = data.clone
602
- end
603
- end
604
-
605
- def [](i, j)
606
- return @data[i][j]
607
- end
608
-
609
- def rows
610
- @data.size
611
- end
612
-
613
- def cols
614
- @data[0].size
615
- end
616
-
617
- def []=(i, j, value)
618
- raise "Wrong i" unless i.between?(0, @data.size - 1)
619
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
620
- @data[i][j] = value
621
- end
622
-
623
- def load_in_r_as(variable_name)
624
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
625
- end
626
- end
626
+ ##
627
+ # Represents an array of DataFrame
627
628
 
628
- class Sequence < RustDatatype
629
- attr_reader :min
630
- attr_reader :max
631
-
632
- def initialize(min, max, step=1)
633
- @min = min
634
- @max = max
635
- @step = step
636
- end
637
-
638
- def step(step)
639
- @step = step
640
- end
641
-
642
- def each
643
- (@min..@max).step(@step) do |v|
644
- yield v
645
- end
646
- end
647
-
648
- def to_a
649
- result = []
650
- self.each do |v|
651
- result << v
652
- end
653
- return result
654
- end
629
+ class DataFrameArray < Array
655
630
 
656
- def to_R
657
- "seq(from=#@min, to=#@max, by=#@step)"
658
- end
631
+ ##
632
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
659
633
 
660
- def load_in_r_as(variable_name)
661
- Rust._eval("#{variable_name} <- #{self.to_R}")
662
- end
663
- end
664
-
665
- class DataFrameArray < Array
666
634
  def bind_all
667
635
  return nil if self.size == 0
668
636
 
@@ -676,7 +644,14 @@ module Rust
676
644
  end
677
645
  end
678
646
 
647
+ ##
648
+ # Represents a hash of DataFrame
649
+
679
650
  class DataFrameHash < Hash
651
+
652
+ ##
653
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
654
+
680
655
  def bind_all
681
656
  return nil if self.values.size == 0
682
657
 
@@ -689,151 +664,4 @@ module Rust
689
664
  return result
690
665
  end
691
666
  end
692
-
693
- class MathArray < Array
694
- def -(other)
695
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
696
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
697
-
698
- result = self.clone
699
- other = [other] * self.size if other.is_a?(Numeric)
700
- for i in 0...self.size
701
- result[i] -= other[i]
702
- end
703
-
704
- return result
705
- end
706
-
707
- def *(other)
708
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
709
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
710
-
711
- result = self.clone
712
- other = [other] * self.size if other.is_a?(Numeric)
713
- for i in 0...self.size
714
- result[i] *= other[i]
715
- end
716
-
717
- return result
718
- end
719
-
720
- def +(other)
721
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
722
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
723
-
724
- result = self.clone
725
- other = [other] * self.size if other.is_a?(Numeric)
726
- for i in 0...self.size
727
- result[i] += other[i]
728
- end
729
-
730
- return result
731
- end
732
-
733
- def /(other) #To recover the syntax highlighting but in Kate: /
734
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
735
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
736
-
737
- result = self.clone
738
- other = [other] * self.size if other.is_a?(Numeric)
739
- for i in 0...self.size
740
- result[i] /= other[i]
741
- end
742
-
743
- return result
744
- end
745
-
746
- def **(other)
747
- raise ArgumentError, "Expected numeric" if !other.is_a?(Numeric)
748
-
749
- result = self.clone
750
- for i in 0...self.size
751
- result[i] = result[i] ** other
752
- end
753
-
754
- return result
755
- end
756
- end
757
- end
758
-
759
- class TrueClass
760
- def to_R
761
- "TRUE"
762
- end
763
- end
764
-
765
- class FalseClass
766
- def to_R
767
- "FALSE"
768
- end
769
- end
770
-
771
- class Object
772
- def to_R
773
- raise TypeError, "Unsupported type for #{self.class}"
774
- end
775
- end
776
-
777
- class NilClass
778
- def to_R
779
- return "NULL"
780
- end
781
- end
782
-
783
- class Numeric
784
- def to_R
785
- self.inspect
786
- end
787
- end
788
-
789
- class Float
790
- def to_R
791
- return self.nan? ? "NA" : super
792
- end
793
- end
794
-
795
- class Array
796
- def to_R
797
- return "c(#{self.map { |e| e.to_R }.join(",")})"
798
- end
799
-
800
- def distribution
801
- result = {}
802
- self.each do |value|
803
- result[value] = result[value].to_i + 1
804
- end
805
- return result
806
- end
807
- end
808
-
809
- class String
810
- def to_R
811
- return self.inspect
812
- end
813
- end
814
-
815
- class Range
816
- def to_R
817
- [range.min, range.max].to_R
818
- end
819
- end
820
-
821
- module Rust::RBindings
822
- def data_frame(*args)
823
- Rust::DataFrame.new(*args)
824
- end
825
- end
826
-
827
- module Rust::TestCases
828
- def self.sample_dataframe(columns, size=100)
829
- result = Rust::DataFrame.new(columns)
830
- size.times do |i|
831
- result << columns.map { |c| yield i, c }
832
- end
833
- return result
834
- end
835
- end
836
-
837
- def bind_r!
838
- include Rust::RBindings
839
667
  end