rust 0.7 → 0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,127 +1,34 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
4
 
10
- R_ENGINE = RinRuby.new(echo: false)
5
+ ##
6
+ # Mirror of the data-frame type in R.
11
7
 
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
8
+ class DataFrame < RustDatatype
9
+ def self.can_pull?(type, klass)
10
+ return [klass].flatten.include?("data.frame")
40
11
  end
41
12
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
13
+ def self.pull_priority
14
+ 1
108
15
  end
109
16
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
17
+ def self.pull_variable(variable, type, klass)
117
18
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
19
+ colnames = Rust["colnames(#{variable})"]
119
20
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
21
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
22
  end
122
23
  return DataFrame.new(hash)
123
24
  end
124
25
 
26
+ ##
27
+ # Creates a new data-frame.
28
+ # +labels_or_data+ can be either:
29
+ # - an Array of column names (creates an empty data-frame)
30
+ # - a Hash with column names as keys and arrays of column values as values
31
+
125
32
  def initialize(labels_or_data)
126
33
  @data = {}
127
34
 
@@ -137,6 +44,9 @@ module Rust
137
44
  end
138
45
  end
139
46
 
47
+ ##
48
+ # Returns the +i+-th row of the data-frame
49
+
140
50
  def row(i)
141
51
  if i < 0 || i >= self.rows
142
52
  return nil
@@ -145,6 +55,9 @@ module Rust
145
55
  end
146
56
  end
147
57
 
58
+ ##
59
+ # Returns the +i+-th row of the data-frame. Faster (but harder to interpret) alternative to #row.
60
+
148
61
  def fast_row(i)
149
62
  if i < 0 || i >= self.rows
150
63
  return nil
@@ -153,6 +66,9 @@ module Rust
153
66
  end
154
67
  end
155
68
 
69
+ ##
70
+ # Shuffles the rows in the data-frame. The arguments are passed to the Array#shuffle method.
71
+
156
72
  def shuffle(*args)
157
73
  result = DataFrame.new(@labels)
158
74
 
@@ -167,6 +83,10 @@ module Rust
167
83
  return result
168
84
  end
169
85
 
86
+ ##
87
+ # Returns a copy of the data-frame containing only the specified +rows+ and/or +cols+. If +rows+ and/or +cols+
88
+ # are nil, all the rows/columns are returned.
89
+
170
90
  def [](rows, cols=nil)
171
91
  raise "You must specify either rows or columns to select" if !rows && !cols
172
92
  result = self
@@ -182,11 +102,17 @@ module Rust
182
102
  return result
183
103
  end
184
104
 
105
+ ##
106
+ # Returns the column named +name+.
107
+
185
108
  def column(name)
186
109
  return @data[name]
187
110
  end
188
111
  alias :| :column
189
112
 
113
+ ##
114
+ # Renames the column named +old_name+ to +new_name+.
115
+
190
116
  def rename_column!(old_name, new_name)
191
117
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
192
118
  raise "This DataFrame already contains a column named #{new_name}" if @labels.include?(new_name)
@@ -195,10 +121,24 @@ module Rust
195
121
  @labels[@labels.index(old_name)] = new_name
196
122
  end
197
123
 
124
+ ##
125
+ # Functionally transforms the column named +column+ by applying the function given as a block.
126
+ # Example:
127
+ # df = Rust::DataFrame.new({a: [1,2,3], b: [3,4,5]})
128
+ # df.transform_column!("a") { |v| v + 1 }
129
+ # df|"a" # => [2, 3, 4]
130
+
198
131
  def transform_column!(column)
199
132
  @data[column].map! { |e| yield e }
200
133
  end
201
134
 
135
+ ##
136
+ # Returns a copy of the data-frame containing only the rows for which the function given in the block returns true.
137
+ # Example:
138
+ # df = Rust::DataFrame.new({a: [1,2,3], b: ['a','b','c']})
139
+ # df2 = df.select_rows { |r| r['a'].even? }
140
+ # df2|"b" # => ['b']
141
+
202
142
  def select_rows
203
143
  result = DataFrame.new(self.column_names)
204
144
  self.each_with_index do |row, i|
@@ -207,6 +147,9 @@ module Rust
207
147
  return result
208
148
  end
209
149
 
150
+ ##
151
+ # Returns true if the function given in the block returns true for any of the rows in this data-frame.
152
+
210
153
  def has_row?
211
154
  self.each_with_index do |row, i|
212
155
  return true if yield row, i
@@ -214,6 +157,10 @@ module Rust
214
157
  return false
215
158
  end
216
159
 
160
+ ##
161
+ # Returns a copy of the data-frame with only the columns in +cols+. As an alternative, a block can be used
162
+ # (only the columns for which the function returns true are kept).
163
+
217
164
  def select_columns(cols=nil)
218
165
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
219
166
 
@@ -229,23 +176,35 @@ module Rust
229
176
  end
230
177
  alias :select_cols :select_columns
231
178
 
179
+ ##
180
+ # Deletes the column named +column+.
181
+
232
182
  def delete_column(column)
233
183
  @labels.delete(column)
234
184
  @data.delete(column)
235
185
  end
236
186
 
187
+ ##
188
+ # Deletes the +i+-th row.
189
+
237
190
  def delete_row(i)
238
191
  @data.each do |label, column|
239
192
  column.delete_at(i)
240
193
  end
241
194
  end
242
195
 
196
+ ##
197
+ # Returns a data-frame in which the rows are unique in terms of all the given columns named +by+.
198
+
243
199
  def uniq_by(by)
244
200
  result = self.clone
245
201
  result.uniq_by!(by)
246
202
  return result
247
203
  end
248
204
 
205
+ ##
206
+ # Makes sure that in this data-frame the rows are unique in terms of all the given columns named +by+.
207
+
249
208
  def uniq_by!(by)
250
209
  my_keys = {}
251
210
  to_delete = []
@@ -268,19 +227,33 @@ module Rust
268
227
  return self
269
228
  end
270
229
 
230
+ ##
231
+ # Returns the names of the columns.
232
+
271
233
  def column_names
272
234
  return @labels.map { |k| k.to_s }
273
235
  end
274
236
  alias :colnames :column_names
275
237
 
238
+ ##
239
+ # Returns the number of rows.
240
+
276
241
  def rows
277
242
  @data.values[0].size
278
243
  end
279
244
 
245
+ ##
246
+ # Returns the number of columns
247
+
280
248
  def columns
281
249
  @labels.size
282
250
  end
283
251
 
252
+ ##
253
+ # Adds the given +row+ to the data-frame. +row+ can be either:
254
+ # - An Array of values for all the columns (in the order of #column_names);
255
+ # - A Hash containing associations between column names and value to be set.
256
+
284
257
  def add_row(row)
285
258
  if row.is_a?(Array)
286
259
  raise "Expected an array of size #{@data.size}" unless row.size == @data.size
@@ -304,6 +277,11 @@ module Rust
304
277
  end
305
278
  alias :<< :add_row
306
279
 
280
+ ##
281
+ # Adds a column named +name+ with the given +values+ (array). The size of +values+ must match the number of
282
+ # rows of this data-frame. As an alternative, it can be passed a block which returns, for a given row, the
283
+ # value to assign for the new column.
284
+
307
285
  def add_column(name, values=nil)
308
286
  raise "Column already exists" if @labels.include?(name)
309
287
  raise "Values or block required" if !values && !block_given?
@@ -320,6 +298,9 @@ module Rust
320
298
  end
321
299
  end
322
300
 
301
+ ##
302
+ # Yields each row as a Hash containing column names as keys and values as values.
303
+
323
304
  def each
324
305
  self.each_with_index do |element, i|
325
306
  yield element
@@ -328,6 +309,10 @@ module Rust
328
309
  return self
329
310
  end
330
311
 
312
+ ##
313
+ # Yields each row as an Array of values (not a Hash, unlike #each). Faster alternative to
314
+ # #each.
315
+
331
316
  def fast_each
332
317
  self.fast_each_with_index do |element, i|
333
318
  yield element
@@ -336,6 +321,9 @@ module Rust
336
321
  return self
337
322
  end
338
323
 
324
+ ##
325
+ # Yields each row as a Hash containing column names as keys and values as values and the row index.
326
+
339
327
  def each_with_index
340
328
  for i in 0...self.rows
341
329
  element = {}
@@ -349,6 +337,10 @@ module Rust
349
337
  return self
350
338
  end
351
339
 
340
+ ##
341
+ # Yields each row as an Array of values (not a Hash, unlike #each_with_index) and the row index. Faster
342
+ # alternative to #each_with_index.
343
+
352
344
  def fast_each_with_index
353
345
  for i in 0...self.rows
354
346
  element = []
@@ -373,6 +365,14 @@ module Rust
373
365
  row_index += 1
374
366
  end
375
367
 
368
+ self.column_names.each do |name|
369
+ column = self.column(name)
370
+
371
+ if column.is_a?(Factor)
372
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
373
+ end
374
+ end
375
+
376
376
  Rust._eval_big(command)
377
377
  end
378
378
 
@@ -397,6 +397,9 @@ module Rust
397
397
  return result
398
398
  end
399
399
 
400
+ ##
401
+ # Returns a copy of the data-frame containing only the first +n+ rows.
402
+
400
403
  def head(n=10)
401
404
  result = DataFrame.new(self.column_names)
402
405
  self.each_with_index do |row, i|
@@ -405,6 +408,11 @@ module Rust
405
408
  return result
406
409
  end
407
410
 
411
+ ##
412
+ # Merges this data-frame with +other+ in terms of the +by+ columns (Array of Strings).
413
+ # +first_alias+ and +second_alias+ specify the prefix that should be used for the columns not in +by+
414
+ # for this and the +other+ data-frame, respectively.
415
+
408
416
  def merge(other, by, first_alias = "x", second_alias = "y")
409
417
  raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
410
418
  raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
@@ -471,6 +479,14 @@ module Rust
471
479
  return result
472
480
  end
473
481
 
482
+ ##
483
+ # Aggregates the values in groups depending on the +by+ column (String).
484
+ # A block must be passed to specify how to aggregate the columns. Aggregators for specific columns can be
485
+ # specified as optional arguments in which the name of the argument represents the column name and the value
486
+ # contains a Proc for aggregating the specific column.
487
+ # Both the default and the specialized blocks must take as argument an array of values and must return a
488
+ # scalar value.
489
+
474
490
  def aggregate(by, **aggregators)
475
491
  raise TypeError, "Expected a string" unless by.is_a?(String)
476
492
  raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
@@ -511,12 +527,18 @@ module Rust
511
527
  return result
512
528
  end
513
529
 
530
+ ##
531
+ # Returns a copy of this data-frame in which the rows are sorted by the values of the +column+ column.
532
+
514
533
  def sort_by(column)
515
534
  result = self.clone
516
535
  result.sort_by!(column)
517
536
  return result
518
537
  end
519
538
 
539
+ ##
540
+ # Sorts the rows of this data-frame by the values of the +by+ column.
541
+
520
542
  def sort_by!(by)
521
543
  copy = @data[by].clone
522
544
  copy.sort!
@@ -542,6 +564,9 @@ module Rust
542
564
  @data[by].sort!
543
565
  end
544
566
 
567
+ ##
568
+ # Adds all the rows in +dataframe+ to this data-frame. The column names must match.
569
+
545
570
  def bind_rows!(dataframe)
546
571
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
547
572
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -554,6 +579,9 @@ module Rust
554
579
  end
555
580
  alias :rbind! :bind_rows!
556
581
 
582
+ ##
583
+ # Adds all the columns in +dataframe+ to this data-frame. The number of rows must match.
584
+
557
585
  def bind_columns!(dataframe)
558
586
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
559
587
  raise "The number of rows are not compatible" if self.rows != dataframe.rows
@@ -567,6 +595,9 @@ module Rust
567
595
  end
568
596
  alias :cbind! :bind_columns!
569
597
 
598
+ ##
599
+ # Returns a copy of this data-frame to which all the rows in +dataframe+ are added. The column names must match.
600
+
570
601
  def bind_rows(dataframe)
571
602
  result = self.clone
572
603
  result.bind_rows!(dataframe)
@@ -574,6 +605,9 @@ module Rust
574
605
  end
575
606
  alias :rbind :bind_rows
576
607
 
608
+ ##
609
+ # Returns a copy of this data-frame to which all the columns in +dataframe+ are added. The number of rows must match.
610
+
577
611
  def bind_columns(dataframe)
578
612
  result = self.clone
579
613
  result.bind_columns!(dataframe)
@@ -581,88 +615,22 @@ module Rust
581
615
  end
582
616
  alias :cbind :bind_columns
583
617
 
618
+ ##
619
+ # Returns a copy of this data-frame.
620
+
584
621
  def clone
585
622
  DataFrame.new(@data)
586
623
  end
587
624
  end
588
625
 
589
- class Matrix < RustDatatype
590
- def self.pull_variable(variable)
591
- return Rust._pull(variable)
592
- end
593
-
594
- def initialize(data)
595
- if data.flatten.size == 0
596
- raise "Empty matrices are not allowed"
597
- else
598
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
599
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
600
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
601
- @data = data.clone
602
- end
603
- end
604
-
605
- def [](i, j)
606
- return @data[i][j]
607
- end
608
-
609
- def rows
610
- @data.size
611
- end
612
-
613
- def cols
614
- @data[0].size
615
- end
616
-
617
- def []=(i, j, value)
618
- raise "Wrong i" unless i.between?(0, @data.size - 1)
619
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
620
- @data[i][j] = value
621
- end
622
-
623
- def load_in_r_as(variable_name)
624
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
625
- end
626
- end
626
+ ##
627
+ # Represents an array of DataFrame objects.
627
628
 
628
- class Sequence < RustDatatype
629
- attr_reader :min
630
- attr_reader :max
631
-
632
- def initialize(min, max, step=1)
633
- @min = min
634
- @max = max
635
- @step = step
636
- end
637
-
638
- def step(step)
639
- @step = step
640
- end
641
-
642
- def each
643
- (@min..@max).step(@step) do |v|
644
- yield v
645
- end
646
- end
647
-
648
- def to_a
649
- result = []
650
- self.each do |v|
651
- result << v
652
- end
653
- return result
654
- end
629
+ class DataFrameArray < Array
655
630
 
656
- def to_R
657
- "seq(from=#@min, to=#@max, by=#@step)"
658
- end
631
+ ##
632
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
659
633
 
660
- def load_in_r_as(variable_name)
661
- Rust._eval("#{variable_name} <- #{self.to_R}")
662
- end
663
- end
664
-
665
- class DataFrameArray < Array
666
634
  def bind_all
667
635
  return nil if self.size == 0
668
636
 
@@ -676,7 +644,14 @@ module Rust
676
644
  end
677
645
  end
678
646
 
647
+ ##
648
+ # Represents a hash of DataFrame objects.
649
+
679
650
  class DataFrameHash < Hash
651
+
652
+ ##
653
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
654
+
680
655
  def bind_all
681
656
  return nil if self.values.size == 0
682
657
 
@@ -689,151 +664,4 @@ module Rust
689
664
  return result
690
665
  end
691
666
  end
692
-
693
- class MathArray < Array
694
- def -(other)
695
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
696
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
697
-
698
- result = self.clone
699
- other = [other] * self.size if other.is_a?(Numeric)
700
- for i in 0...self.size
701
- result[i] -= other[i]
702
- end
703
-
704
- return result
705
- end
706
-
707
- def *(other)
708
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
709
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
710
-
711
- result = self.clone
712
- other = [other] * self.size if other.is_a?(Numeric)
713
- for i in 0...self.size
714
- result[i] *= other[i]
715
- end
716
-
717
- return result
718
- end
719
-
720
- def +(other)
721
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
722
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
723
-
724
- result = self.clone
725
- other = [other] * self.size if other.is_a?(Numeric)
726
- for i in 0...self.size
727
- result[i] += other[i]
728
- end
729
-
730
- return result
731
- end
732
-
733
- def /(other) #To recover the syntax highlighting but in Kate: /
734
- raise ArgumentError, "Expected array or numeric" if !other.is_a?(::Array) && !other.is_a?(Numeric)
735
- raise ArgumentError, "The two arrays must have the same size" if other.is_a?(::Array) && self.size != other.size
736
-
737
- result = self.clone
738
- other = [other] * self.size if other.is_a?(Numeric)
739
- for i in 0...self.size
740
- result[i] /= other[i]
741
- end
742
-
743
- return result
744
- end
745
-
746
- def **(other)
747
- raise ArgumentError, "Expected numeric" if !other.is_a?(Numeric)
748
-
749
- result = self.clone
750
- for i in 0...self.size
751
- result[i] = result[i] ** other
752
- end
753
-
754
- return result
755
- end
756
- end
757
- end
758
-
759
- class TrueClass
760
- def to_R
761
- "TRUE"
762
- end
763
- end
764
-
765
- class FalseClass
766
- def to_R
767
- "FALSE"
768
- end
769
- end
770
-
771
- class Object
772
- def to_R
773
- raise TypeError, "Unsupported type for #{self.class}"
774
- end
775
- end
776
-
777
- class NilClass
778
- def to_R
779
- return "NULL"
780
- end
781
- end
782
-
783
- class Numeric
784
- def to_R
785
- self.inspect
786
- end
787
- end
788
-
789
- class Float
790
- def to_R
791
- return self.nan? ? "NA" : super
792
- end
793
- end
794
-
795
- class Array
796
- def to_R
797
- return "c(#{self.map { |e| e.to_R }.join(",")})"
798
- end
799
-
800
- def distribution
801
- result = {}
802
- self.each do |value|
803
- result[value] = result[value].to_i + 1
804
- end
805
- return result
806
- end
807
- end
808
-
809
- class String
810
- def to_R
811
- return self.inspect
812
- end
813
- end
814
-
815
- class Range
816
- def to_R
817
- [range.min, range.max].to_R
818
- end
819
- end
820
-
821
- module Rust::RBindings
822
- def data_frame(*args)
823
- Rust::DataFrame.new(*args)
824
- end
825
- end
826
-
827
- module Rust::TestCases
828
- def self.sample_dataframe(columns, size=100)
829
- result = Rust::DataFrame.new(columns)
830
- size.times do |i|
831
- result << columns.map { |c| yield i, c }
832
- end
833
- return result
834
- end
835
- end
836
-
837
- def bind_r!
838
- include Rust::RBindings
839
667
  end