rust 0.3 → 0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,123 +1,20 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
-
10
- R_ENGINE = RinRuby.new(echo: false)
11
-
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
4
+ class DataFrame < RustDatatype
5
+ def self.can_pull?(type, klass)
6
+ return [klass].flatten.include?("data.frame")
40
7
  end
41
8
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
9
+ def self.pull_priority
10
+ 1
108
11
  end
109
12
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
13
+ def self.pull_variable(variable, type, klass)
117
14
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
15
+ colnames = Rust["colnames(#{variable})"]
119
16
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
17
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
18
  end
122
19
  return DataFrame.new(hash)
123
20
  end
@@ -130,7 +27,10 @@ module Rust
130
27
  @labels.each { |label| @data[label] = [] }
131
28
  elsif labels_or_data.is_a? Hash
132
29
  @labels = labels_or_data.keys.map { |l| l.to_s }
133
- @data = labels_or_data.clone
30
+
31
+ labels_or_data.each do |key, value|
32
+ @data[key.to_s] = value.clone
33
+ end
134
34
  end
135
35
  end
136
36
 
@@ -142,6 +42,14 @@ module Rust
142
42
  end
143
43
  end
144
44
 
45
+ def fast_row(i)
46
+ if i < 0 || i >= self.rows
47
+ return nil
48
+ else
49
+ return @labels.map { |label| @data[label][i] }
50
+ end
51
+ end
52
+
145
53
  def shuffle(*args)
146
54
  result = DataFrame.new(@labels)
147
55
 
@@ -174,6 +82,7 @@ module Rust
174
82
  def column(name)
175
83
  return @data[name]
176
84
  end
85
+ alias :| :column
177
86
 
178
87
  def rename_column!(old_name, new_name)
179
88
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
@@ -195,6 +104,13 @@ module Rust
195
104
  return result
196
105
  end
197
106
 
107
+ def has_row?
108
+ self.each_with_index do |row, i|
109
+ return true if yield row, i
110
+ end
111
+ return false
112
+ end
113
+
198
114
  def select_columns(cols=nil)
199
115
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
200
116
 
@@ -215,6 +131,40 @@ module Rust
215
131
  @data.delete(column)
216
132
  end
217
133
 
134
+ def delete_row(i)
135
+ @data.each do |label, column|
136
+ column.delete_at(i)
137
+ end
138
+ end
139
+
140
+ def uniq_by(by)
141
+ result = self.clone
142
+ result.uniq_by!(by)
143
+ return result
144
+ end
145
+
146
+ def uniq_by!(by)
147
+ my_keys = {}
148
+ to_delete = []
149
+ self.each_with_index do |row, i|
150
+ key = []
151
+ by.each do |colname|
152
+ key << row[colname]
153
+ end
154
+ unless my_keys[key]
155
+ my_keys[key] = i
156
+ else
157
+ to_delete << (i-to_delete.size)
158
+ end
159
+ end
160
+
161
+ to_delete.each do |i|
162
+ self.delete_row(i)
163
+ end
164
+
165
+ return self
166
+ end
167
+
218
168
  def column_names
219
169
  return @labels.map { |k| k.to_s }
220
170
  end
@@ -243,7 +193,7 @@ module Rust
243
193
  row.each do |key, value|
244
194
  @data[key.to_s] << value
245
195
  end
246
- #
196
+
247
197
  return true
248
198
  else
249
199
  raise TypeError, "Expected an Array or a Hash"
@@ -275,6 +225,14 @@ module Rust
275
225
  return self
276
226
  end
277
227
 
228
+ def fast_each
229
+ self.fast_each_with_index do |element, i|
230
+ yield element
231
+ end
232
+
233
+ return self
234
+ end
235
+
278
236
  def each_with_index
279
237
  for i in 0...self.rows
280
238
  element = {}
@@ -288,6 +246,19 @@ module Rust
288
246
  return self
289
247
  end
290
248
 
249
+ def fast_each_with_index
250
+ for i in 0...self.rows
251
+ element = []
252
+ @labels.each do |label|
253
+ element << @data[label][i]
254
+ end
255
+
256
+ yield element, i
257
+ end
258
+
259
+ return self
260
+ end
261
+
291
262
  def load_in_r_as(variable_name)
292
263
  command = []
293
264
 
@@ -299,6 +270,14 @@ module Rust
299
270
  row_index += 1
300
271
  end
301
272
 
273
+ self.column_names.each do |name|
274
+ column = self.column(name)
275
+
276
+ if column.is_a?(Factor)
277
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
278
+ end
279
+ end
280
+
302
281
  Rust._eval_big(command)
303
282
  end
304
283
 
@@ -397,6 +376,77 @@ module Rust
397
376
  return result
398
377
  end
399
378
 
379
+ def aggregate(by, **aggregators)
380
+ raise TypeError, "Expected a string" unless by.is_a?(String)
381
+ raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
382
+ raise "Expected a block for default aggregator" unless block_given?
383
+
384
+ aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h
385
+
386
+ sorted = self.sort_by(by)
387
+
388
+ current_value = nil
389
+ partials = []
390
+ partial = nil
391
+ sorted.column(by).each_with_index do |value, index|
392
+ if current_value != value
393
+ current_value = value
394
+ partials << partial if partial
395
+ partial = Rust::DataFrame.new(self.column_names)
396
+ end
397
+ partial << sorted.fast_row(index)
398
+ end
399
+ partials << partial
400
+
401
+ result = Rust::DataFrame.new(self.column_names)
402
+ partials.each do |partial|
403
+ aggregated_row = {}
404
+ aggregated_row[by] = partial.column(by)[0]
405
+ (self.column_names - [by]).each do |column|
406
+ if aggregators[column]
407
+ aggregated_row[column] = aggregators[column].call(partial.column(column))
408
+ else
409
+ aggregated_row[column] = yield partial.column(column)
410
+ end
411
+ end
412
+
413
+ result << aggregated_row
414
+ end
415
+
416
+ return result
417
+ end
418
+
419
+ def sort_by(column)
420
+ result = self.clone
421
+ result.sort_by!(column)
422
+ return result
423
+ end
424
+
425
+ def sort_by!(by)
426
+ copy = @data[by].clone
427
+ copy.sort!
428
+
429
+ indices = []
430
+ @data[by].each_with_index do |value, i|
431
+ index = copy.index(value)
432
+ indices << index
433
+
434
+ copy[index] = NilClass
435
+ end
436
+
437
+ (self.column_names - [by]).each do |column_name|
438
+ sorted = []
439
+ column = self.column(column_name)
440
+ column_i = 0
441
+ indices.each do |i|
442
+ sorted[i] = column[column_i]
443
+ column_i += 1
444
+ end
445
+ @data[column_name] = sorted
446
+ end
447
+ @data[by].sort!
448
+ end
449
+
400
450
  def bind_rows!(dataframe)
401
451
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
402
452
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -441,143 +491,31 @@ module Rust
441
491
  end
442
492
  end
443
493
 
444
- class Matrix < RustDatatype
445
- def self.pull_variable(variable)
446
- return Rust._pull(variable)
447
- end
448
-
449
- def initialize(data)
450
- if data.flatten.size == 0
451
- raise "Empty matrices are not allowed"
452
- else
453
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
454
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
455
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
456
- @data = data.clone
494
+ class DataFrameArray < Array
495
+ def bind_all
496
+ return nil if self.size == 0
497
+
498
+ result = self.first.clone
499
+
500
+ for i in 1...self.size
501
+ result .bind_rows!(self[i])
457
502
  end
458
- end
459
-
460
- def [](i, j)
461
- return @data[i][j]
462
- end
463
-
464
- def rows
465
- @data.size
466
- end
467
-
468
- def cols
469
- @data[0].size
470
- end
471
-
472
- def []=(i, j, value)
473
- raise "Wrong i" unless i.between?(0, @data.size - 1)
474
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
475
- @data[i][j] = value
476
- end
477
-
478
- def load_in_r_as(variable_name)
479
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
503
+
504
+ return result
480
505
  end
481
506
  end
482
507
 
483
- class Sequence
484
- attr_reader :min
485
- attr_reader :max
486
-
487
- def initialize(min, max, step=1)
488
- @min = min
489
- @max = max
490
- @step = step
491
- end
492
-
493
- def step(step)
494
- @step = step
495
- end
496
-
497
- def each
498
- (@min..@max).step(@step) do |v|
499
- yield v
500
- end
501
- end
502
-
503
- def to_a
504
- result = []
505
- self.each do |v|
506
- result << v
508
+ class DataFrameHash < Hash
509
+ def bind_all
510
+ return nil if self.values.size == 0
511
+
512
+ result = self.values.first.clone
513
+
514
+ for i in 1...self.values.size
515
+ result .bind_rows!(self.values[i])
507
516
  end
517
+
508
518
  return result
509
519
  end
510
-
511
- def to_R
512
- "seq(from=#@min, to=#@max, by=#@step)"
513
- end
514
- end
515
- end
516
-
517
- class TrueClass
518
- def to_R
519
- "TRUE"
520
- end
521
- end
522
-
523
- class FalseClass
524
- def to_R
525
- "FALSE"
526
- end
527
- end
528
-
529
- class Object
530
- def to_R
531
- raise TypeError, "Unsupported type for #{self.class}"
532
- end
533
- end
534
-
535
- class NilClass
536
- def to_R
537
- return "NULL"
538
- end
539
- end
540
-
541
- class Numeric
542
- def to_R
543
- self.inspect
544
- end
545
- end
546
-
547
- class Float
548
- def to_R
549
- return self.nan? ? "NA" : super
550
- end
551
- end
552
-
553
- class Array
554
- def to_R
555
- return "c(#{self.map { |e| e.to_R }.join(",")})"
556
- end
557
- end
558
-
559
- class String
560
- def to_R
561
- return self.inspect
562
- end
563
- end
564
-
565
- class Range
566
- def to_R
567
- [range.min, range.max].to_R
568
- end
569
- end
570
-
571
- module Rust::RBindings
572
- def read_csv(filename, **options)
573
- Rust::CSV.read(filename, **options)
574
- end
575
-
576
- def write_csv(filename, dataframe, **options)
577
- Rust::CSV.write(filename, dataframe, **options)
578
- end
579
-
580
- def data_frame(*args)
581
- Rust::DataFrame.new(*args)
582
520
  end
583
521
  end
@@ -0,0 +1,161 @@
1
+ require_relative '../rust'
2
+
3
+ module Rust
4
+ class RustDatatype
5
+ def self.pull_variable(variable, forced_interpreter = nil)
6
+ r_type = Rust._pull("as.character(typeof(#{variable}))")
7
+ r_class = Rust._pull("as.character(class(#{variable}))")
8
+
9
+ if forced_interpreter
10
+ raise ArgumentError, "Expected null or class as forced_interpreter" if forced_interpreter && !forced_interpreter.is_a?(Class)
11
+ raise ArgumentError, "Class #{forced_interpreter} can not handle type #{r_type}, class #{r_class}" unless forced_interpreter.can_pull?(r_type, r_class)
12
+
13
+ return forced_interpreter.pull_variable(variable, r_type, r_class)
14
+ end
15
+
16
+ candidates = []
17
+ ObjectSpace.each_object(Class) do |type|
18
+ if type < RustDatatype
19
+ if type.can_pull?(r_type, r_class)
20
+ candidates << type
21
+ end
22
+ end
23
+ end
24
+
25
+ if candidates.size > 0
26
+ type = candidates.max_by { |c| c.pull_priority }
27
+
28
+ puts "Using #{type} to pull #{variable}" if Rust.debug?
29
+ return type.pull_variable(variable, r_type, r_class)
30
+ else
31
+ if Rust._pull("length(#{variable})") == 0
32
+ return []
33
+ else
34
+ return Rust._pull(variable)
35
+ end
36
+ end
37
+ end
38
+
39
+ def self.pull_priority
40
+ 0
41
+ end
42
+
43
+ def load_in_r_as(variable_name)
44
+ raise "Loading #{self.class} in R was not implemented"
45
+ end
46
+
47
+ def r_mirror_to(other_variable)
48
+ varname = self.mirrored_R_variable_name
49
+
50
+ Rust._eval("#{varname} = #{other_variable}")
51
+ Rust["#{varname}.hash"] = self.r_hash
52
+
53
+ return varname
54
+ end
55
+
56
+ def r_mirror
57
+ varname = self.mirrored_R_variable_name
58
+
59
+ if !Rust._pull("exists(\"#{varname}\")") || Rust._pull("#{varname}.hash") != self.r_hash
60
+ puts "Loading #{varname}" if Rust.debug?
61
+ Rust[varname] = self
62
+ Rust["#{varname}.hash"] = self.r_hash
63
+ else
64
+ puts "Using cached value for #{varname}" if Rust.debug?
65
+ end
66
+
67
+ return varname
68
+ end
69
+
70
+ def r_hash
71
+ self.hash.to_s
72
+ end
73
+
74
+ private
75
+ def mirrored_R_variable_name
76
+ return "rust.mirrored.#{self.object_id}"
77
+ end
78
+ end
79
+
80
+ class Null < RustDatatype
81
+ def self.can_pull?(type, klass)
82
+ return type == "NULL" && klass == "NULL"
83
+ end
84
+
85
+ def self.pull_variable(variable, type, klass)
86
+ return nil
87
+ end
88
+ end
89
+ end
90
+
91
+ class TrueClass
92
+ def to_R
93
+ "TRUE"
94
+ end
95
+ end
96
+
97
+ class FalseClass
98
+ def to_R
99
+ "FALSE"
100
+ end
101
+ end
102
+
103
+ class Object
104
+ def to_R
105
+ raise TypeError, "Unsupported type for #{self.class}"
106
+ end
107
+ end
108
+
109
+ class NilClass
110
+ def to_R
111
+ return "NULL"
112
+ end
113
+
114
+ def load_in_r_as(variable)
115
+ Rust._eval("#{variable} <- NULL")
116
+ end
117
+ end
118
+
119
+ class Numeric
120
+ def to_R
121
+ self.inspect
122
+ end
123
+ end
124
+
125
+ class Float
126
+ def to_R
127
+ return self.nan? ? "NA" : super
128
+ end
129
+ end
130
+
131
+ class Symbol
132
+ def to_R
133
+ return self.to_s.inspect
134
+ end
135
+ end
136
+
137
+ class Array
138
+ def to_R
139
+ return "c(#{self.map { |e| e.to_R }.join(",")})"
140
+ end
141
+
142
+ def distribution
143
+ result = {}
144
+ self.each do |value|
145
+ result[value] = result[value].to_i + 1
146
+ end
147
+ return result
148
+ end
149
+ end
150
+
151
+ class String
152
+ def to_R
153
+ return self.inspect
154
+ end
155
+ end
156
+
157
+ class Range
158
+ def to_R
159
+ [range.min, range.max].to_R
160
+ end
161
+ end