rust 0.3 → 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,123 +1,20 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
-
10
- R_ENGINE = RinRuby.new(echo: false)
11
-
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
4
+ class DataFrame < RustDatatype
5
+ def self.can_pull?(type, klass)
6
+ return [klass].flatten.include?("data.frame")
40
7
  end
41
8
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
9
+ def self.pull_priority
10
+ 1
108
11
  end
109
12
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
13
+ def self.pull_variable(variable, type, klass)
117
14
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
15
+ colnames = Rust["colnames(#{variable})"]
119
16
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
17
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
18
  end
122
19
  return DataFrame.new(hash)
123
20
  end
@@ -130,7 +27,10 @@ module Rust
130
27
  @labels.each { |label| @data[label] = [] }
131
28
  elsif labels_or_data.is_a? Hash
132
29
  @labels = labels_or_data.keys.map { |l| l.to_s }
133
- @data = labels_or_data.clone
30
+
31
+ labels_or_data.each do |key, value|
32
+ @data[key.to_s] = value.clone
33
+ end
134
34
  end
135
35
  end
136
36
 
@@ -142,6 +42,14 @@ module Rust
142
42
  end
143
43
  end
144
44
 
45
+ def fast_row(i) # Returns row i as a plain Array of cell values ordered like @labels (no label keys).
46
+ if i < 0 || i >= self.rows # out-of-range (including negative) indices yield nil instead of wrapping around
47
+ return nil
48
+ else
49
+ return @labels.map { |label| @data[label][i] }
50
+ end
51
+ end
52
+
145
53
  def shuffle(*args)
146
54
  result = DataFrame.new(@labels)
147
55
 
@@ -174,6 +82,7 @@ module Rust
174
82
  def column(name)
175
83
  return @data[name]
176
84
  end
85
+ alias :| :column
177
86
 
178
87
  def rename_column!(old_name, new_name)
179
88
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
@@ -195,6 +104,13 @@ module Rust
195
104
  return result
196
105
  end
197
106
 
107
+ def has_row? # Returns true as soon as the given block is truthy for some (row, index) pair, false otherwise.
108
+ self.each_with_index do |row, i|
109
+ return true if yield row, i # the yield is unconditional, so a block must be supplied
110
+ end
111
+ return false
112
+ end
113
+
198
114
  def select_columns(cols=nil)
199
115
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
200
116
 
@@ -215,6 +131,40 @@ module Rust
215
131
  @data.delete(column)
216
132
  end
217
133
 
134
+ def delete_row(i) # Removes the element at index i from every column array stored in @data, in place.
135
+ @data.each do |label, column|
136
+ column.delete_at(i)
137
+ end
138
+ end
139
+
140
+ def uniq_by(by) # Non-destructive counterpart of uniq_by!: applies it to a clone and returns that clone.
141
+ result = self.clone # NOTE(review): assumes clone does not share column arrays with the receiver - confirm
142
+ result.uniq_by!(by)
143
+ return result
144
+ end
145
+
146
+ def uniq_by!(by) # Deletes, in place, every row whose values in the `by` columns repeat those of an earlier row; returns self.
147
+ my_keys = {} # key (Array of the `by` column values) => index of the first row carrying it
148
+ to_delete = []
149
+ self.each_with_index do |row, i|
150
+ key = []
151
+ by.each do |colname| # `by` is iterated, so it must be a collection of column names
152
+ key << row[colname]
153
+ end
154
+ unless my_keys[key]
155
+ my_keys[key] = i
156
+ else
157
+ to_delete << (i-to_delete.size) # pre-shift the index: every earlier deletion moves later rows up by one
158
+ end
159
+ end
160
+
161
+ to_delete.each do |i| # indices were already adjusted above, so deleting in order is safe
162
+ self.delete_row(i)
163
+ end
164
+
165
+ return self
166
+ end
167
+
218
168
  def column_names
219
169
  return @labels.map { |k| k.to_s }
220
170
  end
@@ -243,7 +193,7 @@ module Rust
243
193
  row.each do |key, value|
244
194
  @data[key.to_s] << value
245
195
  end
246
- #
196
+
247
197
  return true
248
198
  else
249
199
  raise TypeError, "Expected an Array or a Hash"
@@ -275,6 +225,14 @@ module Rust
275
225
  return self
276
226
  end
277
227
 
228
+ def fast_each # Yields every row as the Array built by fast_each_with_index, dropping the index; returns self.
229
+ self.fast_each_with_index do |element, i|
230
+ yield element
231
+ end
232
+
233
+ return self
234
+ end
235
+
278
236
  def each_with_index
279
237
  for i in 0...self.rows
280
238
  element = {}
@@ -288,6 +246,19 @@ module Rust
288
246
  return self
289
247
  end
290
248
 
249
+ def fast_each_with_index # Yields (row, index) for each row index in 0...rows; returns self.
250
+ for i in 0...self.rows
251
+ element = [] # the row is a positional Array in @labels order, not a label-keyed Hash
252
+ @labels.each do |label|
253
+ element << @data[label][i]
254
+ end
255
+
256
+ yield element, i
257
+ end
258
+
259
+ return self
260
+ end
261
+
291
262
  def load_in_r_as(variable_name)
292
263
  command = []
293
264
 
@@ -299,6 +270,14 @@ module Rust
299
270
  row_index += 1
300
271
  end
301
272
 
273
+ self.column_names.each do |name|
274
+ column = self.column(name)
275
+
276
+ if column.is_a?(Factor)
277
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
278
+ end
279
+ end
280
+
302
281
  Rust._eval_big(command)
303
282
  end
304
283
 
@@ -397,6 +376,77 @@ module Rust
397
376
  return result
398
377
  end
399
378
 
379
# Groups the rows by the values of column `by` and collapses every group
# into a single row.
#
# by          - String label of the grouping column.
# aggregators - optional per-column Procs (keyword name = column label);
#               each receives the Array of that column's values in a group.
# block       - default aggregator, used for every column without a Proc.
#
# Returns a new Rust::DataFrame with the same columns and one row per
# distinct `by` value, in sorted key order. An empty frame yields an empty
# result.
# Raises TypeError when `by` is not a String or an aggregator is not a Proc,
# and RuntimeError when no block is given.
def aggregate(by, **aggregators)
  raise TypeError, "Expected a string" unless by.is_a?(String)
  raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
  raise "Expected a block for default aggregator" unless block_given?

  aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h

  # Sorting makes equal keys adjacent, so one linear pass can split groups.
  sorted = self.sort_by(by)

  groups = []
  current_value = nil
  sorted.column(by).each_with_index do |value, index|
    # `groups.empty?` opens the first group even when the first key is nil,
    # and keeps `groups` free of nil entries when there are no rows at all.
    if groups.empty? || current_value != value
      current_value = value
      groups << Rust::DataFrame.new(self.column_names)
    end
    groups.last << sorted.fast_row(index)
  end

  result = Rust::DataFrame.new(self.column_names)
  groups.each do |group|
    aggregated_row = { by => group.column(by)[0] }
    (self.column_names - [by]).each do |column|
      values = group.column(column)
      aggregated_row[column] = aggregators[column] ? aggregators[column].call(values) : yield(values)
    end

    result << aggregated_row
  end

  return result
end
418
+
419
+ def sort_by(column) # Non-destructive counterpart of sort_by!: sorts a clone by `column` and returns that clone.
420
+ result = self.clone # NOTE(review): assumes clone does not share column arrays with the receiver - confirm
421
+ result.sort_by!(column)
422
+ return result
423
+ end
424
+
425
# Sorts the rows of this DataFrame in place by the values of column `by`.
#
# One stable row permutation is computed from the key column (rows with
# equal keys keep their relative order) and applied to every column. This
# replaces a linear search per row with a single O(n log n) sort and needs
# no in-band sentinel value.
#
# by - label of the column to sort on (a key of @data).
#
# Returns the sorted key column.
# Raises RuntimeError when the column does not exist, and ArgumentError when
# the key values cannot be compared with each other.
def sort_by!(by)
  keys = @data[by]
  raise "This DataFrame does not contain a column named #{by}" unless keys

  # Pairing each key with its index breaks ties by original position.
  order = keys.each_index.sort_by { |i| [keys[i], i] }

  (column_names | [by]).each do |name|
    values = @data[name]
    @data[name] = order.map { |i| values[i] }
  end

  @data[by]
end
449
+
400
450
  def bind_rows!(dataframe)
401
451
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
402
452
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -441,143 +491,31 @@ module Rust
441
491
  end
442
492
  end
443
493
 
444
- class Matrix < RustDatatype
445
- def self.pull_variable(variable)
446
- return Rust._pull(variable)
447
- end
448
-
449
- def initialize(data)
450
- if data.flatten.size == 0
451
- raise "Empty matrices are not allowed"
452
- else
453
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
454
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
455
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
456
- @data = data.clone
494
+ class DataFrameArray < Array
495
+ def bind_all
496
+ return nil if self.size == 0
497
+
498
+ result = self.first.clone
499
+
500
+ for i in 1...self.size
501
+ result .bind_rows!(self[i])
457
502
  end
458
- end
459
-
460
- def [](i, j)
461
- return @data[i][j]
462
- end
463
-
464
- def rows
465
- @data.size
466
- end
467
-
468
- def cols
469
- @data[0].size
470
- end
471
-
472
- def []=(i, j, value)
473
- raise "Wrong i" unless i.between?(0, @data.size - 1)
474
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
475
- @data[i][j] = value
476
- end
477
-
478
- def load_in_r_as(variable_name)
479
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
503
+
504
+ return result
480
505
  end
481
506
  end
482
507
 
483
- class Sequence
484
- attr_reader :min
485
- attr_reader :max
486
-
487
- def initialize(min, max, step=1)
488
- @min = min
489
- @max = max
490
- @step = step
491
- end
492
-
493
- def step(step)
494
- @step = step
495
- end
496
-
497
- def each
498
- (@min..@max).step(@step) do |v|
499
- yield v
500
- end
501
- end
502
-
503
- def to_a
504
- result = []
505
- self.each do |v|
506
- result << v
508
+ class DataFrameHash < Hash
509
+ def bind_all
510
+ return nil if self.values.size == 0
511
+
512
+ result = self.values.first.clone
513
+
514
+ for i in 1...self.values.size
515
+ result .bind_rows!(self.values[i])
507
516
  end
517
+
508
518
  return result
509
519
  end
510
-
511
- def to_R
512
- "seq(from=#@min, to=#@max, by=#@step)"
513
- end
514
- end
515
- end
516
-
517
- class TrueClass
518
- def to_R
519
- "TRUE"
520
- end
521
- end
522
-
523
- class FalseClass
524
- def to_R
525
- "FALSE"
526
- end
527
- end
528
-
529
- class Object
530
- def to_R
531
- raise TypeError, "Unsupported type for #{self.class}"
532
- end
533
- end
534
-
535
- class NilClass
536
- def to_R
537
- return "NULL"
538
- end
539
- end
540
-
541
- class Numeric
542
- def to_R
543
- self.inspect
544
- end
545
- end
546
-
547
- class Float
548
- def to_R
549
- return self.nan? ? "NA" : super
550
- end
551
- end
552
-
553
- class Array
554
- def to_R
555
- return "c(#{self.map { |e| e.to_R }.join(",")})"
556
- end
557
- end
558
-
559
- class String
560
- def to_R
561
- return self.inspect
562
- end
563
- end
564
-
565
- class Range
566
- def to_R
567
- [range.min, range.max].to_R
568
- end
569
- end
570
-
571
- module Rust::RBindings
572
- def read_csv(filename, **options)
573
- Rust::CSV.read(filename, **options)
574
- end
575
-
576
- def write_csv(filename, dataframe, **options)
577
- Rust::CSV.write(filename, dataframe, **options)
578
- end
579
-
580
- def data_frame(*args)
581
- Rust::DataFrame.new(*args)
582
520
  end
583
521
  end
@@ -0,0 +1,161 @@
1
+ require_relative '../rust'
2
+
3
+ module Rust
4
+ class RustDatatype # Base class for Ruby-side representations of R values; concrete subclasses are picked through can_pull? and pull_priority.
5
+ def self.pull_variable(variable, forced_interpreter = nil) # Pulls the R expression `variable`, converting it with forced_interpreter when given, otherwise with the best-matching subclass.
6
+ r_type = Rust._pull("as.character(typeof(#{variable}))")
7
+ r_class = Rust._pull("as.character(class(#{variable}))")
8
+
9
+ if forced_interpreter
10
+ raise ArgumentError, "Expected null or class as forced_interpreter" if forced_interpreter && !forced_interpreter.is_a?(Class)
11
+ raise ArgumentError, "Class #{forced_interpreter} can not handle type #{r_type}, class #{r_class}" unless forced_interpreter.can_pull?(r_type, r_class)
12
+
13
+ return forced_interpreter.pull_variable(variable, r_type, r_class)
14
+ end
15
+
16
+ candidates = []
17
+ ObjectSpace.each_object(Class) do |type| # scans every loaded class to discover the available datatypes
18
+ if type < RustDatatype # strict subclasses only; RustDatatype itself is never a candidate
19
+ if type.can_pull?(r_type, r_class)
20
+ candidates << type
21
+ end
22
+ end
23
+ end
24
+
25
+ if candidates.size > 0
26
+ type = candidates.max_by { |c| c.pull_priority } # the highest pull_priority wins when several subclasses accept the value
27
+
28
+ puts "Using #{type} to pull #{variable}" if Rust.debug?
29
+ return type.pull_variable(variable, r_type, r_class)
30
+ else
31
+ if Rust._pull("length(#{variable})") == 0 # no subclass matched: zero-length R values become an empty Array
32
+ return []
33
+ else
34
+ return Rust._pull(variable) # otherwise fall back to the raw value returned by Rust._pull
35
+ end
36
+ end
37
+ end
38
+
39
+ def self.pull_priority # Default priority used when ranking candidate subclasses in pull_variable.
40
+ 0
41
+ end
42
+
43
+ def load_in_r_as(variable_name) # Placeholder that always raises; subclasses are expected to override it.
44
+ raise "Loading #{self.class} in R was not implemented"
45
+ end
46
+
47
+ def r_mirror_to(other_variable) # Assigns other_variable to this object's mirror variable inside R, stores r_hash beside it, and returns the mirror name.
48
+ varname = self.mirrored_R_variable_name
49
+
50
+ Rust._eval("#{varname} = #{other_variable}")
51
+ Rust["#{varname}.hash"] = self.r_hash
52
+
53
+ return varname
54
+ end
55
+
56
+ def r_mirror # Ensures self is loaded in R under its mirror name and returns that name.
57
+ varname = self.mirrored_R_variable_name
58
+
59
+ if !Rust._pull("exists(\"#{varname}\")") || Rust._pull("#{varname}.hash") != self.r_hash # reload when the mirror is missing or its stored hash differs
60
+ puts "Loading #{varname}" if Rust.debug?
61
+ Rust[varname] = self
62
+ Rust["#{varname}.hash"] = self.r_hash
63
+ else
64
+ puts "Using cached value for #{varname}" if Rust.debug?
65
+ end
66
+
67
+ return varname
68
+ end
69
+
70
+ def r_hash # String form of self.hash; r_mirror compares it with the value stored in R.
71
+ self.hash.to_s
72
+ end
73
+
74
+ private
75
+ def mirrored_R_variable_name # R variable name derived from this object's object_id.
76
+ return "rust.mirrored.#{self.object_id}"
77
+ end
78
+ end
79
+
80
+ class Null < RustDatatype # Datatype that maps R's NULL to Ruby's nil.
81
+ def self.can_pull?(type, klass) # True only when both the R type and the R class are the string "NULL".
82
+ return type == "NULL" && klass == "NULL"
83
+ end
84
+
85
+ def self.pull_variable(variable, type, klass) # Always returns nil; none of the arguments is inspected.
86
+ return nil
87
+ end
88
+ end
89
+ end
90
+
91
+ class TrueClass # Core extension: R rendering of Ruby's true.
92
+ def to_R # Returns the R logical literal "TRUE".
93
+ "TRUE"
94
+ end
95
+ end
96
+
97
+ class FalseClass # Core extension: R rendering of Ruby's false.
98
+ def to_R # Returns the R logical literal "FALSE".
99
+ "FALSE"
100
+ end
101
+ end
102
+
103
+ class Object # Core extension: fallback to_R for every class that does not define its own.
104
+ def to_R # Always raises TypeError naming the receiver's class.
105
+ raise TypeError, "Unsupported type for #{self.class}"
106
+ end
107
+ end
108
+
109
+ class NilClass # Core extension: nil is represented as NULL on the R side.
110
+ def to_R # Returns the R literal "NULL".
111
+ return "NULL"
112
+ end
113
+
114
+ def load_in_r_as(variable) # Evaluates "<variable> <- NULL" through Rust._eval.
115
+ Rust._eval("#{variable} <- NULL")
116
+ end
117
+ end
118
+
119
+ class Numeric # Core extension: R rendering of Ruby numbers.
120
+ def to_R # Returns the number's inspect string, used verbatim as the R literal.
121
+ self.inspect
122
+ end
123
+ end
124
+
125
class Float
  # Renders the float as an R literal.
  #
  # NaN becomes "NA". Infinite values become "Inf" / "-Inf", because Ruby's
  # inspect output ("Infinity" / "-Infinity") is not a valid R literal.
  # Every finite value is delegated to the superclass implementation.
  def to_R
    return "NA" if nan?
    return (self > 0 ? "Inf" : "-Inf") if infinite?

    super
  end
end
130
+
131
+ class Symbol # Core extension: symbols are rendered like strings.
132
+ def to_R # Returns the symbol's name as a double-quoted string literal (String#inspect of to_s).
133
+ return self.to_s.inspect
134
+ end
135
+ end
136
+
137
+ class Array # Core extension: R rendering and a frequency helper for arrays.
138
+ def to_R # Returns "c(...)" with each element's to_R joined by commas; an empty array gives "c()".
139
+ return "c(#{self.map { |e| e.to_R }.join(",")})"
140
+ end
141
+
142
+ def distribution # Returns a Hash mapping each distinct element to its number of occurrences.
143
+ result = {}
144
+ self.each do |value|
145
+ result[value] = result[value].to_i + 1 # nil.to_i is 0, so the first occurrence counts as 1
146
+ end
147
+ return result
148
+ end
149
+ end
150
+
151
+ class String # Core extension: R rendering of Ruby strings.
152
+ def to_R # Returns String#inspect, i.e. the text in double quotes with Ruby escaping. NOTE(review): Ruby-only escapes may not be valid in R - confirm.
153
+ return self.inspect
154
+ end
155
+ end
156
+
157
class Range
  # Renders the range as an R vector of its two bounds, e.g.
  # (1..5).to_R => "c(1,5)".
  #
  # The bounds come from Range#min and Range#max and are rendered through
  # Array#to_R. The receiver's own bounds are used; the earlier body read an
  # undefined local named `range` and raised NameError on every call.
  def to_R
    [min, max].to_R
  end
end