rust 0.4 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,127 +1,34 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
4
 
10
- R_ENGINE = RinRuby.new(echo: false)
5
+ ##
6
+ # Mirror of the data-frame type in R.
11
7
 
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
8
+ class DataFrame < RustDatatype
9
+ def self.can_pull?(type, klass)
10
+ return [klass].flatten.include?("data.frame")
40
11
  end
41
12
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
13
+ def self.pull_priority
14
+ 1
108
15
  end
109
16
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
17
+ def self.pull_variable(variable, type, klass)
117
18
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
19
+ colnames = Rust["colnames(#{variable})"]
119
20
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
21
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
22
  end
122
23
  return DataFrame.new(hash)
123
24
  end
124
25
 
26
+ ##
27
+ # Creates a new data-frame.
28
+ # +labels_or_data+ can be either:
29
+ # - an Array of column names (creates an empty data-frame)
30
+ # - a Hash with column names as keys and values as values
31
+
125
32
  def initialize(labels_or_data)
126
33
  @data = {}
127
34
 
@@ -130,10 +37,16 @@ module Rust
130
37
  @labels.each { |label| @data[label] = [] }
131
38
  elsif labels_or_data.is_a? Hash
132
39
  @labels = labels_or_data.keys.map { |l| l.to_s }
133
- @data = labels_or_data.clone
40
+
41
+ labels_or_data.each do |key, value|
42
+ @data[key.to_s] = value.clone
43
+ end
134
44
  end
135
45
  end
136
46
 
47
+ ##
48
+ # Returns the +i+-th row of the data-frame
49
+
137
50
  def row(i)
138
51
  if i < 0 || i >= self.rows
139
52
  return nil
@@ -142,6 +55,20 @@ module Rust
142
55
  end
143
56
  end
144
57
 
58
+ ##
59
+ # Returns the +i+-th row of the data-frame as an Array of values in column order (nil if +i+ is out of range). Faster (but harder to interpret) alternative to #row.
60
+
61
+ def fast_row(i)
62
+ if i < 0 || i >= self.rows
63
+ return nil
64
+ else
65
+ return @labels.map { |label| @data[label][i] }
66
+ end
67
+ end
68
+
69
+ ##
70
+ # Shuffles the rows in the data-frame. The arguments are passed to the Array#shuffle method.
71
+
145
72
  def shuffle(*args)
146
73
  result = DataFrame.new(@labels)
147
74
 
@@ -156,6 +83,10 @@ module Rust
156
83
  return result
157
84
  end
158
85
 
86
+ ##
87
+ # Returns a copy of the data-frame containing only the specified +rows+ and/or +cols+. If either +rows+ or +cols+
88
+ # is nil, all the rows/columns are returned. Raises an error if both +rows+ and +cols+ are nil.
89
+
159
90
  def [](rows, cols=nil)
160
91
  raise "You must specify either rows or columns to select" if !rows && !cols
161
92
  result = self
@@ -171,9 +102,16 @@ module Rust
171
102
  return result
172
103
  end
173
104
 
105
+ ##
106
+ # Return the column named +name+.
107
+
174
108
  def column(name)
175
109
  return @data[name]
176
110
  end
111
+ alias :| :column
112
+
113
+ ##
114
+ # Renames the column named +old_name+ to +new_name+.
177
115
 
178
116
  def rename_column!(old_name, new_name)
179
117
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
@@ -183,10 +121,24 @@ module Rust
183
121
  @labels[@labels.index(old_name)] = new_name
184
122
  end
185
123
 
124
+ ##
125
+ # Functionally transforms the column named +column+ by applying the function given as a block.
126
+ # Example:
127
+ # df = Rust::DataFrame.new({a: [1,2,3], b: [3,4,5]})
128
+ # df.transform_column!("a") { |v| v + 1 }
129
+ # df|"a" # => [2, 3, 4]
130
+
186
131
  def transform_column!(column)
187
132
  @data[column].map! { |e| yield e }
188
133
  end
189
134
 
135
+ ##
136
+ # Returns a copy of the data-frame with only the rows for which the function given in the block returns true.
137
+ # Example:
138
+ # df = Rust::DataFrame.new({a: [1,2,3], b: ['a','b','c']})
139
+ # df2 = df.select_rows { |r| r['a'].even? }
140
+ # df2|"b" # => ['b']
141
+
190
142
  def select_rows
191
143
  result = DataFrame.new(self.column_names)
192
144
  self.each_with_index do |row, i|
@@ -195,6 +147,20 @@ module Rust
195
147
  return result
196
148
  end
197
149
 
150
+ ##
151
+ # Returns true if the function given in the block returns true for any of the rows in this data-frame.
152
+
153
+ def has_row?
154
+ self.each_with_index do |row, i|
155
+ return true if yield row, i
156
+ end
157
+ return false
158
+ end
159
+
160
+ ##
161
+ # Returns a copy of the data-frame with only the columns in +cols+. As an alternative, a block can be used
162
+ # (only the columns for which the function returns true are kept).
163
+
198
164
  def select_columns(cols=nil)
199
165
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
200
166
 
@@ -210,24 +176,84 @@ module Rust
210
176
  end
211
177
  alias :select_cols :select_columns
212
178
 
179
+ ##
180
+ # Deletes the column named +column+.
181
+
213
182
  def delete_column(column)
214
183
  @labels.delete(column)
215
184
  @data.delete(column)
216
185
  end
217
186
 
187
+ ##
188
+ # Deletes the +i+-th row.
189
+
190
+ def delete_row(i)
191
+ @data.each do |label, column|
192
+ column.delete_at(i)
193
+ end
194
+ end
195
+
196
+ ##
197
+ # Returns a data-frame in which the rows are unique in terms of all the given columns named +by+.
198
+
199
+ def uniq_by(by)
200
+ result = self.clone
201
+ result.uniq_by!(by)
202
+ return result
203
+ end
204
+
205
+ ##
206
+ # Makes sure that in this data-frame the rows are unique in terms of all the given columns named +by+.
207
+
208
+ def uniq_by!(by)
209
+ my_keys = {}
210
+ to_delete = []
211
+ self.each_with_index do |row, i|
212
+ key = []
213
+ by.each do |colname|
214
+ key << row[colname]
215
+ end
216
+ unless my_keys[key]
217
+ my_keys[key] = i
218
+ else
219
+ to_delete << (i-to_delete.size)
220
+ end
221
+ end
222
+
223
+ to_delete.each do |i|
224
+ self.delete_row(i)
225
+ end
226
+
227
+ return self
228
+ end
229
+
230
+ ##
231
+ # Return the names of the columns.
232
+
218
233
  def column_names
219
234
  return @labels.map { |k| k.to_s }
220
235
  end
221
236
  alias :colnames :column_names
222
237
 
238
+ ##
239
+ # Returns the number of rows.
240
+
223
241
  def rows
224
242
  @data.values[0].size
225
243
  end
226
244
 
245
+ ##
246
+ # Returns the number of columns
247
+
227
248
  def columns
228
249
  @labels.size
229
250
  end
230
251
 
252
+ ##
253
+ # Adds the given +row+ to the data-frame. +row+ can be either:
254
+ # - An Array of values for all the columns (in the order of #column_names);
255
+ # - A Hash containing associations between column names and the values to be set.
256
+
231
257
  def add_row(row)
232
258
  if row.is_a?(Array)
233
259
  raise "Expected an array of size #{@data.size}" unless row.size == @data.size
@@ -243,7 +269,7 @@ module Rust
243
269
  row.each do |key, value|
244
270
  @data[key.to_s] << value
245
271
  end
246
- #
272
+
247
273
  return true
248
274
  else
249
275
  raise TypeError, "Expected an Array or a Hash"
@@ -251,6 +277,11 @@ module Rust
251
277
  end
252
278
  alias :<< :add_row
253
279
 
280
+ ##
281
+ # Adds a column named +name+ with the given +values+ (array). The size of +values+ must match the number of
282
+ # rows of this data-frame. As an alternative, it can be passed a block which returns, for a given row, the
283
+ # value to assign for the new column.
284
+
254
285
  def add_column(name, values=nil)
255
286
  raise "Column already exists" if @labels.include?(name)
256
287
  raise "Values or block required" if !values && !block_given?
@@ -267,6 +298,9 @@ module Rust
267
298
  end
268
299
  end
269
300
 
301
+ ##
302
+ # Yields each row as a Hash containing column names as keys and values as values.
303
+
270
304
  def each
271
305
  self.each_with_index do |element, i|
272
306
  yield element
@@ -275,6 +309,21 @@ module Rust
275
309
  return self
276
310
  end
277
311
 
312
+ ##
313
+ # Yields each row as an Array of values (in the order of the column labels). Faster alternative to
314
+ # #each, which yields each row as a Hash.
315
+
316
+ def fast_each
317
+ self.fast_each_with_index do |element, i|
318
+ yield element
319
+ end
320
+
321
+ return self
322
+ end
323
+
324
+ ##
325
+ # Yields each row as a Hash containing column names as keys and values as values and the row index.
326
+
278
327
  def each_with_index
279
328
  for i in 0...self.rows
280
329
  element = {}
@@ -288,6 +337,23 @@ module Rust
288
337
  return self
289
338
  end
290
339
 
340
+ ##
341
+ # Yields each row as an Array of values (in the order of the column labels) and the row index. Faster
342
+ # alternative to #each_with_index, which yields each row as a Hash.
343
+
344
+ def fast_each_with_index
345
+ for i in 0...self.rows
346
+ element = []
347
+ @labels.each do |label|
348
+ element << @data[label][i]
349
+ end
350
+
351
+ yield element, i
352
+ end
353
+
354
+ return self
355
+ end
356
+
291
357
  def load_in_r_as(variable_name)
292
358
  command = []
293
359
 
@@ -299,6 +365,14 @@ module Rust
299
365
  row_index += 1
300
366
  end
301
367
 
368
+ self.column_names.each do |name|
369
+ column = self.column(name)
370
+
371
+ if column.is_a?(Factor)
372
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
373
+ end
374
+ end
375
+
302
376
  Rust._eval_big(command)
303
377
  end
304
378
 
@@ -323,6 +397,9 @@ module Rust
323
397
  return result
324
398
  end
325
399
 
400
+ ##
401
+ # Returns a copy of the data-frame containing only the first +n+ rows.
402
+
326
403
  def head(n=10)
327
404
  result = DataFrame.new(self.column_names)
328
405
  self.each_with_index do |row, i|
@@ -331,6 +408,11 @@ module Rust
331
408
  return result
332
409
  end
333
410
 
411
+ ##
412
+ # Merges this data-frame with +other+ in terms of the +by+ columns (Array of String).
413
+ # +first_alias+ and +second_alias+ specify the prefixes that should be used for the columns not in +by+
414
+ # for this and the +other+ data-frame, respectively.
415
+
334
416
  def merge(other, by, first_alias = "x", second_alias = "y")
335
417
  raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
336
418
  raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
@@ -397,6 +479,94 @@ module Rust
397
479
  return result
398
480
  end
399
481
 
482
+ ##
483
+ # Aggregates the values in groups depending on the +by+ column (String).
484
+ # A block must be passed to specify how to aggregate the columns. Aggregators for specific columns can be
485
+ # specified as optional arguments in which the name of the argument represents the column name and the value
486
+ # contains a block for aggregating the specific column.
487
+ # Both the default and the specialized blocks must take as argument an array of values and must return a
488
+ # scalar value.
489
+
490
+ def aggregate(by, **aggregators)
491
+ raise TypeError, "Expected a string" unless by.is_a?(String)
492
+ raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
493
+ raise "Expected a block for default aggregator" unless block_given?
494
+
495
+ aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h
496
+
497
+ sorted = self.sort_by(by)
498
+
499
+ current_value = nil
500
+ partials = []
501
+ partial = nil
502
+ sorted.column(by).each_with_index do |value, index|
503
+ if current_value != value
504
+ current_value = value
505
+ partials << partial if partial
506
+ partial = Rust::DataFrame.new(self.column_names)
507
+ end
508
+ partial << sorted.fast_row(index)
509
+ end
510
+ partials << partial
511
+
512
+ result = Rust::DataFrame.new(self.column_names)
513
+ partials.each do |partial|
514
+ aggregated_row = {}
515
+ aggregated_row[by] = partial.column(by)[0]
516
+ (self.column_names - [by]).each do |column|
517
+ if aggregators[column]
518
+ aggregated_row[column] = aggregators[column].call(partial.column(column))
519
+ else
520
+ aggregated_row[column] = yield partial.column(column)
521
+ end
522
+ end
523
+
524
+ result << aggregated_row
525
+ end
526
+
527
+ return result
528
+ end
529
+
530
+ ##
531
+ # Returns a copy of this data-frame in which the rows are sorted by the values of the column named +column+.
532
+
533
+ def sort_by(column)
534
+ result = self.clone
535
+ result.sort_by!(column)
536
+ return result
537
+ end
538
+
539
+ ##
540
+ # Sorts the rows of this data-frame by the values of the +by+ column.
541
+
542
+ def sort_by!(by)
543
+ copy = @data[by].clone
544
+ copy.sort!
545
+
546
+ indices = []
547
+ @data[by].each_with_index do |value, i|
548
+ index = copy.index(value)
549
+ indices << index
550
+
551
+ copy[index] = NilClass
552
+ end
553
+
554
+ (self.column_names - [by]).each do |column_name|
555
+ sorted = []
556
+ column = self.column(column_name)
557
+ column_i = 0
558
+ indices.each do |i|
559
+ sorted[i] = column[column_i]
560
+ column_i += 1
561
+ end
562
+ @data[column_name] = sorted
563
+ end
564
+ @data[by].sort!
565
+ end
566
+
567
+ ##
568
+ # Adds all the rows in +dataframe+ to this data-frame. The column names must match.
569
+
400
570
  def bind_rows!(dataframe)
401
571
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
402
572
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -409,6 +579,9 @@ module Rust
409
579
  end
410
580
  alias :rbind! :bind_rows!
411
581
 
582
+ ##
583
+ # Adds all the columns in +dataframe+ to this data-frame. The number of rows must match.
584
+
412
585
  def bind_columns!(dataframe)
413
586
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
414
587
  raise "The number of rows are not compatible" if self.rows != dataframe.rows
@@ -422,6 +595,9 @@ module Rust
422
595
  end
423
596
  alias :cbind! :bind_columns!
424
597
 
598
+ ##
599
+ # Returns a copy of this dataframe and adds all the rows in +dataframe+ to it. The column names must match.
600
+
425
601
  def bind_rows(dataframe)
426
602
  result = self.clone
427
603
  result.bind_rows!(dataframe)
@@ -429,6 +605,9 @@ module Rust
429
605
  end
430
606
  alias :rbind :bind_rows
431
607
 
608
+ ##
609
+ # Returns a copy of this dataframe and adds all the columns in +dataframe+ to it. The number of rows must match.
610
+
432
611
  def bind_columns(dataframe)
433
612
  result = self.clone
434
613
  result.bind_columns!(dataframe)
@@ -436,152 +615,53 @@ module Rust
436
615
  end
437
616
  alias :cbind :bind_columns
438
617
 
618
+ ##
619
+ # Returns a copy of this data-frame.
620
+
439
621
  def clone
440
622
  DataFrame.new(@data)
441
623
  end
442
624
  end
443
625
 
444
- class Matrix < RustDatatype
445
- def self.pull_variable(variable)
446
- return Rust._pull(variable)
447
- end
448
-
449
- def initialize(data)
450
- if data.flatten.size == 0
451
- raise "Empty matrices are not allowed"
452
- else
453
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
454
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
455
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
456
- @data = data.clone
457
- end
458
- end
459
-
460
- def [](i, j)
461
- return @data[i][j]
462
- end
463
-
464
- def rows
465
- @data.size
466
- end
467
-
468
- def cols
469
- @data[0].size
470
- end
626
+ ##
627
+ # Represents an array of DataFrame
628
+
629
+ class DataFrameArray < Array
471
630
 
472
- def []=(i, j, value)
473
- raise "Wrong i" unless i.between?(0, @data.size - 1)
474
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
475
- @data[i][j] = value
476
- end
631
+ ##
632
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
477
633
 
478
- def load_in_r_as(variable_name)
479
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
634
+ def bind_all
635
+ return nil if self.size == 0
636
+
637
+ result = self.first.clone
638
+
639
+ for i in 1...self.size
640
+ result .bind_rows!(self[i])
641
+ end
642
+
643
+ return result
480
644
  end
481
645
  end
482
646
 
483
- class Sequence
484
- attr_reader :min
485
- attr_reader :max
486
-
487
- def initialize(min, max, step=1)
488
- @min = min
489
- @max = max
490
- @step = step
491
- end
492
-
493
- def step(step)
494
- @step = step
495
- end
647
+ ##
648
+ # Represents a hash of DataFrame
649
+
650
+ class DataFrameHash < Hash
496
651
 
497
- def each
498
- (@min..@max).step(@step) do |v|
499
- yield v
500
- end
501
- end
652
+ ##
653
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
502
654
 
503
- def to_a
504
- result = []
505
- self.each do |v|
506
- result << v
655
+ def bind_all
656
+ return nil if self.values.size == 0
657
+
658
+ result = self.values.first.clone
659
+
660
+ for i in 1...self.values.size
661
+ result .bind_rows!(self.values[i])
507
662
  end
663
+
508
664
  return result
509
665
  end
510
-
511
- def to_R
512
- "seq(from=#@min, to=#@max, by=#@step)"
513
- end
514
- end
515
- end
516
-
517
- class TrueClass
518
- def to_R
519
- "TRUE"
520
- end
521
- end
522
-
523
- class FalseClass
524
- def to_R
525
- "FALSE"
526
- end
527
- end
528
-
529
- class Object
530
- def to_R
531
- raise TypeError, "Unsupported type for #{self.class}"
532
666
  end
533
667
  end
534
-
535
- class NilClass
536
- def to_R
537
- return "NULL"
538
- end
539
- end
540
-
541
- class Numeric
542
- def to_R
543
- self.inspect
544
- end
545
- end
546
-
547
- class Float
548
- def to_R
549
- return self.nan? ? "NA" : super
550
- end
551
- end
552
-
553
- class Array
554
- def to_R
555
- return "c(#{self.map { |e| e.to_R }.join(",")})"
556
- end
557
- end
558
-
559
- class String
560
- def to_R
561
- return self.inspect
562
- end
563
- end
564
-
565
- class Range
566
- def to_R
567
- [range.min, range.max].to_R
568
- end
569
- end
570
-
571
- module Rust::RBindings
572
- def read_csv(filename, **options)
573
- Rust::CSV.read(filename, **options)
574
- end
575
-
576
- def write_csv(filename, dataframe, **options)
577
- Rust::CSV.write(filename, dataframe, **options)
578
- end
579
-
580
- def data_frame(*args)
581
- Rust::DataFrame.new(*args)
582
- end
583
- end
584
-
585
- def bind_r!
586
- include Rust::RBindings
587
- end