rust 0.4 → 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,127 +1,34 @@
1
- require 'code-assertions'
2
- require 'stringio'
3
- require 'rinruby'
4
- require 'csv'
1
+ require_relative 'datatype'
5
2
 
6
3
  module Rust
7
- CLIENT_MUTEX = Mutex.new
8
- R_MUTEX = Mutex.new
9
4
 
10
- R_ENGINE = RinRuby.new(echo: false)
5
+ ##
6
+ # Mirror of the data-frame type in R.
11
7
 
12
- private_constant :R_ENGINE
13
- private_constant :R_MUTEX
14
- private_constant :CLIENT_MUTEX
15
-
16
- @@debugging = false
17
- @@in_client_mutex = false
18
-
19
- def self.debug
20
- @@debugging = true
21
- end
22
-
23
- def self.exclusive
24
- result = nil
25
- CLIENT_MUTEX.synchronize do
26
- @@in_client_mutex = true
27
- result = yield
28
- @@in_client_mutex = false
29
- end
30
- return result
31
- end
32
-
33
- def self.[]=(variable, value)
34
- if value.is_a?(RustDatatype)
35
- value.load_in_r_as(variable.to_s)
36
- elsif value.is_a?(String) || value.is_a?(Numeric) || value.is_a?(Array)
37
- R_ENGINE.assign(variable, value)
38
- else
39
- raise "Given #{value.class}, expected RustDatatype, String, Numeric, or Array"
8
+ class DataFrame < RustDatatype
9
+ def self.can_pull?(type, klass)
10
+ return [klass].flatten.include?("data.frame")
40
11
  end
41
12
 
42
- end
43
-
44
- def self.[](variable, type=RustDatatype)
45
- return type.pull_variable(variable)
46
- end
47
-
48
- def self._eval_big(r_command, return_warnings = false)
49
- r_command = r_command.join("\n") if r_command.is_a?(Array)
50
-
51
- self._rexec(r_command, return_warnings) do |cmd|
52
- result = true
53
- instructions = cmd.lines
54
-
55
- while instructions.size > 0
56
- current_command = ""
57
-
58
- while (instructions.size > 0) && (current_command.length + instructions.first.length < 10000)
59
- current_command << instructions.shift
60
- end
61
-
62
- result &= R_ENGINE.eval(current_command)
63
- end
64
-
65
- result
66
- end
67
- end
68
-
69
- def self._pull(r_command, return_warnings = false)
70
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.pull(cmd) }
71
- end
72
-
73
- def self._eval(r_command, return_warnings = false)
74
- self._rexec(r_command, return_warnings) { |cmd| R_ENGINE.eval(cmd) }
75
- end
76
-
77
- def self._rexec(r_command, return_warnings = false)
78
- puts "Calling _rexec with command: #{r_command}" if @@debugging
79
- R_MUTEX.synchronize do
80
- assert("This command must be executed in an exclusive block") { @@in_client_mutex }
81
-
82
- result = nil
83
- begin
84
- $stdout = StringIO.new
85
- if return_warnings
86
- R_ENGINE.echo(true, true)
87
- else
88
- R_ENGINE.echo(false, false)
89
- end
90
- result = yield(r_command)
91
- ensure
92
- R_ENGINE.echo(false, false)
93
- warnings = $stdout.string
94
- $stdout = STDOUT
95
- end
96
-
97
- if return_warnings
98
- return result, warnings.lines.map { |w| w.strip.chomp }
99
- else
100
- return result
101
- end
102
- end
103
- end
104
-
105
- class RustDatatype
106
- def self.pull_variable(variable)
107
- return Rust._pull(variable)
13
+ def self.pull_priority
14
+ 1
108
15
  end
109
16
 
110
- def load_in_r_as(r_instance, variable_name)
111
- raise "Not implemented"
112
- end
113
- end
114
-
115
- class DataFrame < RustDatatype
116
- def self.pull_variable(variable)
17
+ def self.pull_variable(variable, type, klass)
117
18
  hash = {}
118
- colnames = Rust._pull("colnames(#{variable})")
19
+ colnames = Rust["colnames(#{variable})"]
119
20
  colnames.each do |col|
120
- hash[col] = Rust._pull("#{variable}$#{col}")
21
+ hash[col] = Rust["#{variable}$\"#{col}\""]
121
22
  end
122
23
  return DataFrame.new(hash)
123
24
  end
124
25
 
26
+ ##
27
+ # Creates a new data-frame.
28
+ # +labels_or_data+ can be either:
29
+ # - an Array of column names (creates an empty data-frame)
30
+ # - a Hash with column names as keys and values as values
31
+
125
32
  def initialize(labels_or_data)
126
33
  @data = {}
127
34
 
@@ -130,10 +37,16 @@ module Rust
130
37
  @labels.each { |label| @data[label] = [] }
131
38
  elsif labels_or_data.is_a? Hash
132
39
  @labels = labels_or_data.keys.map { |l| l.to_s }
133
- @data = labels_or_data.clone
40
+
41
+ labels_or_data.each do |key, value|
42
+ @data[key.to_s] = value.clone
43
+ end
134
44
  end
135
45
  end
136
46
 
47
+ ##
48
+ # Returns the +i+-th row of the data-frame
49
+
137
50
  def row(i)
138
51
  if i < 0 || i >= self.rows
139
52
  return nil
@@ -142,6 +55,20 @@ module Rust
142
55
  end
143
56
  end
144
57
 
58
+ ##
59
+ # Returns the +i+-th row of the data-frame. Faster (but harder to interpret) alternative to #row.
60
+
61
+ def fast_row(i)
62
+ if i < 0 || i >= self.rows
63
+ return nil
64
+ else
65
+ return @labels.map { |label| @data[label][i] }
66
+ end
67
+ end
68
+
69
+ ##
70
+ # Shuffles the rows in the data-frame. The arguments are passed to the Array#shuffle method.
71
+
145
72
  def shuffle(*args)
146
73
  result = DataFrame.new(@labels)
147
74
 
@@ -156,6 +83,10 @@ module Rust
156
83
  return result
157
84
  end
158
85
 
86
+ ##
87
+ # Returns a copy of the data-frame containing only the specified +rows+ and/or +cols+. If +rows+ and/or +cols+
88
+ # are nil, all the rows/columns are returned.
89
+
159
90
  def [](rows, cols=nil)
160
91
  raise "You must specify either rows or columns to select" if !rows && !cols
161
92
  result = self
@@ -171,9 +102,16 @@ module Rust
171
102
  return result
172
103
  end
173
104
 
105
+ ##
106
+ # Return the column named +name+.
107
+
174
108
  def column(name)
175
109
  return @data[name]
176
110
  end
111
+ alias :| :column
112
+
113
+ ##
114
+ # Renames the column named +old_name+ to +new_name+.
177
115
 
178
116
  def rename_column!(old_name, new_name)
179
117
  raise "This DataFrame does not contain a column named #{old_name}" unless @labels.include?(old_name)
@@ -183,10 +121,24 @@ module Rust
183
121
  @labels[@labels.index(old_name)] = new_name
184
122
  end
185
123
 
124
+ ##
125
+ # Functionally transforms the column named +column+ by applying the function given as a block.
126
+ # Example:
127
+ # df = Rust::DataFrame.new({a: [1,2,3], b: [3,4,5]})
128
+ # df.transform_column!("a") { |v| v + 1 }
129
+ # df|"a" # => [2, 3, 4]
130
+
186
131
  def transform_column!(column)
187
132
  @data[column].map! { |e| yield e }
188
133
  end
189
134
 
135
+ ##
136
+ # Returns a copy of the data-frame with only the rows for which the function given in the block returns true.
137
+ # Example:
138
+ # df = Rust::DataFrame.new({a: [1,2,3], b: ['a','b','c']})
139
+ # df2 = df.select_rows { |r| r['a'].even? }
140
+ # df2|"b" # => ['b']
141
+
190
142
  def select_rows
191
143
  result = DataFrame.new(self.column_names)
192
144
  self.each_with_index do |row, i|
@@ -195,6 +147,20 @@ module Rust
195
147
  return result
196
148
  end
197
149
 
150
+ ##
151
+ # Returns true if the function given in the block returns true for any of the rows in this data-frame.
152
+
153
+ def has_row?
154
+ self.each_with_index do |row, i|
155
+ return true if yield row, i
156
+ end
157
+ return false
158
+ end
159
+
160
+ ##
161
+ # Returns a copy of the data-frame with only the columns in +cols+. As an alternative, a block can be used
162
+ # (only the columns for which the function returns true are kept).
163
+
198
164
  def select_columns(cols=nil)
199
165
  raise "You must specify either the columns you want to select or a selection block" if !cols && !block_given?
200
166
 
@@ -210,24 +176,84 @@ module Rust
210
176
  end
211
177
  alias :select_cols :select_columns
212
178
 
179
+ ##
180
+ # Deletes the column named +column+.
181
+
213
182
  def delete_column(column)
214
183
  @labels.delete(column)
215
184
  @data.delete(column)
216
185
  end
217
186
 
187
+ ##
188
+ # Deletes the +i+-th row.
189
+
190
+ def delete_row(i)
191
+ @data.each do |label, column|
192
+ column.delete_at(i)
193
+ end
194
+ end
195
+
196
+ ##
197
+ # Returns a data-frame in which the rows are unique in terms of all the given columns named +by+.
198
+
199
+ def uniq_by(by)
200
+ result = self.clone
201
+ result.uniq_by!(by)
202
+ return result
203
+ end
204
+
205
+ ##
206
+ # Makes sure that in this data-frame the rows are unique in terms of all the given columns named +by+.
207
+
208
+ def uniq_by!(by)
209
+ my_keys = {}
210
+ to_delete = []
211
+ self.each_with_index do |row, i|
212
+ key = []
213
+ by.each do |colname|
214
+ key << row[colname]
215
+ end
216
+ unless my_keys[key]
217
+ my_keys[key] = i
218
+ else
219
+ to_delete << (i-to_delete.size)
220
+ end
221
+ end
222
+
223
+ to_delete.each do |i|
224
+ self.delete_row(i)
225
+ end
226
+
227
+ return self
228
+ end
229
+
230
+ ##
231
+ # Return the names of the columns.
232
+
218
233
  def column_names
219
234
  return @labels.map { |k| k.to_s }
220
235
  end
221
236
  alias :colnames :column_names
222
237
 
238
+ ##
239
+ # Returns the number of rows.
240
+
223
241
  def rows
224
242
  @data.values[0].size
225
243
  end
226
244
 
245
+ ##
246
+ # Returns the number of columns
247
+
227
248
  def columns
228
249
  @labels.size
229
250
  end
230
251
 
252
+ ##
253
+ # Adds the given +row+ to the data-frame. +row+ can be either:
254
+ # - An Array of values for all the columns (in the order of #column_names);
255
+ # - A Hash containing associations between column names and value to be set.
256
+
231
257
  def add_row(row)
232
258
  if row.is_a?(Array)
233
259
  raise "Expected an array of size #{@data.size}" unless row.size == @data.size
@@ -243,7 +269,7 @@ module Rust
243
269
  row.each do |key, value|
244
270
  @data[key.to_s] << value
245
271
  end
246
- #
272
+
247
273
  return true
248
274
  else
249
275
  raise TypeError, "Expected an Array or a Hash"
@@ -251,6 +277,11 @@ module Rust
251
277
  end
252
278
  alias :<< :add_row
253
279
 
280
+ ##
281
+ # Adds a column named +name+ with the given +values+ (array). The size of +values+ must match the number of
282
+ # rows of this data-frame. As an alternative, it can be passed a block which returns, for a given row, the
283
+ # value to assign for the new column.
284
+
254
285
  def add_column(name, values=nil)
255
286
  raise "Column already exists" if @labels.include?(name)
256
287
  raise "Values or block required" if !values && !block_given?
@@ -267,6 +298,9 @@ module Rust
267
298
  end
268
299
  end
269
300
 
301
+ ##
302
+ # Yields each row as a Hash containing column names as keys and values as values.
303
+
270
304
  def each
271
305
  self.each_with_index do |element, i|
272
306
  yield element
@@ -275,6 +309,21 @@ module Rust
275
309
  return self
276
310
  end
277
311
 
312
+ ##
313
+ # Yields each row as an Array of values (in the order of the column names). Faster alternative to
314
+ # #each.
315
+
316
+ def fast_each
317
+ self.fast_each_with_index do |element, i|
318
+ yield element
319
+ end
320
+
321
+ return self
322
+ end
323
+
324
+ ##
325
+ # Yields each row as a Hash containing column names as keys and values as values and the row index.
326
+
278
327
  def each_with_index
279
328
  for i in 0...self.rows
280
329
  element = {}
@@ -288,6 +337,23 @@ module Rust
288
337
  return self
289
338
  end
290
339
 
340
+ ##
341
+ # Yields each row as an Array of values (in the order of the column names) and the row index. Faster
342
+ # alternative to #each_with_index.
343
+
344
+ def fast_each_with_index
345
+ for i in 0...self.rows
346
+ element = []
347
+ @labels.each do |label|
348
+ element << @data[label][i]
349
+ end
350
+
351
+ yield element, i
352
+ end
353
+
354
+ return self
355
+ end
356
+
291
357
  def load_in_r_as(variable_name)
292
358
  command = []
293
359
 
@@ -299,6 +365,14 @@ module Rust
299
365
  row_index += 1
300
366
  end
301
367
 
368
+ self.column_names.each do |name|
369
+ column = self.column(name)
370
+
371
+ if column.is_a?(Factor)
372
+ command << "#{variable_name}[,#{name.to_R}] <- factor(#{variable_name}[,#{name.to_R}], labels=#{column.levels.to_R})"
373
+ end
374
+ end
375
+
302
376
  Rust._eval_big(command)
303
377
  end
304
378
 
@@ -323,6 +397,9 @@ module Rust
323
397
  return result
324
398
  end
325
399
 
400
+ ##
401
+ # Returns a copy of the data-frame containing only the first +n+ rows.
402
+
326
403
  def head(n=10)
327
404
  result = DataFrame.new(self.column_names)
328
405
  self.each_with_index do |row, i|
@@ -331,6 +408,11 @@ module Rust
331
408
  return result
332
409
  end
333
410
 
411
+ ##
412
+ # Merges this data-frame with +other+ in terms of the +by+ columns (Array of Strings).
413
+ # +first_alias+ and +second_alias+ allow specifying the prefix that should be used for the columns not in +by+
414
+ # for this and the +other+ data-frame, respectively.
415
+
334
416
  def merge(other, by, first_alias = "x", second_alias = "y")
335
417
  raise TypeError, "Expected Rust::DataFrame" unless other.is_a?(DataFrame)
336
418
  raise TypeError, "Expected list of strings" if !by.is_a?(Array) || !by.all? { |e| e.is_a?(String) }
@@ -397,6 +479,94 @@ module Rust
397
479
  return result
398
480
  end
399
481
 
482
+ ##
483
+ # Aggregate the value in groups depending on the +by+ column (String).
484
+ # A block must be passed to specify how to aggregate the columns. Aggregators for specific columns can be
485
+ # specified as optional arguments in which the name of the argument represents the column name and the value
486
+ # contains a block for aggregating the specific column.
487
+ # Both the default and the specialized blocks must take as argument an array of values and must return a
488
+ # scalar value.
489
+
490
+ def aggregate(by, **aggregators)
491
+ raise TypeError, "Expected a string" unless by.is_a?(String)
492
+ raise TypeError, "All the aggregators should be procs" unless aggregators.values.all? { |v| v.is_a?(Proc) }
493
+ raise "Expected a block for default aggregator" unless block_given?
494
+
495
+ aggregators = aggregators.map { |label, callable| [label.to_s, callable] }.to_h
496
+
497
+ sorted = self.sort_by(by)
498
+
499
+ current_value = nil
500
+ partials = []
501
+ partial = nil
502
+ sorted.column(by).each_with_index do |value, index|
503
+ if current_value != value
504
+ current_value = value
505
+ partials << partial if partial
506
+ partial = Rust::DataFrame.new(self.column_names)
507
+ end
508
+ partial << sorted.fast_row(index)
509
+ end
510
+ partials << partial
511
+
512
+ result = Rust::DataFrame.new(self.column_names)
513
+ partials.each do |partial|
514
+ aggregated_row = {}
515
+ aggregated_row[by] = partial.column(by)[0]
516
+ (self.column_names - [by]).each do |column|
517
+ if aggregators[column]
518
+ aggregated_row[column] = aggregators[column].call(partial.column(column))
519
+ else
520
+ aggregated_row[column] = yield partial.column(column)
521
+ end
522
+ end
523
+
524
+ result << aggregated_row
525
+ end
526
+
527
+ return result
528
+ end
529
+
530
+ ##
531
+ # Returns a copy of this data-frame in which the rows are sorted by the values of the column named +column+.
532
+
533
+ def sort_by(column)
534
+ result = self.clone
535
+ result.sort_by!(column)
536
+ return result
537
+ end
538
+
539
+ ##
540
+ # Sorts the rows of this data-frame by the values of the +by+ column.
541
+
542
+ def sort_by!(by)
543
+ copy = @data[by].clone
544
+ copy.sort!
545
+
546
+ indices = []
547
+ @data[by].each_with_index do |value, i|
548
+ index = copy.index(value)
549
+ indices << index
550
+
551
+ copy[index] = NilClass
552
+ end
553
+
554
+ (self.column_names - [by]).each do |column_name|
555
+ sorted = []
556
+ column = self.column(column_name)
557
+ column_i = 0
558
+ indices.each do |i|
559
+ sorted[i] = column[column_i]
560
+ column_i += 1
561
+ end
562
+ @data[column_name] = sorted
563
+ end
564
+ @data[by].sort!
565
+ end
566
+
567
+ ##
568
+ # Adds all the rows in +dataframe+ to this data-frame. The column names must match.
569
+
400
570
  def bind_rows!(dataframe)
401
571
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
402
572
  raise "The columns are not compatible: #{self.column_names - dataframe.column_names} - #{dataframe.column_names - self.column_names}" unless (self.column_names & dataframe.column_names).size == self.columns
@@ -409,6 +579,9 @@ module Rust
409
579
  end
410
580
  alias :rbind! :bind_rows!
411
581
 
582
+ ##
583
+ # Adds all the columns in +dataframe+ to this data-frame. The number of rows must match.
584
+
412
585
  def bind_columns!(dataframe)
413
586
  raise TypeError, "DataFrame expected" unless dataframe.is_a?(DataFrame)
414
587
  raise "The number of rows are not compatible" if self.rows != dataframe.rows
@@ -422,6 +595,9 @@ module Rust
422
595
  end
423
596
  alias :cbind! :bind_columns!
424
597
 
598
+ ##
599
+ # Returns a copy of this dataframe and adds all the rows in +dataframe+ to it. The column names must match.
600
+
425
601
  def bind_rows(dataframe)
426
602
  result = self.clone
427
603
  result.bind_rows!(dataframe)
@@ -429,6 +605,9 @@ module Rust
429
605
  end
430
606
  alias :rbind :bind_rows
431
607
 
608
+ ##
609
+ # Returns a copy of this dataframe and adds all the columns in +dataframe+ to it. The number of rows must match.
610
+
432
611
  def bind_columns(dataframe)
433
612
  result = self.clone
434
613
  result.bind_columns!(dataframe)
@@ -436,152 +615,53 @@ module Rust
436
615
  end
437
616
  alias :cbind :bind_columns
438
617
 
618
+ ##
619
+ # Returns a copy of this data-frame.
620
+
439
621
  def clone
440
622
  DataFrame.new(@data)
441
623
  end
442
624
  end
443
625
 
444
- class Matrix < RustDatatype
445
- def self.pull_variable(variable)
446
- return Rust._pull(variable)
447
- end
448
-
449
- def initialize(data)
450
- if data.flatten.size == 0
451
- raise "Empty matrices are not allowed"
452
- else
453
- raise TypeError, "Expected array of array" unless data.is_a?(Array) && data[0].is_a?(Array)
454
- raise TypeError, "Only numeric matrices are supported" unless data.all? { |row| row.all? { |e| e.is_a?(Numeric) } }
455
- raise "All the rows must have the same size" unless data.map { |row| row.size }.uniq.size == 1
456
- @data = data.clone
457
- end
458
- end
459
-
460
- def [](i, j)
461
- return @data[i][j]
462
- end
463
-
464
- def rows
465
- @data.size
466
- end
467
-
468
- def cols
469
- @data[0].size
470
- end
626
+ ##
627
+ # Represents an array of DataFrame
628
+
629
+ class DataFrameArray < Array
471
630
 
472
- def []=(i, j, value)
473
- raise "Wrong i" unless i.between?(0, @data.size - 1)
474
- raise "Wrong j" unless j.between?(0, @data[0].size - 1)
475
- @data[i][j] = value
476
- end
631
+ ##
632
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
477
633
 
478
- def load_in_r_as(variable_name)
479
- Rust._eval("#{variable_name} <- matrix(c(#{@data.flatten.join(",")}), nrow=#{self.rows}, ncol=#{self.cols}, byrow=T)")
634
+ def bind_all
635
+ return nil if self.size == 0
636
+
637
+ result = self.first.clone
638
+
639
+ for i in 1...self.size
640
+ result .bind_rows!(self[i])
641
+ end
642
+
643
+ return result
480
644
  end
481
645
  end
482
646
 
483
- class Sequence
484
- attr_reader :min
485
- attr_reader :max
486
-
487
- def initialize(min, max, step=1)
488
- @min = min
489
- @max = max
490
- @step = step
491
- end
492
-
493
- def step(step)
494
- @step = step
495
- end
647
+ ##
648
+ # Represents a hash of DataFrame
649
+
650
+ class DataFrameHash < Hash
496
651
 
497
- def each
498
- (@min..@max).step(@step) do |v|
499
- yield v
500
- end
501
- end
652
+ ##
653
+ # Returns a data-frame with the rows in all the data-frames together (if compatible).
502
654
 
503
- def to_a
504
- result = []
505
- self.each do |v|
506
- result << v
655
+ def bind_all
656
+ return nil if self.values.size == 0
657
+
658
+ result = self.values.first.clone
659
+
660
+ for i in 1...self.values.size
661
+ result .bind_rows!(self.values[i])
507
662
  end
663
+
508
664
  return result
509
665
  end
510
-
511
- def to_R
512
- "seq(from=#@min, to=#@max, by=#@step)"
513
- end
514
- end
515
- end
516
-
517
- class TrueClass
518
- def to_R
519
- "TRUE"
520
- end
521
- end
522
-
523
- class FalseClass
524
- def to_R
525
- "FALSE"
526
- end
527
- end
528
-
529
- class Object
530
- def to_R
531
- raise TypeError, "Unsupported type for #{self.class}"
532
666
  end
533
667
  end
534
-
535
- class NilClass
536
- def to_R
537
- return "NULL"
538
- end
539
- end
540
-
541
- class Numeric
542
- def to_R
543
- self.inspect
544
- end
545
- end
546
-
547
- class Float
548
- def to_R
549
- return self.nan? ? "NA" : super
550
- end
551
- end
552
-
553
- class Array
554
- def to_R
555
- return "c(#{self.map { |e| e.to_R }.join(",")})"
556
- end
557
- end
558
-
559
- class String
560
- def to_R
561
- return self.inspect
562
- end
563
- end
564
-
565
- class Range
566
- def to_R
567
- [range.min, range.max].to_R
568
- end
569
- end
570
-
571
- module Rust::RBindings
572
- def read_csv(filename, **options)
573
- Rust::CSV.read(filename, **options)
574
- end
575
-
576
- def write_csv(filename, dataframe, **options)
577
- Rust::CSV.write(filename, dataframe, **options)
578
- end
579
-
580
- def data_frame(*args)
581
- Rust::DataFrame.new(*args)
582
- end
583
- end
584
-
585
- def bind_r!
586
- include Rust::RBindings
587
- end