galaaz 0.4.7 → 0.4.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1179 -39
- data/Rakefile +4 -2
- data/bin/grun +1 -1
- data/bin/gstudio +1 -1
- data/blogs/dev/dev.Rmd +2 -56
- data/blogs/dev/dev.md +32 -61
- data/blogs/dev/dev2.Rmd +65 -0
- data/blogs/dplyr/dplyr.Rmd +29 -0
- data/blogs/{dev/dev.html → dplyr/dplyr.html} +88 -57
- data/blogs/dplyr/dplyr.md +58 -0
- data/blogs/gknit/gknit.html +1262 -25
- data/blogs/gknit/gknit.md +471 -27
- data/blogs/gknit/gknit_files/figure-html/bubble-1.png +0 -0
- data/blogs/manual/graph.rb +29 -0
- data/blogs/manual/manual.Rmd +567 -29
- data/blogs/manual/manual.html +743 -46
- data/blogs/manual/manual.md +1179 -39
- data/blogs/nse_dplyr/nse_dplyr.Rmd +466 -11
- data/blogs/nse_dplyr/nse_dplyr.html +472 -37
- data/blogs/nse_dplyr/nse_dplyr.md +645 -32
- data/blogs/ruby_plot/ruby_plot.Rmd +4 -4
- data/blogs/ruby_plot/ruby_plot.html +217 -2
- data/blogs/ruby_plot/ruby_plot.md +226 -1
- data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.svg +2 -2
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.svg +70 -70
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.svg +72 -72
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.svg +116 -116
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.svg +176 -176
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_decorations.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.svg +236 -236
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.svg +176 -176
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.svg +160 -160
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.svg +105 -105
- data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.svg +121 -121
- data/examples/islr/ch2.spec.rb +1 -1
- data/examples/islr/ch3_boston.rb +4 -4
- data/examples/islr/x_y_rnorm.jpg +0 -0
- data/lib/R_interface/r.rb +1 -1
- data/lib/R_interface/r_methods.rb +2 -2
- data/lib/R_interface/rdata_frame.rb +8 -5
- data/lib/R_interface/rindexed_object.rb +1 -2
- data/lib/R_interface/rlist.rb +1 -0
- data/lib/R_interface/robject.rb +0 -1
- data/lib/R_interface/rpkg.rb +14 -6
- data/lib/R_interface/rsupport.rb +7 -9
- data/lib/R_interface/ruby_extensions.rb +17 -5
- data/lib/gknit/knitr_engine.rb +9 -2
- data/lib/util/exec_ruby.rb +2 -2
- data/specs/r_dataframe.spec.rb +173 -0
- data/specs/r_list.spec.rb +4 -4
- data/specs/ruby_expression.spec.rb +2 -11
- data/specs/tmp.rb +76 -34
- data/version.rb +1 -1
- metadata +17 -6
- data/blogs/dev/dev_files/figure-html/bubble-1.png +0 -0
- data/blogs/dev/dev_files/figure-html/diverging_bar. +0 -0
- data/blogs/dev/dev_files/figure-html/diverging_bar.png +0 -0
@@ -19,6 +19,79 @@ output:
|
|
19
19
|
|
20
20
|
# Introduction
|
21
21
|
|
22
|
+
In this post we will see how to program with dplyr in Galaaz.
|
23
|
+
|
24
|
+
### But first, what is Galaaz??
|
25
|
+
|
26
|
+
Galaaz is a system for tightly coupling Ruby and R. Ruby is a powerful language, with
|
27
|
+
a large community, a very large set of libraries and great for web development. However,
|
28
|
+
it lacks libraries for data science, statistics, scientific plotting and machine learning.
|
29
|
+
On the other hand, R is considered one of the most powerful languages for solving all of the
|
30
|
+
above problems. Maybe the strongest competitor to R is Python with libraries such as NumPy,
|
31
|
+
Pandas, SciPy, SciKit-Learn and a couple more.
|
32
|
+
|
33
|
+
With Galaaz we do not intend to re-implement any of the scientific libraries in R. However, we
|
34
|
+
allow for very tight coupling between the two languages to the point that the Ruby
|
35
|
+
developer does not need to know that there is an R engine running. For this to happen we
|
36
|
+
use new technologies provided by Oracle: GraalVM, TruffleRuby and FastR:
|
37
|
+
|
38
|
+
GraalVM is a universal virtual machine for running applications
|
39
|
+
written in JavaScript, Python 3, Ruby, R, JVM-based languages like Java,
|
40
|
+
Scala, Kotlin, and LLVM-based languages such as C and C++.
|
41
|
+
|
42
|
+
GraalVM removes the isolation between programming languages and enables
|
43
|
+
interoperability in a shared runtime. It can run either standalone or in
|
44
|
+
the context of OpenJDK, Node.js, Oracle Database, or MySQL.
|
45
|
+
|
46
|
+
GraalVM allows you to write polyglot applications with a seamless way to
|
47
|
+
pass values from one language to another. With GraalVM there is no copying
|
48
|
+
or marshaling necessary as it is with other polyglot systems. This lets
|
49
|
+
you achieve high performance when language boundaries are crossed. Most
|
50
|
+
of the time there is no additional cost for crossing a language boundary
|
51
|
+
at all.
|
52
|
+
|
53
|
+
Often developers have to make uncomfortable compromises that require them
|
54
|
+
to rewrite their software in other languages. For example:
|
55
|
+
|
56
|
+
* “That library is not available in my language. I need to rewrite it.”
|
57
|
+
* “That language would be the perfect fit for my problem, but we cannot
|
58
|
+
run it in our environment.”
|
59
|
+
* “That problem is already solved in my language, but the language is
|
60
|
+
too slow.”
|
61
|
+
|
62
|
+
With GraalVM we aim to allow developers to freely choose the right language
|
63
|
+
for the task at hand without making compromises.
|
64
|
+
|
65
|
+
Interested readers should also check out the following sites:
|
66
|
+
|
67
|
+
* [GraalVM Home](https://www.graalvm.org/)
|
68
|
+
* [TruffleRuby](https://github.com/oracle/truffleruby)
|
69
|
+
* [FastR](https://github.com/oracle/fastr)
|
70
|
+
* [Faster R with FastR](https://medium.com/graalvm/faster-r-with-fastr-4b8db0e0dceb)
|
71
|
+
|
72
|
+
### Now to programming with dplyr
|
73
|
+
|
74
|
+
According to Hadley (https://dplyr.tidyverse.org/articles/programming.html)
|
75
|
+
|
76
|
+
> Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that
|
77
|
+
> means they don’t follow the usual R rules of evaluation. Instead, they capture the
|
78
|
+
> expression that you typed and evaluate it in a custom way. This has two main
|
79
|
+
> benefits for dplyr code:
|
80
|
+
|
81
|
+
> Operations on data frames can be expressed succinctly because you don’t need to repeat
|
82
|
+
> the name of the data frame. For example, you can write filter(df, x == 1, y == 2, z == 3)
|
83
|
+
> instead of df[df$x == 1 & df$y ==2 & df$z == 3, ].
|
84
|
+
|
85
|
+
> dplyr can choose to compute results in a different way to base R. This is important for
|
86
|
+
> database backends because dplyr itself doesn’t do any work, but instead generates the SQL
|
87
|
+
> that tells the database what to do.
|
88
|
+
|
89
|
+
> Unfortunately these benefits do not come for free. There are two main drawbacks:
|
90
|
+
|
91
|
+
> Most dplyr arguments are not referentially transparent. That means you can’t replace a value
|
92
|
+
> with a seemingly equivalent object that you’ve defined elsewhere. In other words, this code:
|
93
|
+
|
94
|
+
|
22
95
|
|
23
96
|
```r
|
24
97
|
df <- data.frame(x = 1:3, y = 3:1)
|
@@ -47,6 +120,7 @@ print(filter(df, x == 1))
|
|
47
120
|
#> <int> <int>
|
48
121
|
#> 1 1 3
|
49
122
|
```
|
123
|
+
> Is not equivalent to this code:
|
50
124
|
|
51
125
|
|
52
126
|
```r
|
@@ -55,13 +129,149 @@ my_var <- x
|
|
55
129
|
filter(df, my_var == 1)
|
56
130
|
#> Error: object 'my_var' not found
|
57
131
|
```
|
132
|
+
> This makes it hard to create functions with arguments that change how dplyr verbs are computed.
|
133
|
+
|
134
|
+
# Writing Expressions in Galaaz
|
135
|
+
|
136
|
+
Galaaz extends Ruby to work with complex expressions, similar to R's expressions built with 'quote'
|
137
|
+
(base R) or 'quo' (tidyverse). Let's take a look at some of those expressions.
|
138
|
+
|
139
|
+
## Expressions from operators
|
140
|
+
|
141
|
+
The code below
|
142
|
+
creates an expression summing two symbols
|
143
|
+
|
144
|
+
|
145
|
+
```ruby
|
146
|
+
exp1 = :a + :b
|
147
|
+
puts exp1
|
148
|
+
```
|
149
|
+
|
150
|
+
```
|
151
|
+
## a + b
|
152
|
+
```
|
153
|
+
We can build any complex mathematical expression
|
154
|
+
|
155
|
+
|
156
|
+
```ruby
|
157
|
+
exp2 = (:a + :b) * 2.0 + :c ** 2 / :z
|
158
|
+
puts exp2
|
159
|
+
```
|
160
|
+
|
161
|
+
```
|
162
|
+
## (a + b) * 2 + c^2L/z
|
163
|
+
```
|
164
|
+
|
165
|
+
It is also possible to use inequality operators in building expressions
|
166
|
+
|
167
|
+
|
168
|
+
```ruby
|
169
|
+
exp3 = (:a + :b) >= :z
|
170
|
+
puts exp3
|
171
|
+
```
|
172
|
+
|
173
|
+
```
|
174
|
+
## a + b >= z
|
175
|
+
```
|
176
|
+
|
177
|
+
Galaaz provides both symbolic representations for operators, such as (>, <, !=), and functional
|
178
|
+
notation for those operators such as (.gt, .ge, etc.). So the same expression written
|
179
|
+
above can also be written as
|
180
|
+
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
exp4 = (:a + :b).ge :z
|
184
|
+
puts exp4
|
185
|
+
```
|
186
|
+
|
187
|
+
```
|
188
|
+
## a + b >= z
|
189
|
+
```
|
190
|
+
|
191
|
+
Two types of expressions can only be created with the functional representation of the operators,
|
192
|
+
those are expressions involving '==', and '='. In order to write an expression involving '==' we
|
193
|
+
need to use the method '.eq' and for '=' we need the function '.assign'
|
194
|
+
|
195
|
+
|
196
|
+
```ruby
|
197
|
+
exp5 = (:a + :b).eq :z
|
198
|
+
puts exp5
|
199
|
+
```
|
200
|
+
|
201
|
+
```
|
202
|
+
## a + b == z
|
203
|
+
```
|
204
|
+
|
205
|
+
|
206
|
+
```ruby
|
207
|
+
exp6 = :y.assign :a + :b
|
208
|
+
puts exp6
|
209
|
+
```
|
210
|
+
|
211
|
+
```
|
212
|
+
## y <- a + b
|
213
|
+
```
|
214
|
+
In general we think that using the functional notation is preferable to using the
|
215
|
+
symbolic notation as otherwise, we end up writing invalid expressions such as
|
216
|
+
|
217
|
+
|
218
|
+
```ruby
|
219
|
+
exp_wrong = (:a + :b) == :z
|
220
|
+
puts exp_wrong
|
221
|
+
```
|
222
|
+
|
223
|
+
```
|
224
|
+
## Message:
|
225
|
+
## Error in function (x, y, num.eq = TRUE, single.NA = TRUE, attrib.as.set = TRUE, :
|
226
|
+
## object 'a' not found (RError)
|
227
|
+
## Translated to internal error
|
228
|
+
```
|
229
|
+
and it might be difficult to understand what is going on here. The problem lies with the fact that
|
230
|
+
when using '==' we are comparing expression (:a + :b) to expression :z with '=='. When the
|
231
|
+
comparison is executed, the system tries to evaluate :a, :b and :z, and those symbols, at
|
232
|
+
this time are not bound to anything and we get an "object 'a' not found" message.
|
233
|
+
If we only use functional notation, this type of error will never occur.
|
234
|
+
|
235
|
+
## Expressions with R methods
|
236
|
+
|
237
|
+
It is often necessary to create an expression that uses a method or function. For instance, in
|
238
|
+
mathematics, it's quite natural to write an expression such as $y = sin(x)$. In this case, the
|
239
|
+
'sin' function is part of the expression and should not be immediately executed. Now, let's say
|
240
|
+
that 'x' is an angle of 45 radians and we actually want our expression to be $y = 0.850...$.
|
241
|
+
When we want the function to be part of the expression, we call the function preceding it
|
242
|
+
by the letter E, such as 'E.sin(x)'
|
243
|
+
|
244
|
+
|
245
|
+
```ruby
|
246
|
+
exp7 = :y.assign E.sin(:x)
|
247
|
+
puts exp7
|
248
|
+
```
|
249
|
+
|
250
|
+
```
|
251
|
+
## y <- sin(x)
|
252
|
+
```
|
253
|
+
However, if we want the function to be evaluated, then
|
254
|
+
we use the normal call to function with R as 'R.sin(x)'.
|
255
|
+
|
256
|
+
|
257
|
+
```ruby
|
258
|
+
x = 45
|
259
|
+
exp8 = :y.assign R.sin(x)
|
260
|
+
puts exp8
|
261
|
+
```
|
262
|
+
|
263
|
+
```
|
264
|
+
## y <- 0.850903524534118
|
265
|
+
```
|
266
|
+
# Filtering using expressions
|
267
|
+
|
268
|
+
Now that we know how to write expressions, we can use them to filter a data frame by expressions.
|
269
|
+
Let's first start by creating a simple data frame with two columns named 'x' and 'y'
|
58
270
|
|
59
271
|
|
60
272
|
```ruby
|
61
273
|
@df = R.data__frame(x: (1..3), y: (3..1))
|
62
274
|
puts @df
|
63
|
-
|
64
|
-
puts @df.filter(:x.eq 1)
|
65
275
|
```
|
66
276
|
|
67
277
|
```
|
@@ -69,10 +279,37 @@ puts @df.filter(:x.eq 1)
|
|
69
279
|
## 1 1 3
|
70
280
|
## 2 2 2
|
71
281
|
## 3 3 1
|
282
|
+
```
|
283
|
+
In the code below we want to filter the data frame by rows in which the value of 'x' is
|
284
|
+
equal to 1.
|
285
|
+
|
286
|
+
|
287
|
+
```ruby
|
288
|
+
puts @df.filter(:x.eq 1)
|
289
|
+
```
|
290
|
+
|
291
|
+
```
|
72
292
|
## x y
|
73
293
|
## 1 1 3
|
74
294
|
```
|
75
295
|
|
296
|
+
In R, and when coding with 'tidyverse', arguments to a function are usually not
|
297
|
+
*referentially transparent*. That is, you can’t replace a value with a seemingly equivalent
|
298
|
+
object that you’ve defined elsewhere. In other words, this code
|
299
|
+
|
300
|
+
|
301
|
+
```r
|
302
|
+
my_var <- x
|
303
|
+
filter(df, my_var == 1)
|
304
|
+
```
|
305
|
+
Generates the following error: "object 'x' not found".
|
306
|
+
|
307
|
+
However, in Ruby and Galaaz, arguments are referentially transparent as can be seen by the
|
308
|
+
code below. Note, initially, that 'my_var = :x' will not give the error "object 'x' not found"
|
309
|
+
since ':x' is treated as an expression and assigned to my\_var. Then when doing (my\_var.eq 1),
|
310
|
+
my\_var is a variable that resolves to ':x' and it becomes equivalent to (:x.eq 1) which is
|
311
|
+
what we want.
|
312
|
+
|
76
313
|
|
77
314
|
```ruby
|
78
315
|
my_var = :x
|
@@ -83,9 +320,10 @@ puts @df.filter(my_var.eq 1)
|
|
83
320
|
## x y
|
84
321
|
## 1 1 3
|
85
322
|
```
|
323
|
+
As stated by Hadley
|
86
324
|
|
87
|
-
|
88
|
-
|
325
|
+
> dplyr code is ambiguous. Depending on what variables are defined where,
|
326
|
+
> filter(df, x == y) could be equivalent to any of:
|
89
327
|
|
90
328
|
```
|
91
329
|
df[df$x == df$y, ]
|
@@ -93,44 +331,419 @@ df[df$x == y, ]
|
|
93
331
|
df[x == df$y, ]
|
94
332
|
df[x == y, ]
|
95
333
|
```
|
334
|
+
In galaaz this ambiguity does not exist, filter(df, x.eq y) is not a valid expression as
|
335
|
+
expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements
|
336
|
+
of the 'x' column that are equal to a previously defined y variable. Finally,
|
337
|
+
in filter(df, :x.eq :y) we are looking for elements in which the 'x' column value is equal to
|
338
|
+
the 'y' column value. This can be seen in the following two chunks of code:
|
339
|
+
|
340
|
+
|
341
|
+
```ruby
|
342
|
+
@y = 1
|
343
|
+
@x = 2
|
344
|
+
|
345
|
+
# looking for values where the 'x' column is equal to the 'y' column
|
346
|
+
puts @df.filter(:x.eq :y)
|
347
|
+
```
|
348
|
+
|
349
|
+
```
|
350
|
+
## x y
|
351
|
+
## 1 2 2
|
352
|
+
```
|
353
|
+
|
354
|
+
|
355
|
+
```ruby
|
356
|
+
# looking for values where the 'x' column is equal to the 'y' variable
|
357
|
+
# in this case, the number 1
|
358
|
+
puts @df.filter(:x.eq @y)
|
359
|
+
```
|
360
|
+
|
361
|
+
```
|
362
|
+
## x y
|
363
|
+
## 1 1 3
|
364
|
+
```
|
365
|
+
# Writing a function that applies to different data sets
|
366
|
+
|
367
|
+
|
368
|
+
|
369
|
+
```
|
370
|
+
mutate(df1, y = a + x)
|
371
|
+
mutate(df2, y = a + x)
|
372
|
+
mutate(df3, y = a + x)
|
373
|
+
mutate(df4, y = a + x)
|
374
|
+
```
|
96
375
|
|
97
|
-
|
376
|
+
Here we create a mutate_y Ruby method.
|
98
377
|
|
99
378
|
|
100
379
|
```ruby
|
101
|
-
|
102
|
-
|
380
|
+
def mutate_y(df)
|
381
|
+
df.mutate(:y.assign :a + :x)
|
382
|
+
end
|
383
|
+
```
|
384
|
+
|
385
|
+
Note that contrary to what happens in R, method mutate_y will fail independently of the fact
|
386
|
+
that variable 'a' is defined or not.
|
103
387
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
388
|
+
|
389
|
+
```ruby
|
390
|
+
df1 = R.data__frame(x: (1..3))
|
391
|
+
puts df1
|
392
|
+
a = 10
|
393
|
+
mutate_y(df1)
|
108
394
|
```
|
109
395
|
|
110
396
|
```
|
111
397
|
## Message:
|
112
|
-
##
|
398
|
+
## Error in mutate_impl(.data, dots) :
|
399
|
+
## Evaluation error: object 'a' not found.
|
400
|
+
## In addition: Warning message:
|
401
|
+
## In mutate_impl(.data, dots) :
|
402
|
+
## mismatched protect/unprotect (unprotect with empty protect stack) (RError)
|
403
|
+
## Translated to internal error
|
113
404
|
```
|
114
405
|
|
406
|
+
# Different expressions
|
407
|
+
|
408
|
+
|
409
|
+
```r
|
410
|
+
df <- data.frame(
|
411
|
+
g1 = c(1, 1, 2, 2, 2),
|
412
|
+
g2 = c(1, 2, 1, 2, 1),
|
413
|
+
a = sample(5),
|
414
|
+
b = sample(5)
|
415
|
+
)
|
416
|
+
|
417
|
+
d2 <- df %>%
|
418
|
+
group_by(g1) %>%
|
419
|
+
summarise(a = mean(a))
|
420
|
+
|
421
|
+
as.data.frame(d2)
|
115
422
|
```
|
116
|
-
|
117
|
-
|
118
|
-
##
|
119
|
-
##
|
120
|
-
##
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
##
|
133
|
-
##
|
134
|
-
##
|
135
|
-
|
423
|
+
|
424
|
+
```
|
425
|
+
## g1 a
|
426
|
+
## 1 1 3
|
427
|
+
## 2 2 3
|
428
|
+
```
|
429
|
+
|
430
|
+
```r
|
431
|
+
d2 <- df %>%
|
432
|
+
group_by(g2) %>%
|
433
|
+
summarise(a = mean(a))
|
434
|
+
|
435
|
+
as.data.frame(d2)
|
436
|
+
```
|
437
|
+
|
438
|
+
```
|
439
|
+
## g2 a
|
440
|
+
## 1 1 3.666667
|
441
|
+
## 2 2 2.000000
|
442
|
+
```
|
443
|
+
|
444
|
+
Trying to write a function in R that will receive two arguments, the first a variable and
|
445
|
+
the second an expression is not trivial. As shown by Hadley, one might expect this function
|
446
|
+
to do the trick:
|
447
|
+
|
448
|
+
|
449
|
+
```r
|
450
|
+
my_summarise <- function(df, group_var) {
|
451
|
+
df %>%
|
452
|
+
group_by(group_var) %>%
|
453
|
+
summarise(a = mean(a))
|
454
|
+
}
|
455
|
+
|
456
|
+
# my_summarise(df, g1)
|
457
|
+
#> Error: Column `group_var` is unknown
|
458
|
+
```
|
459
|
+
|
460
|
+
In order to solve this problem, coding with dplyr requires the introduction of many new concepts
|
461
|
+
and functions such as 'quo', 'quos', 'enquo', 'enquos', '!!' (bang bang), '!!!' (triple bang).
|
462
|
+
|
463
|
+
Now, let's try to implement the same function in galaaz. The next code block first prints the
|
464
|
+
'df' data frame defined previously in R, then creates the my_summarize function and calls it
|
465
|
+
passing the R data frame and the group by variable ':g1'
|
466
|
+
|
467
|
+
|
468
|
+
```ruby
|
469
|
+
puts ~:df
|
470
|
+
print "\n"
|
471
|
+
|
472
|
+
def my_summarize(df, group_var)
|
473
|
+
df.group_by(group_var).
|
474
|
+
summarize(a: E.mean(:a))
|
475
|
+
end
|
476
|
+
|
477
|
+
puts my_summarize((~:df), :g1).as__data__frame
|
478
|
+
```
|
479
|
+
|
480
|
+
```
|
481
|
+
## g1 g2 a b
|
482
|
+
## 1 1 1 5 2
|
483
|
+
## 2 1 2 1 5
|
484
|
+
## 3 2 1 2 4
|
485
|
+
## 4 2 2 3 1
|
486
|
+
## 5 2 1 4 3
|
487
|
+
##
|
488
|
+
## g1 a
|
489
|
+
## 1 1 3
|
490
|
+
## 2 2 3
|
491
|
+
```
|
492
|
+
It works!!! Well let's make sure this was not just some coincidence
|
493
|
+
|
494
|
+
|
495
|
+
```ruby
|
496
|
+
puts my_summarize((~:df), :g2).as__data__frame
|
497
|
+
```
|
498
|
+
|
499
|
+
```
|
500
|
+
## g2 a
|
501
|
+
## 1 1 3.666667
|
502
|
+
## 2 2 2.000000
|
503
|
+
```
|
504
|
+
|
505
|
+
Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby
|
506
|
+
code. If you've ever done NSE in R, this certainly feels much safer and easier to implement.
|
507
|
+
|
508
|
+
# Different input variables
|
509
|
+
|
510
|
+
In the previous section we've managed to get rid of all NSE formulation for a simple example, but
|
511
|
+
does this remain true for more complex examples, or will the Ruby way prove impractical for
|
512
|
+
more complex code?
|
513
|
+
|
514
|
+
In the next example Hadley proposes that we write a function that given an expression such as 'a'
|
515
|
+
or 'a * b', calculates three summaries. What we want is a function that does the same as these R
|
516
|
+
statements:
|
517
|
+
|
518
|
+
```
|
519
|
+
summarise(df, mean = mean(a), sum = sum(a), n = n())
|
520
|
+
#> # A tibble: 1 x 3
|
521
|
+
#> mean sum n
|
522
|
+
#> <dbl> <int> <int>
|
523
|
+
#> 1 3 15 5
|
524
|
+
|
525
|
+
summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
|
526
|
+
#> # A tibble: 1 x 3
|
527
|
+
#> mean sum n
|
528
|
+
#> <dbl> <int> <int>
|
529
|
+
#> 1 9.6 48 5
|
530
|
+
```
|
531
|
+
|
532
|
+
Let's try it in galaaz:
|
533
|
+
|
534
|
+
|
535
|
+
```ruby
|
536
|
+
def my_summarise2(df, expr)
|
537
|
+
df.summarize(
|
538
|
+
mean: E.mean(expr),
|
539
|
+
sum: E.sum(expr),
|
540
|
+
n: E.n
|
541
|
+
)
|
542
|
+
end
|
543
|
+
|
544
|
+
puts my_summarise2((~:df), :a)
|
545
|
+
puts my_summarise2((~:df), :a * :b)
|
546
|
+
```
|
547
|
+
|
548
|
+
```
|
549
|
+
## mean sum n
|
550
|
+
## 1 3 15 5
|
551
|
+
## mean sum n
|
552
|
+
## 1 7.6 38 5
|
553
|
+
```
|
554
|
+
|
555
|
+
Once again, there is no need to use any special theory or functions. The only point to be
|
556
|
+
careful about is the use of 'E' to build an expression that uses the mean, sum and n.
|
557
|
+
|
558
|
+
# Different input and output variable
|
559
|
+
|
560
|
+
Now the next challenge presented by Hadley is to vary the name of the output variables based on
|
561
|
+
the received expression. So, if the input expression is 'a', we want our data frame columns to
|
562
|
+
be named 'mean\_a' and 'sum\_a'. Now, if the input expression is 'b', columns
|
563
|
+
should be named 'mean\_b' and 'sum\_b'.
|
564
|
+
|
565
|
+
```
|
566
|
+
mutate(df, mean_a = mean(a), sum_a = sum(a))
|
567
|
+
#> # A tibble: 5 x 6
|
568
|
+
#> g1 g2 a b mean_a sum_a
|
569
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
570
|
+
#> 1 1 1 1 3 3 15
|
571
|
+
#> 2 1 2 4 2 3 15
|
572
|
+
#> 3 2 1 2 1 3 15
|
573
|
+
#> 4 2 2 5 4 3 15
|
574
|
+
#> # … with 1 more row
|
575
|
+
|
576
|
+
mutate(df, mean_b = mean(b), sum_b = sum(b))
|
577
|
+
#> # A tibble: 5 x 6
|
578
|
+
#> g1 g2 a b mean_b sum_b
|
579
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
580
|
+
#> 1 1 1 1 3 3 15
|
581
|
+
#> 2 1 2 4 2 3 15
|
582
|
+
#> 3 2 1 2 1 3 15
|
583
|
+
#> 4 2 2 5 4 3 15
|
584
|
+
#> # … with 1 more row
|
585
|
+
```
|
586
|
+
|
587
|
+
Here is our Ruby code
|
588
|
+
|
589
|
+
|
590
|
+
```ruby
|
591
|
+
def my_mutate(df, expr)
|
592
|
+
mean_name = "mean_#{expr.to_s}"
|
593
|
+
sum_name = "sum_#{expr.to_s}"
|
594
|
+
|
595
|
+
df.mutate(mean_name => E.mean(expr),
|
596
|
+
sum_name => E.sum(expr))
|
597
|
+
end
|
598
|
+
|
599
|
+
puts my_mutate((~:df), :a)
|
600
|
+
puts my_mutate((~:df), :b)
|
601
|
+
```
|
602
|
+
|
603
|
+
```
|
604
|
+
## g1 g2 a b mean_a sum_a
|
605
|
+
## 1 1 1 5 2 3 15
|
606
|
+
## 2 1 2 1 5 3 15
|
607
|
+
## 3 2 1 2 4 3 15
|
608
|
+
## 4 2 2 3 1 3 15
|
609
|
+
## 5 2 1 4 3 3 15
|
610
|
+
## g1 g2 a b mean_b sum_b
|
611
|
+
## 1 1 1 5 2 3 15
|
612
|
+
## 2 1 2 1 5 3 15
|
613
|
+
## 3 2 1 2 4 3 15
|
614
|
+
## 4 2 2 3 1 3 15
|
615
|
+
## 5 2 1 4 3 3 15
|
616
|
+
```
|
617
|
+
It really seems that "Non Standard Evaluation" is actually quite standard in Galaaz! But, you
|
618
|
+
might have noticed a small change in the way the arguments to the mutate method were called.
|
619
|
+
In a previous example we used df.summarise(mean: E.mean(:a), ...) where the column name was
|
620
|
+
followed by a ':' colon. In this example, we have df.mutate(mean_name => E.mean(expr), ...)
|
621
|
+
and variable mean\_name is not followed by ':' but by '=>'. This is standard Ruby notation.
|
622
|
+
|
623
|
+
[explain....]
|
624
|
+
|
625
|
+
# Capturing multiple variables
|
626
|
+
|
627
|
+
|
628
|
+
```ruby
|
629
|
+
def my_summarise3(df, *group_vars)
|
630
|
+
df.group_by(*group_vars).
|
631
|
+
summarise(a: E.mean(:a))
|
632
|
+
end
|
633
|
+
|
634
|
+
puts my_summarise3((~:df), :g1, :g2).as__data__frame
|
635
|
+
```
|
636
|
+
|
637
|
+
```
|
638
|
+
## g1 g2 a
|
639
|
+
## 1 1 1 5
|
640
|
+
## 2 1 2 1
|
641
|
+
## 3 2 1 3
|
642
|
+
## 4 2 2 3
|
643
|
+
```
|
644
|
+
|
645
|
+
# Advanced dplyr features
|
646
|
+
https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/
|
647
|
+
|
648
|
+
|
649
|
+
```ruby
|
650
|
+
puts (~:starwars).head.as__data__frame
|
651
|
+
```
|
652
|
+
|
653
|
+
```
|
654
|
+
## name height mass hair_color skin_color eye_color birth_year
|
655
|
+
## 1 Luke Skywalker 172 77 blond fair blue 19.0
|
656
|
+
## 2 C-3PO 167 75 <NA> gold yellow 112.0
|
657
|
+
## 3 R2-D2 96 32 <NA> white, blue red 33.0
|
658
|
+
## 4 Darth Vader 202 136 none white yellow 41.9
|
659
|
+
## 5 Leia Organa 150 49 brown light brown 19.0
|
660
|
+
## 6 Owen Lars 178 120 brown, grey light blue 52.0
|
661
|
+
## gender homeworld species
|
662
|
+
## 1 male Tatooine Human
|
663
|
+
## 2 <NA> Tatooine Droid
|
664
|
+
## 3 <NA> Naboo Droid
|
665
|
+
## 4 male Tatooine Human
|
666
|
+
## 5 female Alderaan Human
|
667
|
+
## 6 male Tatooine Human
|
668
|
+
## films
|
669
|
+
## 1 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
670
|
+
## 2 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
|
671
|
+
## 3 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
672
|
+
## 4 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
|
673
|
+
## 5 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
674
|
+
## 6 Attack of the Clones, Revenge of the Sith, A New Hope
|
675
|
+
## vehicles starships
|
676
|
+
## 1 Snowspeeder, Imperial Speeder Bike X-wing, Imperial shuttle
|
677
|
+
## 2
|
678
|
+
## 3
|
679
|
+
## 4 TIE Advanced x1
|
680
|
+
## 5 Imperial Speeder Bike
|
681
|
+
## 6
|
682
|
+
```
|
683
|
+
|
684
|
+
|
685
|
+
```r
|
686
|
+
grouped_mean <- function(data, grouping_variables, value_variables) {
|
687
|
+
data %>%
|
688
|
+
group_by_at(grouping_variables) %>%
|
689
|
+
mutate(count = n()) %>%
|
690
|
+
summarise_at(c(value_variables, "count"), mean, na.rm = TRUE) %>%
|
691
|
+
rename_at(value_variables, funs(paste0("mean_", .)))
|
692
|
+
}
|
693
|
+
|
694
|
+
gm = starwars %>%
|
695
|
+
grouped_mean("eye_color", c("mass", "birth_year"))
|
696
|
+
|
697
|
+
as.data.frame(gm)
|
698
|
+
```
|
699
|
+
|
700
|
+
```
|
701
|
+
## eye_color mean_mass mean_birth_year count
|
702
|
+
## 1 black 76.28571 33.00000 10
|
703
|
+
## 2 blue 86.51667 67.06923 19
|
704
|
+
## 3 blue-gray 77.00000 57.00000 1
|
705
|
+
## 4 brown 66.09231 108.96429 21
|
706
|
+
## 5 dark NaN NaN 1
|
707
|
+
## 6 gold NaN NaN 1
|
708
|
+
## 7 green, yellow 159.00000 NaN 1
|
709
|
+
## 8 hazel 66.00000 34.50000 3
|
710
|
+
## 9 orange 282.33333 231.00000 8
|
711
|
+
## 10 pink NaN NaN 1
|
712
|
+
## 11 red 81.40000 33.66667 5
|
713
|
+
## 12 red, blue NaN NaN 1
|
714
|
+
## 13 unknown 31.50000 NaN 3
|
715
|
+
## 14 white 48.00000 NaN 1
|
716
|
+
## 15 yellow 81.11111 76.38000 11
|
717
|
+
```
|
718
|
+
|
719
|
+
|
720
|
+
```ruby
|
721
|
+
def grouped_mean(data, grouping_variables, value_variables)
|
722
|
+
data.
|
723
|
+
group_by_at(grouping_variables).
|
724
|
+
mutate(count: E.n).
|
725
|
+
summarise_at(E.c(value_variables, "count"), ~:mean, na__rm: true).
|
726
|
+
rename_at(value_variables, R.funs(E.paste0("mean_", value_variables)))
|
727
|
+
end
|
728
|
+
|
729
|
+
puts grouped_mean((~:starwars), "eye_color", R.c("mass", "birth_year")).as__data__frame
|
730
|
+
```
|
731
|
+
|
732
|
+
```
|
733
|
+
## eye_color mean_mass mean_birth_year count
|
734
|
+
## 1 black 76.28571 33.00000 10
|
735
|
+
## 2 blue 86.51667 67.06923 19
|
736
|
+
## 3 blue-gray 77.00000 57.00000 1
|
737
|
+
## 4 brown 66.09231 108.96429 21
|
738
|
+
## 5 dark NaN NaN 1
|
739
|
+
## 6 gold NaN NaN 1
|
740
|
+
## 7 green, yellow 159.00000 NaN 1
|
741
|
+
## 8 hazel 66.00000 34.50000 3
|
742
|
+
## 9 orange 282.33333 231.00000 8
|
743
|
+
## 10 pink NaN NaN 1
|
744
|
+
## 11 red 81.40000 33.66667 5
|
745
|
+
## 12 red, blue NaN NaN 1
|
746
|
+
## 13 unknown 31.50000 NaN 3
|
747
|
+
## 14 white 48.00000 NaN 1
|
748
|
+
## 15 yellow 81.11111 76.38000 11
|
136
749
|
```
|