galaaz 0.4.9 → 0.4.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +798 -285
- data/blogs/galaaz_ggplot/galaaz_ggplot.Rmd +3 -12
- data/blogs/galaaz_ggplot/galaaz_ggplot.aux +5 -7
- data/blogs/galaaz_ggplot/galaaz_ggplot.html +69 -29
- data/blogs/galaaz_ggplot/galaaz_ggplot.pdf +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/midwest_rb.png +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/scatter_plot_rb.png +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/midwest_rb.pdf +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/scatter_plot_rb.pdf +0 -0
- data/blogs/galaaz_ggplot/midwest.Rmd +1 -9
- data/blogs/gknit/gknit.Rmd +37 -40
- data/blogs/gknit/gknit.html +32 -30
- data/blogs/gknit/gknit.md +36 -37
- data/blogs/gknit/gknit.pdf +0 -0
- data/blogs/gknit/gknit.tex +35 -37
- data/blogs/manual/manual.Rmd +548 -125
- data/blogs/manual/manual.html +509 -286
- data/blogs/manual/manual.md +798 -285
- data/blogs/manual/manual.pdf +0 -0
- data/blogs/manual/manual.tex +2816 -0
- data/blogs/manual/manual_files/figure-latex/diverging_bar.pdf +0 -0
- data/blogs/nse_dplyr/nse_dplyr.Rmd +240 -74
- data/blogs/nse_dplyr/nse_dplyr.html +191 -87
- data/blogs/nse_dplyr/nse_dplyr.md +361 -107
- data/blogs/nse_dplyr/nse_dplyr.pdf +0 -0
- data/blogs/nse_dplyr/nse_dplyr.tex +1373 -0
- data/blogs/ruby_plot/ruby_plot.Rmd +61 -81
- data/blogs/ruby_plot/ruby_plot.html +54 -57
- data/blogs/ruby_plot/ruby_plot.md +48 -67
- data/blogs/ruby_plot/ruby_plot.pdf +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/dose_len.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_delivery.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_dose.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color2.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_decorations.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_points.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_box_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_violin_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/violin_with_jitter.png +0 -0
- data/lib/R_interface/rdata_frame.rb +0 -12
- data/lib/R_interface/robject.rb +14 -14
- data/lib/R_interface/ruby_extensions.rb +3 -31
- data/lib/R_interface/rvector.rb +0 -12
- data/lib/gknit/knitr_engine.rb +5 -3
- data/lib/util/exec_ruby.rb +22 -61
- data/specs/tmp.rb +26 -12
- data/version.rb +1 -1
- metadata +22 -17
- data/bin/gknit_old_r +0 -236
- data/blogs/dev/dev.Rmd +0 -23
- data/blogs/dev/dev.md +0 -58
- data/blogs/dev/dev2.Rmd +0 -65
- data/blogs/dev/model.rb +0 -41
- data/blogs/dplyr/dplyr.Rmd +0 -29
- data/blogs/dplyr/dplyr.html +0 -433
- data/blogs/dplyr/dplyr.md +0 -58
- data/blogs/dplyr/dplyr.rb +0 -63
- data/blogs/galaaz_ggplot/galaaz_ggplot.log +0 -640
- data/blogs/galaaz_ggplot/galaaz_ggplot.md +0 -431
- data/blogs/galaaz_ggplot/galaaz_ggplot.tex +0 -481
- data/blogs/galaaz_ggplot/midwest.png +0 -0
- data/blogs/galaaz_ggplot/scatter_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot.tex +0 -1077
data/blogs/manual/manual.Rmd
CHANGED
@@ -5,11 +5,6 @@ author: "Rodrigo Botafogo"
|
|
5
5
|
tags: [Galaaz, Ruby, R, TruffleRuby, FastR, GraalVM, ggplot2]
|
6
6
|
date: "2019"
|
7
7
|
output:
|
8
|
-
html_document:
|
9
|
-
self_contained: true
|
10
|
-
keep_md: true
|
11
|
-
md_document:
|
12
|
-
variant: markdown_github
|
13
8
|
pdf_document:
|
14
9
|
includes:
|
15
10
|
in_header: "../../sty/galaaz.sty"
|
@@ -17,6 +12,11 @@ output:
|
|
17
12
|
number_sections: yes
|
18
13
|
toc: true
|
19
14
|
toc_depth: 2
|
15
|
+
html_document:
|
16
|
+
self_contained: true
|
17
|
+
keep_md: true
|
18
|
+
md_document:
|
19
|
+
variant: markdown_github
|
20
20
|
fontsize: 11pt
|
21
21
|
---
|
22
22
|
|
@@ -95,15 +95,13 @@ Panda, SciPy, SciKit-Learn and a couple more.
|
|
95
95
|
# gKnitting a Document
|
96
96
|
|
97
97
|
This manual has been formatted using gKnit. gKnit uses Knitr and R markdown to knit
|
98
|
-
a document in Ruby or R and output it in any of the available formats for R markdown.
|
98
|
+
a document in Ruby or R and output it in any of the available formats for R markdown.
|
99
99
|
gKnit runs atop GraalVM and Galaaz. In gKnit, Ruby variables are persisted between
|
100
|
-
chunks, making it an ideal solution for literate programming.
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
gknit was describe in more depth in:
|
100
|
+
chunks, making it an ideal solution for literate programming. Also, since it is based
|
101
|
+
on Galaaz, Ruby chunks can have access to R variables and Polyglot Programming with
|
102
|
+
Ruby and R is quite natural.
|
105
103
|
|
106
|
-
|
104
|
+
[gKnit is described in more detail here](https://towardsdatascience.com/how-to-do-reproducible-research-in-ruby-with-gknit-c26d2684d64e)
|
107
105
|
|
108
106
|
# Vector
|
109
107
|
|
@@ -130,11 +128,11 @@ vector is often referred to as a character string.
|
|
130
128
|
To create a vector the 'c' (concatenate) method from the 'R' module should be used:
|
131
129
|
|
132
130
|
```{ruby integer}
|
133
|
-
|
134
|
-
puts
|
131
|
+
vec = R.c(1, 2, 3)
|
132
|
+
puts vec
|
135
133
|
```
|
136
134
|
|
137
|
-
Lets take a look at the type, mode and storage.mode of our vector
|
135
|
+
Let's take a look at the type, mode and storage.mode of our vector vec. In order to print
|
138
136
|
this out, we are creating a data frame 'df' and printing it out. A data frame, for those
|
139
137
|
not familiar with it, is basically a table. Here we create the data frame and add the
|
140
138
|
column name by passing named parameters for each column, such as 'typeof:', 'mode:' and
|
@@ -145,7 +143,7 @@ Data frames will later be more carefully described. In R, the method used to cr
|
|
145
143
|
data frame is 'data.frame', in Galaaz we use 'data\_\_frame'.
|
146
144
|
|
147
145
|
```{ruby typeof_integer}
|
148
|
-
df = R.data__frame(typeof:
|
146
|
+
df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
|
149
147
|
puts df
|
150
148
|
```
|
151
149
|
|
@@ -155,12 +153,12 @@ like '1' is converted to float and to have an integer the R developer will use '
|
|
155
153
|
follows normal Ruby rules and the number 1 is an integer and 1.0 is a float.
|
156
154
|
|
157
155
|
```{ruby float}
|
158
|
-
|
159
|
-
puts
|
156
|
+
vec = R.c(1.0, 2, 3)
|
157
|
+
puts vec
|
160
158
|
```
|
161
159
|
|
162
160
|
```{ruby typeof_float}
|
163
|
-
df = R.data__frame(typeof:
|
161
|
+
df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
|
164
162
|
outputs df.kable.kable_styling
|
165
163
|
```
|
166
164
|
|
@@ -176,8 +174,8 @@ vec = R.c(1, hello, 5)
|
|
176
174
|
Here is a vector with logical values
|
177
175
|
|
178
176
|
```{ruby logical_vector}
|
179
|
-
|
180
|
-
puts
|
177
|
+
vec = R.c(true, true, false, false, true)
|
178
|
+
puts vec
|
181
179
|
```
|
182
180
|
|
183
181
|
## Combining Vectors
|
@@ -185,21 +183,21 @@ puts @vec
|
|
185
183
|
The 'c' function used to create vectors can also be used to combine two vectors:
|
186
184
|
|
187
185
|
```{ruby combining_vectors}
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
puts
|
186
|
+
vec1 = R.c(10.0, 20.0, 30.0)
|
187
|
+
vec2 = R.c(4.0, 5.0, 6.0)
|
188
|
+
vec = R.c(vec1, vec2)
|
189
|
+
puts vec
|
192
190
|
```
|
193
191
|
In Galaaz, methods can be chained (somewhat like the pipe operator in R %>%, but more generic).
|
194
|
-
In this next example, method 'c' is chainned after '
|
192
|
+
In this next example, method 'c' is chained after 'vec1'. This also looks like 'c' is a
|
195
193
|
method of the vector, but in reality, this is actually closer to the pipe operator. When
|
196
194
|
Galaaz identifies that 'c' is not a method of 'vec' it actually tries to call 'R.c' with
|
197
|
-
'
|
195
|
+
'vec1' as the first argument concatenated with all the other available arguments. The code
|
198
196
|
below is automatically converted to the code above.
|
199
197
|
|
200
198
|
```{ruby chainning_methods}
|
201
|
-
|
202
|
-
puts
|
199
|
+
vec = vec1.c(vec2)
|
200
|
+
puts vec
|
203
201
|
```
|
204
202
|
|
205
203
|
## Vector Arithmetic
|
@@ -207,18 +205,18 @@ puts @vec
|
|
207
205
|
Arithmetic operations on vectors are performed element by element:
|
208
206
|
|
209
207
|
```{ruby vec_arith1}
|
210
|
-
puts
|
208
|
+
puts vec1 + vec2
|
211
209
|
```
|
212
210
|
|
213
211
|
```{ruby mult}
|
214
|
-
puts
|
212
|
+
puts vec1 * 5
|
215
213
|
```
|
216
214
|
|
217
215
|
When vectors have different length, a recycling rule is applied to the shorter vector:
|
218
216
|
|
219
217
|
```{ruby recycle}
|
220
|
-
|
221
|
-
puts
|
218
|
+
vec3 = R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
|
219
|
+
puts vec4 = vec1 + vec3
|
222
220
|
```
|
223
221
|
|
224
222
|
## Vector Indexing
|
@@ -226,40 +224,40 @@ puts @vec4 = @vec1 + @vec3
|
|
226
224
|
Vectors can be indexed by using the '[]' operator:
|
227
225
|
|
228
226
|
```{ruby index}
|
229
|
-
puts
|
227
|
+
puts vec4[3]
|
230
228
|
```
|
231
229
|
|
232
230
|
We can also index a vector with another vector. For example, in the code below, we take elements
|
233
|
-
1, 3, 5, and 7 from
|
231
|
+
1, 3, 5, and 7 from vec4:
|
234
232
|
|
235
233
|
```{ruby index_by_vector}
|
236
|
-
puts
|
234
|
+
puts vec4[R.c(1, 3, 5, 7)]
|
237
235
|
```
|
238
236
|
|
239
237
|
Repeating an index and having indices out of order is valid code:
|
240
238
|
|
241
239
|
```{ruby repeated_index}
|
242
|
-
puts
|
240
|
+
puts vec4[R.c(1, 3, 3, 1)]
|
243
241
|
```
|
244
242
|
|
245
243
|
It is also possible to index a vector with a negative number or negative vector. In these cases
|
246
244
|
the indexed values are not returned:
|
247
245
|
|
248
246
|
```{ruby neg_index}
|
249
|
-
puts
|
250
|
-
puts
|
247
|
+
puts vec4[-3]
|
248
|
+
puts vec4[-R.c(1, 3, 5, 7)]
|
251
249
|
```
|
252
250
|
|
253
251
|
If an index is out of range, a missing value (NA) will be reported.
|
254
252
|
|
255
253
|
```{ruby out_of_range}
|
256
|
-
puts
|
254
|
+
puts vec4[30]
|
257
255
|
```
|
258
256
|
|
259
257
|
It is also possible to index a vector by range:
|
260
258
|
|
261
259
|
```{ruby range}
|
262
|
-
puts
|
260
|
+
puts vec4[(2..5)]
|
263
261
|
```
|
264
262
|
|
265
263
|
Elements in a vector can be named using the 'names' attribute of a vector:
|
@@ -285,9 +283,9 @@ R::Vector with other ruby classes it might be necessary to extract the actual Ru
|
|
285
283
|
from the vector. In order to do this extraction the '>>' operator is used.
|
286
284
|
|
287
285
|
```{ruby ruby_native}
|
288
|
-
puts
|
289
|
-
puts
|
290
|
-
puts
|
286
|
+
puts vec4
|
287
|
+
puts vec4 >> 0
|
288
|
+
puts vec4 >> 4
|
291
289
|
```
|
292
290
|
|
293
291
|
Note that indexing with '>>' starts at 0 and not at 1, also, we cannot do negative indexing.
|
@@ -310,22 +308,22 @@ A matrix is a collection of elements organized as a two dimensional table. A ma
|
|
310
308
|
created by the 'matrix' function:
|
311
309
|
|
312
310
|
```{ruby matrix}
|
313
|
-
|
314
|
-
|
315
|
-
|
311
|
+
mat = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
|
312
|
+
nrow: 3,
|
313
|
+
ncol: 3)
|
316
314
|
|
317
|
-
puts
|
315
|
+
puts mat
|
318
316
|
```
|
319
317
|
Note that matrix data is organized by column first. It is possible to organize the matrix
|
320
318
|
memory by row first passing an extra argument to the 'matrix' function:
|
321
319
|
|
322
320
|
```{ruby matrix_rowfirst}
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
321
|
+
mat_row = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
|
322
|
+
nrow: 3,
|
323
|
+
ncol: 3,
|
324
|
+
byrow: true)
|
327
325
|
|
328
|
-
puts
|
326
|
+
puts mat_row
|
329
327
|
```
|
330
328
|
|
331
329
|
## Indexing a Matrix
|
@@ -333,28 +331,33 @@ puts @mat_row
|
|
333
331
|
A matrix can be indexed by [row, column]:
|
334
332
|
|
335
333
|
```{ruby matrix_index}
|
336
|
-
puts
|
337
|
-
puts
|
334
|
+
puts mat_row[1, 1]
|
335
|
+
puts mat_row[2, 3]
|
338
336
|
```
|
339
337
|
It is possible to index an entire row or column with the ':all' keyword
|
340
338
|
|
341
339
|
```{ruby matrix_index_all}
|
342
|
-
puts
|
343
|
-
puts
|
340
|
+
puts mat_row[1, :all]
|
341
|
+
puts mat_row[:all, 2]
|
344
342
|
```
|
345
343
|
|
346
344
|
Indexing with a vector is also possible for matrices. In the following example we want
|
347
345
|
rows 1 and 3 and columns 2 and 3 building a 2 x 2 matrix.
|
348
346
|
|
349
347
|
```{ruby matrix_index_vector}
|
350
|
-
puts
|
348
|
+
puts mat_row[R.c(1, 3), R.c(2, 3)]
|
349
|
+
```
|
350
|
+
|
351
|
+
Matrices can be combined with functions 'rbind':
|
352
|
+
|
353
|
+
```{ruby matrix_combine_rbind}
|
354
|
+
puts mat_row.rbind(mat)
|
351
355
|
```
|
352
356
|
|
353
|
-
|
357
|
+
and 'cbind':
|
354
358
|
|
355
|
-
```{ruby
|
356
|
-
puts
|
357
|
-
puts @mat_row.cbind(@mat)
|
359
|
+
```{ruby matrix_combine_cbind}
|
360
|
+
puts mat_row.cbind(mat)
|
358
361
|
```
|
359
362
|
|
360
363
|
# List
|
@@ -366,11 +369,11 @@ can only hold one type of element.
|
|
366
369
|
nums = R.c(1.0, 2.0, 3.0)
|
367
370
|
strs = R.c("a", "b", "c", "d")
|
368
371
|
bool = R.c(true, true, false)
|
369
|
-
|
370
|
-
puts
|
372
|
+
lst = R.list(nums: nums, strs: strs, bool: bool)
|
373
|
+
puts lst
|
371
374
|
```
|
372
375
|
|
373
|
-
Note that '
|
376
|
+
Note that 'lst' elements are named elements.
|
374
377
|
|
375
378
|
|
376
379
|
## List Indexing
|
@@ -380,7 +383,7 @@ first start with the '[]' operator. The list above has three sublist indexing wi
|
|
380
383
|
return one of the sublists.
|
381
384
|
|
382
385
|
```{ruby list_indexing}
|
383
|
-
puts
|
386
|
+
puts lst[1]
|
384
387
|
```
|
385
388
|
|
386
389
|
Note that when using '[]' a new list is returned. When using the double square bracket operator
|
@@ -389,13 +392,13 @@ the original list
|
|
389
392
|
|
390
393
|
|
391
394
|
```{ruby list_indexing_single}
|
392
|
-
puts
|
395
|
+
puts lst[[1]]
|
393
396
|
```
|
394
397
|
|
395
|
-
When elements are named, as dones with
|
398
|
+
When elements are named, as done with lst, indexing can be done by name:
|
396
399
|
|
397
400
|
```{ruby list_indexing_by_name}
|
398
|
-
puts
|
401
|
+
puts lst[['bool']][[1]] >> 0
|
399
402
|
```
|
400
403
|
|
401
404
|
In this example, first the 'bool' element of the list was extracted, not as a list, but as a vector,
|
@@ -458,14 +461,18 @@ puts (~:mtcars)[R.c('Datsun 710', 'Camaro Z28'), :all]
|
|
458
461
|
Finally, a data frame can also be indexed with a logical vector. In this next example, the
|
459
462
|
'am' column of :mtcars is compared with 0 (with method 'eq'). When 'am' is equal to 0 the
|
460
463
|
car is automatic. So, by doing '(~:mtcars).am.eq 0' a logical vector is created with
|
461
|
-
'true' whenever 'am' is 0 and 'false' otherwise.
|
462
|
-
is indexed, returning a new data frame in which all cars have automatic transmission.
|
464
|
+
'true' whenever 'am' is 0 and 'false' otherwise.
|
463
465
|
|
464
|
-
```{ruby
|
466
|
+
```{ruby logical_vector_filter}
|
465
467
|
# obtain a vector with 'true' for cars with automatic transmission
|
466
468
|
automatic = (~:mtcars).am.eq 0
|
467
469
|
puts automatic
|
470
|
+
```
|
468
471
|
|
472
|
+
Using this logical vector, the data frame is indexed, returning a new data frame in
|
473
|
+
which all cars have automatic transmission.
|
474
|
+
|
475
|
+
```{ruby dataframe_logical}
|
469
476
|
# slice the data frame by using this vector
|
470
477
|
puts (~:mtcars)[automatic, :all]
|
471
478
|
```
|
@@ -547,6 +554,42 @@ exp7 = :y.assign E.sin(:x)
|
|
547
554
|
puts exp7
|
548
555
|
```
|
549
556
|
|
557
|
+
Expressions can also be written using '.' notation:
|
558
|
+
|
559
|
+
```{ruby expression_with_dot}
|
560
|
+
exp8 = :y.assign :x.sin
|
561
|
+
puts exp8
|
562
|
+
```
|
563
|
+
|
564
|
+
When a function has multiple arguments, the first one can be used before the '.':
|
565
|
+
|
566
|
+
```{ruby expression_multiple_args}
|
567
|
+
exp9 = :x.c(:y)
|
568
|
+
puts exp9
|
569
|
+
```
|
570
|
+
|
571
|
+
## Evaluating an Expression
|
572
|
+
|
573
|
+
Expressions can be evaluated by calling function 'eval' with a binding. A binding can be provided
|
574
|
+
with a list:
|
575
|
+
|
576
|
+
```{ruby eval_expression_list}
|
577
|
+
exp = (:a + :b) * 2.0 + :c ** 2 / :z
|
578
|
+
puts exp.eval(R.list(a: 10, b: 20, c: 30, z: 40))
|
579
|
+
```
|
580
|
+
|
581
|
+
... with a data frame:
|
582
|
+
|
583
|
+
```{ruby eval_expression_df}
|
584
|
+
df = R.data__frame(
|
585
|
+
a: R.c(1, 2, 3),
|
586
|
+
b: R.c(10, 20, 30),
|
587
|
+
c: R.c(100, 200, 300),
|
588
|
+
z: R.c(1000, 2000, 3000))
|
589
|
+
|
590
|
+
puts exp.eval(df)
|
591
|
+
```
|
592
|
+
|
550
593
|
# Manipulating Data
|
551
594
|
|
552
595
|
One of the major benefits of Galaaz is to bring strong data manipulation to Ruby. The following
|
@@ -568,8 +611,8 @@ R.library('dplyr')
|
|
568
611
|
```
|
569
612
|
|
570
613
|
```{ruby flights}
|
571
|
-
|
572
|
-
puts
|
614
|
+
flights = ~:flights
|
615
|
+
puts flights.head.as__data__frame
|
573
616
|
```
|
574
617
|
|
575
618
|
## Filtering rows with Filter
|
@@ -578,7 +621,7 @@ In this example we filter the flights data set by giving to the filter function
|
|
578
621
|
the first :month.eq 1
|
579
622
|
|
580
623
|
```{ruby filter_rows}
|
581
|
-
puts
|
624
|
+
puts flights.filter((:month.eq 1), (:day.eq 1)).head.as__data__frame
|
582
625
|
```
|
583
626
|
|
584
627
|
## Logical Operators
|
@@ -586,7 +629,7 @@ puts @flights.filter((:month.eq 1), (:day.eq 1)).head.as__data__frame
|
|
586
629
|
All flights that departed in November or December
|
587
630
|
|
588
631
|
```{ruby nov_dec}
|
589
|
-
puts
|
632
|
+
puts flights.filter((:month.eq 11) | (:month.eq 12)).head.as__data__frame
|
590
633
|
```
|
591
634
|
|
592
635
|
The same as above, but using the 'in' operator. In R, it is possible to define many operators
|
@@ -595,7 +638,7 @@ operators from Galaaz the '._' method is used, where the first argument is the o
|
|
595
638
|
symbol, in this case ':in' and the second argument is the vector:
|
596
639
|
|
597
640
|
```{ruby in_op}
|
598
|
-
puts
|
641
|
+
puts flights.filter(:month._ :in, R.c(11, 12)).head.as__data__frame
|
599
642
|
```
|
600
643
|
|
601
644
|
## Filtering with NA (Not Available)
|
@@ -606,21 +649,21 @@ the values and the result of some subsetting operations that are more consistent
|
|
606
649
|
what is obtained from data frame.
|
607
650
|
|
608
651
|
```{ruby na_tibble}
|
609
|
-
|
610
|
-
puts
|
652
|
+
df = R.tibble(x: R.c(1, R::NA, 3))
|
653
|
+
puts df.as__data__frame
|
611
654
|
```
|
612
655
|
|
613
656
|
Now filtering by :x > 1 shows all lines that satisfy this condition, where the row with R::NA does
|
614
657
|
not.
|
615
658
|
|
616
659
|
```{ruby filter_na}
|
617
|
-
puts
|
660
|
+
puts df.filter(:x > 1).as__data__frame
|
618
661
|
```
|
619
662
|
|
620
663
|
To match an NA use method 'is__na'
|
621
664
|
|
622
665
|
```{ruby with_na}
|
623
|
-
puts
|
666
|
+
puts df.filter((:x.is__na) | (:x > 1)).as__data__frame
|
624
667
|
```
|
625
668
|
|
626
669
|
## Arrange Rows with arrange
|
@@ -628,13 +671,13 @@ puts @df.filter((:x.is__na) | (:x > 1)).as__data__frame
|
|
628
671
|
Arrange reorders the rows of a data frame by the given arguments.
|
629
672
|
|
630
673
|
```{ruby arrange}
|
631
|
-
puts
|
674
|
+
puts flights.arrange(:year, :month, :day).head.as__data__frame
|
632
675
|
```
|
633
676
|
|
634
677
|
To arrange in descending order, use function 'desc'
|
635
678
|
|
636
679
|
```{ruby desc_arrange}
|
637
|
-
puts
|
680
|
+
puts flights.arrange(:dep_delay.desc).head.as__data__frame
|
638
681
|
```
|
639
682
|
|
640
683
|
## Selecting columns
|
@@ -642,19 +685,19 @@ puts @flights.arrange(:dep_delay.desc).head.as__data__frame
|
|
642
685
|
To select specific columns from a dataset we use function 'select':
|
643
686
|
|
644
687
|
```{ruby select}
|
645
|
-
puts
|
688
|
+
puts flights.select(:year, :month, :day).head.as__data__frame
|
646
689
|
```
|
647
690
|
|
648
691
|
It is also possible to select column in a given range
|
649
692
|
|
650
693
|
```{ruby select_range}
|
651
|
-
puts
|
694
|
+
puts flights.select(:year.up_to :day).head.as__data__frame
|
652
695
|
```
|
653
696
|
|
654
697
|
Select all columns that start with a given name sequence
|
655
698
|
|
656
699
|
```{ruby select_starts_with}
|
657
|
-
puts
|
700
|
+
puts flights.select(E.starts_with('arr')).head.as__data__frame
|
658
701
|
```
|
659
702
|
|
660
703
|
Other functions that can be used:
|
@@ -671,26 +714,26 @@ Other functions that can be used:
|
|
671
714
|
A helper function that comes in handy when we just want to rearrange column order is 'Everything':
|
672
715
|
|
673
716
|
```{ruby everything}
|
674
|
-
puts
|
717
|
+
puts flights.select(:year, :month, :day, E.everything).head.as__data__frame
|
675
718
|
```
|
676
719
|
|
677
720
|
## Add variables to a dataframe with 'mutate'
|
678
721
|
|
679
722
|
```{ruby small_flights}
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
723
|
+
flights_sm = flights.
|
724
|
+
select((:year.up_to :day),
|
725
|
+
E.ends_with('delay'),
|
726
|
+
:distance,
|
727
|
+
:air_time)
|
685
728
|
|
686
|
-
puts
|
729
|
+
puts flights_sm.head.as__data__frame
|
687
730
|
```
|
688
731
|
|
689
732
|
```{ruby mutate}
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
puts
|
733
|
+
flights_sm = flights_sm.
|
734
|
+
mutate(gain: :dep_delay - :arr_delay,
|
735
|
+
speed: :distance / :air_time * 60)
|
736
|
+
puts flights_sm.head.as__data__frame
|
694
737
|
```
|
695
738
|
|
696
739
|
## Summarising data
|
@@ -699,20 +742,20 @@ Function 'summarise' calculates summaries for the data frame. When no 'group_by'
|
|
699
742
|
a single value is obtained from the data frame:
|
700
743
|
|
701
744
|
```{ruby summarise}
|
702
|
-
puts
|
745
|
+
puts flights.summarise(delay: E.mean(:dep_delay, na__rm: true)).as__data__frame
|
703
746
|
```
|
704
747
|
|
705
748
|
When a data frame is grouped with 'group_by', summaries apply to the given group:
|
706
749
|
|
707
750
|
```{ruby summarise_group_by}
|
708
|
-
by_day =
|
751
|
+
by_day = flights.group_by(:year, :month, :day)
|
709
752
|
puts by_day.summarise(delay: :dep_delay.mean(na__rm: true)).head.as__data__frame
|
710
753
|
```
|
711
754
|
|
712
755
|
Next we put many operations together by piping them one after the other:
|
713
756
|
|
714
757
|
```{ruby pipping}
|
715
|
-
delays =
|
758
|
+
delays = flights.
|
716
759
|
group_by(:dest).
|
717
760
|
summarise(
|
718
761
|
count: E.n,
|
@@ -720,7 +763,7 @@ delays = @flights.
|
|
720
763
|
delay: :arr_delay.mean(na__rm: true)).
|
721
764
|
filter(:count > 20, :dest != "NHL")
|
722
765
|
|
723
|
-
puts delays.as__data__frame
|
766
|
+
puts delays.as__data__frame.head
|
724
767
|
```
|
725
768
|
|
726
769
|
# Using Data Table
|
@@ -730,9 +773,9 @@ R.library('data.table')
|
|
730
773
|
R.install_and_loads('curl')
|
731
774
|
|
732
775
|
input = "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
|
733
|
-
|
734
|
-
puts
|
735
|
-
puts
|
776
|
+
flights = R.fread(input)
|
777
|
+
puts flights
|
778
|
+
puts flights.dim
|
736
779
|
```
|
737
780
|
|
738
781
|
```{ruby data_table}
|
@@ -750,17 +793,17 @@ puts data_table.ID
|
|
750
793
|
|
751
794
|
```{ruby subset_i}
|
752
795
|
# subset rows in i
|
753
|
-
ans =
|
796
|
+
ans = flights[(:origin.eq "JFK") & (:month.eq 6)]
|
754
797
|
puts ans.head
|
755
798
|
|
756
799
|
# Get the first two rows from flights.
|
757
800
|
|
758
|
-
ans =
|
801
|
+
ans = flights[(1..2)]
|
759
802
|
puts ans
|
760
803
|
|
761
804
|
# Sort flights first by column origin in ascending order, and then by dest in descending order:
|
762
805
|
|
763
|
-
# ans =
|
806
|
+
# ans = flights[E.order(:origin, -(:dest))]
|
764
807
|
# puts ans.head
|
765
808
|
|
766
809
|
```
|
@@ -769,15 +812,15 @@ puts ans
|
|
769
812
|
# Select column(s) in j
|
770
813
|
# select arr_delay column, but return it as a vector.
|
771
814
|
|
772
|
-
ans =
|
815
|
+
ans = flights[:all, :arr_delay]
|
773
816
|
puts ans.head
|
774
817
|
|
775
818
|
# Select arr_delay column, but return as a data.table instead.
|
776
819
|
|
777
|
-
ans =
|
820
|
+
ans = flights[:all, :arr_delay.list]
|
778
821
|
puts ans.head
|
779
822
|
|
780
|
-
ans =
|
823
|
+
ans = flights[:all, E.list(:arr_delay, :dep_delay)]
|
781
824
|
```
|
782
825
|
|
783
826
|
# Graphics in Galaaz
|
@@ -790,32 +833,32 @@ the data frame with the necessary data:
|
|
790
833
|
|
791
834
|
```{ruby diverging_plot_pre}
|
792
835
|
# copy the R variable :mtcars to the Ruby mtcars variable
|
793
|
-
|
836
|
+
mtcars = ~:mtcars
|
794
837
|
|
795
838
|
# create a new column 'car_name' to store the car names so that it can be
|
796
839
|
# used for plotting. The 'rownames' of the data frame cannot be used as
|
797
840
|
# data for plotting
|
798
|
-
|
841
|
+
mtcars.car_name = R.rownames(:mtcars)
|
799
842
|
|
800
843
|
# compute normalized mpg and add it to a new column called mpg_z
|
801
844
|
# Note that the mean value for mpg can be obtained by calling the 'mean'
|
802
845
|
# function on the vector 'mtcars.mpg'. The same with the standard
|
803
846
|
# deviation 'sd'. The vector is then rounded to two digits with 'round 2'
|
804
|
-
|
847
|
+
mtcars.mpg_z = ((mtcars.mpg - mtcars.mpg.mean)/mtcars.mpg.sd).round 2
|
805
848
|
|
806
849
|
# create a new column 'mpg_type'. Function 'ifelse' is a vectorized function
|
807
850
|
# that looks at every element of the mpg_z vector and if the value is below
|
808
851
|
# 0, returns 'below', otherwise returns 'above'
|
809
|
-
|
852
|
+
mtcars.mpg_type = (mtcars.mpg_z < 0).ifelse("below", "above")
|
810
853
|
|
811
854
|
# order the mtcars data set by the mpg_z vector from smaller to larger values
|
812
|
-
|
855
|
+
mtcars = mtcars[mtcars.mpg_z.order, :all]
|
813
856
|
|
814
857
|
# convert the car_name column to a factor to retain sorted order in plot
|
815
|
-
|
858
|
+
mtcars.car_name = mtcars.car_name.factor levels: mtcars.car_name
|
816
859
|
|
817
860
|
# let's look at the final data frame
|
818
|
-
puts
|
861
|
+
puts mtcars.head
|
819
862
|
```
|
820
863
|
Now, let's plot the diverging bar plot. When using gKnit, there is no need to call
|
821
864
|
'R.awt' to create a plotting device, since gKnit does take care of it. Galaaz
|
@@ -836,14 +879,394 @@ but in this graph we want the bars to be horizontally layed so we add 'coord\_fl
|
|
836
879
|
```{ruby diverging_bar, fig.width = 9.1, fig.height = 6.5}
|
837
880
|
require 'ggplot'
|
838
881
|
|
839
|
-
puts
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
882
|
+
puts mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
|
883
|
+
R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
|
884
|
+
R.scale_fill_manual(name: 'Mileage',
|
885
|
+
labels: R.c('Above Average', 'Below Average'),
|
886
|
+
values: R.c('above': '#00ba38', 'below': '#f8766d')) +
|
887
|
+
R.labs(subtitle: "Normalised mileage from 'mtcars'",
|
888
|
+
title: "Diverging Bars") +
|
889
|
+
R.coord_flip
|
890
|
+
```
|
891
|
+
|
892
|
+
# Coding with Tidyverse
|
893
|
+
|
894
|
+
In R, and when coding with 'tidyverse', arguments to a function are usually not
|
895
|
+
*referentially transparent*. That is, you can’t replace a value with a seemingly equivalent
|
896
|
+
object that you’ve defined elsewhere. To see the problem, let's first define a data frame:
|
897
|
+
|
898
|
+
```{ruby df}
|
899
|
+
df = R.data__frame(x: (1..3), y: (3..1))
|
900
|
+
puts df
|
901
|
+
```
|
902
|
+
|
903
|
+
and now, let's look at this code:
|
904
|
+
|
905
|
+
```{r not_transp, eval=FALSE}
|
906
|
+
my_var <- x
|
907
|
+
filter(df, my_var == 1)
|
908
|
+
```
|
909
|
+
It generates the following error: "object 'x' not found".
|
910
|
+
|
911
|
+
However, in Galaaz, arguments are referentially transparent as can be seen by the
|
912
|
+
code below. Note initially that 'my_var = :x' will not give the error "object 'x' not found"
|
913
|
+
since ':x' is treated as an expression and assigned to my\_var. Then when doing (my\_var.eq 1),
|
914
|
+
my\_var is a variable that resolves to ':x' and it becomes equivalent to (:x.eq 1) which is
|
915
|
+
what we want.
|
916
|
+
|
917
|
+
```{ruby my_var}
|
918
|
+
my_var = :x
|
919
|
+
puts df.filter(my_var.eq 1)
|
920
|
+
```
|
921
|
+
As stated by Hadley Wickham:
|
922
|
+
|
923
|
+
> dplyr code is ambiguous. Depending on what variables are defined where,
|
924
|
+
> filter(df, x == y) could be equivalent to any of:
|
925
|
+
|
926
|
+
```
|
927
|
+
df[df$x == df$y, ]
|
928
|
+
df[df$x == y, ]
|
929
|
+
df[x == df$y, ]
|
930
|
+
df[x == y, ]
|
931
|
+
```
|
932
|
+
In galaaz this ambiguity does not exist, filter(df, x.eq y) is not a valid expression as
|
933
|
+
expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements
|
934
|
+
of the 'x' column that are equal to a previously defined y variable. Finally in
|
935
|
+
filter(df, :x.eq :y) we are looking for elements in which the 'x' column value is equal to
|
936
|
+
the 'y' column value. This can be seen in the following two chunks of code:
|
937
|
+
|
938
|
+
```{ruby disamb1}
|
939
|
+
y = 1
|
940
|
+
x = 2
|
941
|
+
|
942
|
+
# looking for values where the 'x' column is equal to the 'y' column
|
943
|
+
puts df.filter(:x.eq :y)
|
944
|
+
```
|
945
|
+
|
946
|
+
```{ruby disamb2}
|
947
|
+
# looking for values where the 'x' column is equal to the 'y' variable
|
948
|
+
# in this case, the number 1
|
949
|
+
puts df.filter(:x.eq y)
|
950
|
+
```
|
951
|
+
## Writing a function that applies to different data sets
|
952
|
+
|
953
|
+
Let's suppose that we want to write a function that receives as the first argument a data frame
|
954
|
+
and as second argument an expression that adds a column to the data frame that is equal to the
|
955
|
+
sum of elements in column 'a' plus 'x'.
|
956
|
+
|
957
|
+
Here is the intended behaviour using the 'mutate' function of 'dplyr':
|
958
|
+
|
959
|
+
```
|
960
|
+
mutate(df1, y = a + x)
|
961
|
+
mutate(df2, y = a + x)
|
962
|
+
mutate(df3, y = a + x)
|
963
|
+
mutate(df4, y = a + x)
|
964
|
+
```
|
965
|
+
The naive approach to writing an R function to solve this problem is:
|
966
|
+
|
967
|
+
```
|
968
|
+
mutate_y <- function(df) {
|
969
|
+
mutate(df, y = a + x)
|
970
|
+
}
|
971
|
+
```
|
972
|
+
Unfortunately, in R, this function can fail silently if one of the variables isn’t present
|
973
|
+
in the data frame, but is present in the global environment. We will not go through here how
|
974
|
+
to solve this problem in R.
|
975
|
+
|
976
|
+
In Galaaz the method mutate_y below will work fine and will never fail silently.
|
977
|
+
|
978
|
+
```{ruby mutate_y, warning=FALSE}
|
979
|
+
def mutate_y(df)
|
980
|
+
df.mutate(:y.assign :a + :x)
|
981
|
+
end
|
982
|
+
```
|
983
|
+
Here we create a data frame that has only one column named 'x':
|
984
|
+
|
985
|
+
```{ruby data_frame_no_a_column, warning=FALSE}
|
986
|
+
df1 = R.data__frame(x: (1..3))
|
987
|
+
puts df1
|
988
|
+
```
|
989
|
+
|
990
|
+
Note that method mutate_y will fail independently of the fact that variable 'a' is defined and
|
991
|
+
in the scope of the method. Variable 'a' has no relationship with the symbol ':a' used in the
|
992
|
+
definition of 'mutate\_y' above:
|
993
|
+
|
994
|
+
```{ruby call_mutate_y, warning = FALSE}
|
995
|
+
a = 10
|
996
|
+
mutate_y(df1)
|
997
|
+
```
|
998
|
+
## Different expressions
|
999
|
+
|
1000
|
+
Let's move to the next problem as presented by Hadley, where trying to write a function in R
|
1001
|
+
that will receive two arguments, the first a variable and the second an expression, is not trivial.
|
1002
|
+
Below we create a data frame and we want to write a function that groups data by a variable and
|
1003
|
+
summarises it by an expression:
|
1004
|
+
|
1005
|
+
```{r diff_expr}
|
1006
|
+
set.seed(123)
|
1007
|
+
|
1008
|
+
df <- data.frame(
|
1009
|
+
g1 = c(1, 1, 2, 2, 2),
|
1010
|
+
g2 = c(1, 2, 1, 2, 1),
|
1011
|
+
a = sample(5),
|
1012
|
+
b = sample(5)
|
1013
|
+
)
|
1014
|
+
|
1015
|
+
as.data.frame(df)
|
1016
|
+
|
1017
|
+
d2 <- df %>%
|
1018
|
+
group_by(g1) %>%
|
1019
|
+
summarise(a = mean(a))
|
1020
|
+
|
1021
|
+
as.data.frame(d2)
|
1022
|
+
|
1023
|
+
d2 <- df %>%
|
1024
|
+
group_by(g2) %>%
|
1025
|
+
summarise(a = mean(a))
|
1026
|
+
|
1027
|
+
as.data.frame(d2)
|
1028
|
+
```
|
1029
|
+
|
1030
|
+
As shown by Hadley, one might expect this function to do the trick:
|
1031
|
+
|
1032
|
+
```{r diff_exp_fnc}
|
1033
|
+
my_summarise <- function(df, group_var) {
|
1034
|
+
df %>%
|
1035
|
+
group_by(group_var) %>%
|
1036
|
+
summarise(a = mean(a))
|
1037
|
+
}
|
1038
|
+
|
1039
|
+
# my_summarise(df, g1)
|
1040
|
+
#> Error: Column `group_var` is unknown
|
1041
|
+
```
|
1042
|
+
|
1043
|
+
In order to solve this problem, coding with dplyr requires the introduction of many new concepts
|
1044
|
+
and functions such as 'quo', 'quos', 'enquo', 'enquos', '!!' (bang bang), '!!!' (triple bang).
|
1045
|
+
Again, we'll leave to Hadley the explanation of how to use all those functions.
|
1046
|
+
|
1047
|
+
Now, let's try to implement the same function in galaaz. The next code block first prints the
|
1048
|
+
'df' data frame defined previously in R (to access an R variable from Galaaz, we use the tilde
|
1049
|
+
operator '~' applied to the R variable name as symbol, i.e., ':df').
|
1050
|
+
|
1051
|
+
```{ruby r_dataframe}
|
1052
|
+
puts ~:df
|
1053
|
+
```
|
1054
|
+
|
1055
|
+
We then create the 'my_summarize' method and call it passing the R data frame and
|
1056
|
+
the group by variable ':g1':
|
1057
|
+
|
1058
|
+
```{ruby diff_exp_ruby_func}
|
1059
|
+
def my_summarize(df, group_var)
|
1060
|
+
df.group_by(group_var).
|
1061
|
+
summarize(a: :a.mean)
|
1062
|
+
end
|
1063
|
+
|
1064
|
+
puts my_summarize(:df, :g1).as__data__frame
|
1065
|
+
```
|
1066
|
+
|
1067
|
+
It works!!! Well, let's make sure this was not just some coincidence
|
1068
|
+
|
1069
|
+
```{ruby group_g2}
|
1070
|
+
puts my_summarize(:df, :g2).as__data__frame
|
1071
|
+
```
|
1072
|
+
|
1073
|
+
Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby
|
1074
|
+
code. If you've ever done NSE in R, this certainly feels much safer and easier to implement.
|
1075
|
+
|
1076
|
+
## Different input variables
|
1077
|
+
|
1078
|
+
In the previous section we've managed to get rid of all NSE formulation for a simple example, but
|
1079
|
+
does this remain true for more complex examples, or will the Galaaz way prove impractical for
|
1080
|
+
more complex code?
|
1081
|
+
|
1082
|
+
In the next example Hadley proposes that we write a function that, given an expression such as 'a'
|
1083
|
+
or 'a * b', calculates three summaries. What we want is a function that does the same as these R
|
1084
|
+
statements:
|
1085
|
+
|
1086
|
+
```
|
1087
|
+
summarise(df, mean = mean(a), sum = sum(a), n = n())
|
1088
|
+
#> # A tibble: 1 x 3
|
1089
|
+
#> mean sum n
|
1090
|
+
#> <dbl> <int> <int>
|
1091
|
+
#> 1 3 15 5
|
1092
|
+
|
1093
|
+
summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
|
1094
|
+
#> # A tibble: 1 x 3
|
1095
|
+
#> mean sum n
|
1096
|
+
#> <dbl> <int> <int>
|
1097
|
+
#> 1 9 45 5
|
1098
|
+
```
|
1099
|
+
|
1100
|
+
Let's try it in galaaz:
|
1101
|
+
|
1102
|
+
```{ruby summarize_method}
|
1103
|
+
def my_summarise2(df, expr)
|
1104
|
+
df.summarize(
|
1105
|
+
mean: E.mean(expr),
|
1106
|
+
sum: E.sum(expr),
|
1107
|
+
n: E.n
|
1108
|
+
)
|
1109
|
+
end
|
1110
|
+
|
1111
|
+
puts my_summarise2((~:df), :a)
|
1112
|
+
puts "\n"
|
1113
|
+
puts my_summarise2((~:df), :a * :b)
|
1114
|
+
```
|
1115
|
+
|
1116
|
+
Once again, there is no need to use any special theory or functions. The only point to be
|
1117
|
+
careful about is the use of 'E' to build expressions from functions 'mean', 'sum' and 'n'.
|
1118
|
+
|
1119
|
+
## Different input and output variable
|
1120
|
+
|
1121
|
+
Now the next challenge presented by Hadley is to vary the name of the output variables based on
|
1122
|
+
the received expression. So, if the input expression is 'a', we want our data frame columns to
|
1123
|
+
be named 'mean\_a' and 'sum\_a'. Now, if the input expression is 'b', columns
|
1124
|
+
should be named 'mean\_b' and 'sum\_b'.
|
1125
|
+
|
1126
|
+
```
|
1127
|
+
mutate(df, mean_a = mean(a), sum_a = sum(a))
|
1128
|
+
#> # A tibble: 5 x 6
|
1129
|
+
#> g1 g2 a b mean_a sum_a
|
1130
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
1131
|
+
#> 1 1 1 1 3 3 15
|
1132
|
+
#> 2 1 2 4 2 3 15
|
1133
|
+
#> 3 2 1 2 1 3 15
|
1134
|
+
#> 4 2 2 5 4 3 15
|
1135
|
+
#> # … with 1 more row
|
1136
|
+
|
1137
|
+
mutate(df, mean_b = mean(b), sum_b = sum(b))
|
1138
|
+
#> # A tibble: 5 x 6
|
1139
|
+
#> g1 g2 a b mean_b sum_b
|
1140
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
1141
|
+
#> 1 1 1 1 3 3 15
|
1142
|
+
#> 2 1 2 4 2 3 15
|
1143
|
+
#> 3 2 1 2 1 3 15
|
1144
|
+
#> 4 2 2 5 4 3 15
|
1145
|
+
#> # … with 1 more row
|
1146
|
+
```
|
1147
|
+
In order to solve this problem in R, Hadley needs to introduce some more new functions and notations:
|
1148
|
+
'quo_name' and the ':=' operator from package 'rlang'
|
1149
|
+
|
1150
|
+
Here is our Ruby code:
|
1151
|
+
|
1152
|
+
```{ruby name_change}
|
1153
|
+
def my_mutate(df, expr)
|
1154
|
+
mean_name = "mean_#{expr.to_s}"
|
1155
|
+
sum_name = "sum_#{expr.to_s}"
|
1156
|
+
|
1157
|
+
df.mutate(mean_name => E.mean(expr),
|
1158
|
+
sum_name => E.sum(expr))
|
1159
|
+
end
|
1160
|
+
|
1161
|
+
puts my_mutate((~:df), :a)
|
1162
|
+
puts "\n"
|
1163
|
+
puts my_mutate((~:df), :b)
|
1164
|
+
```
|
1165
|
+
It really seems that "Non Standard Evaluation" is actually quite standard in Galaaz! But, you
|
1166
|
+
might have noticed a small change in the way the arguments to the mutate method were called.
|
1167
|
+
In a previous example we used df.summarise(mean: E.mean(:a), ...) where the column name was
|
1168
|
+
followed by a ':' (colon). In this example, we have df.mutate(mean_name => E.mean(expr), ...)
|
1169
|
+
and variable mean\_name is not followed by ':' but by '=>'. This is standard Ruby notation.
|
1170
|
+
|
1171
|
+
[explain....]
|
1172
|
+
|
1173
|
+
## Capturing multiple variables
|
1174
|
+
|
1175
|
+
Moving on with new complexities, Hadley proposes that we solve the problem in which the
|
1176
|
+
summarise function will receive any number of grouping variables.
|
1177
|
+
|
1178
|
+
This again is quite standard Ruby. In order to receive an undefined number of parameters
|
1179
|
+
the parameter is preceded by '*':
|
1180
|
+
|
1181
|
+
```{ruby multiple_vars}
|
1182
|
+
def my_summarise3(df, *group_vars)
|
1183
|
+
df.group_by(*group_vars).
|
1184
|
+
summarise(a: E.mean(:a))
|
1185
|
+
end
|
1186
|
+
|
1187
|
+
puts my_summarise3((~:df), :g1, :g2).as__data__frame
|
1188
|
+
```
|
1189
|
+
|
1190
|
+
## Why does R require NSE and Galaaz does not?
|
1191
|
+
|
1192
|
+
NSE introduces a number of new concepts, such as 'quoting', 'quasiquotation', 'unquoting' and
|
1193
|
+
'unquote-splicing', while in Galaaz none of those concepts are needed. What gives?
|
1194
|
+
|
1195
|
+
R is an extremely flexible language and it has lazy evaluation of parameters. When in R a
|
1196
|
+
function is called as 'summarise(df, a = b)', the summarise function receives the literal
|
1197
|
+
'a = b' parameter and can work with this as if it were a string. In R, it is not clear what
|
1198
|
+
a and b are, they can be expressions or they can be variables, it is up to the function to
|
1199
|
+
decide what 'a = b' means.
|
1200
|
+
|
1201
|
+
In Ruby, there is no lazy evaluation of parameters and 'a' is always a variable and so is 'b'.
|
1202
|
+
Variables assume their value as soon as they are used, so 'x = a' is immediately evaluated and
|
1203
|
+
variable 'x' will receive the value of variable 'a' as soon as the Ruby statement is executed.
|
1204
|
+
Ruby also provides the notion of a symbol; ':a' is a symbol and does not evaluate to anything.
|
1205
|
+
Galaaz uses Ruby symbols to build expressions that are not bound to anything: ':a.eq :b' is
|
1206
|
+
clearly an expression and has no relationship whatsoever with the statement 'a = b'. By using
|
1207
|
+
symbols, variables and expressions all the possible ambiguities that are found in R are
|
1208
|
+
eliminated in Galaaz.
|
1209
|
+
|
1210
|
+
The main problem that remains, is that in R, functions are not clearly documented as what type
|
1211
|
+
of input they are expecting, they might be expecting regular variables or they might be
|
1212
|
+
expecting expressions and the R function will know how to deal with an input of the form
|
1213
|
+
'a = b', now for the Ruby developer it might not be immediately clear if it should call the
|
1214
|
+
function passing the value 'true' if variable 'a' is equal to variable 'b' or if it should
|
1215
|
+
call the function passing the expression ':a.eq :b'.
|
1216
|
+
|
1217
|
+
|
1218
|
+
## Advanced dplyr features
|
1219
|
+
|
1220
|
+
In the blog: Programming with dplyr by using dplyr (https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/) Iñaki Úcar shows surprise that some R users are trying to code in dplyr avoiding
|
1221
|
+
the use of NSE. For instance he says:
|
1222
|
+
|
1223
|
+
> Take the example of seplyr. It stands for standard evaluation dplyr, and enables us to
|
1224
|
+
> program over dplyr without having “to bring in (or study) any deep-theory or
|
1225
|
+
> heavy-weight tools such as rlang/tidyeval”.
|
1226
|
+
|
1227
|
+
For me, there isn't really any surprise that users are trying to avoid dplyr deep-theory. R
|
1228
|
+
users frequently are not programmers and learning to code is already hard business, on top
|
1229
|
+
of that, having to learn how to 'quote' or 'enquo' or 'quos' or 'enquos' is not necessarily
|
1230
|
+
a 'piece of cake'. So much so, that 'tidyeval' has some more advanced functions that instead
|
1231
|
+
of using quoted expressions, use strings as arguments.
|
1232
|
+
|
1233
|
+
In the following examples, we show the use of functions 'group\_by\_at', 'summarise\_at' and
|
1234
|
+
'rename\_at' that receive strings as argument. The data frame used is 'starwars', which describes
|
1235
|
+
features of characters in the Star Wars movies:
|
1236
|
+
|
1237
|
+
```{ruby starwars}
|
1238
|
+
puts (~:starwars).head.as__data__frame
|
1239
|
+
```
|
1240
|
+
The grouped_mean function below will receive a grouping variable and calculate summaries for
|
1241
|
+
the value\_variables given:
|
1242
|
+
|
1243
|
+
```{r grouped_mean}
|
1244
|
+
grouped_mean <- function(data, grouping_variables, value_variables) {
|
1245
|
+
data %>%
|
1246
|
+
group_by_at(grouping_variables) %>%
|
1247
|
+
mutate(count = n()) %>%
|
1248
|
+
summarise_at(c(value_variables, "count"), mean, na.rm = TRUE) %>%
|
1249
|
+
rename_at(value_variables, funs(paste0("mean_", .)))
|
1250
|
+
}
|
1251
|
+
|
1252
|
+
gm = starwars %>%
|
1253
|
+
grouped_mean("eye_color", c("mass", "birth_year"))
|
1254
|
+
|
1255
|
+
as.data.frame(gm)
|
1256
|
+
```
|
1257
|
+
|
1258
|
+
The same code with Galaaz, becomes:
|
1259
|
+
|
1260
|
+
```{ruby advanced_starwars}
|
1261
|
+
def grouped_mean(data, grouping_variables, value_variables)
|
1262
|
+
data.
|
1263
|
+
group_by_at(grouping_variables).
|
1264
|
+
mutate(count: E.n).
|
1265
|
+
summarise_at(E.c(value_variables, "count"), ~:mean, na__rm: true).
|
1266
|
+
rename_at(value_variables, E.funs(E.paste0("mean_", value_variables)))
|
1267
|
+
end
|
1268
|
+
|
1269
|
+
puts grouped_mean((~:starwars), "eye_color", E.c("mass", "birth_year")).as__data__frame
|
847
1270
|
```
|
848
1271
|
|
849
1272
|
|