galaaz 0.4.9 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +798 -285
- data/blogs/galaaz_ggplot/galaaz_ggplot.Rmd +3 -12
- data/blogs/galaaz_ggplot/galaaz_ggplot.aux +5 -7
- data/blogs/galaaz_ggplot/galaaz_ggplot.html +69 -29
- data/blogs/galaaz_ggplot/galaaz_ggplot.pdf +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/midwest_rb.png +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/scatter_plot_rb.png +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/midwest_rb.pdf +0 -0
- data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/scatter_plot_rb.pdf +0 -0
- data/blogs/galaaz_ggplot/midwest.Rmd +1 -9
- data/blogs/gknit/gknit.Rmd +37 -40
- data/blogs/gknit/gknit.html +32 -30
- data/blogs/gknit/gknit.md +36 -37
- data/blogs/gknit/gknit.pdf +0 -0
- data/blogs/gknit/gknit.tex +35 -37
- data/blogs/manual/manual.Rmd +548 -125
- data/blogs/manual/manual.html +509 -286
- data/blogs/manual/manual.md +798 -285
- data/blogs/manual/manual.pdf +0 -0
- data/blogs/manual/manual.tex +2816 -0
- data/blogs/manual/manual_files/figure-latex/diverging_bar.pdf +0 -0
- data/blogs/nse_dplyr/nse_dplyr.Rmd +240 -74
- data/blogs/nse_dplyr/nse_dplyr.html +191 -87
- data/blogs/nse_dplyr/nse_dplyr.md +361 -107
- data/blogs/nse_dplyr/nse_dplyr.pdf +0 -0
- data/blogs/nse_dplyr/nse_dplyr.tex +1373 -0
- data/blogs/ruby_plot/ruby_plot.Rmd +61 -81
- data/blogs/ruby_plot/ruby_plot.html +54 -57
- data/blogs/ruby_plot/ruby_plot.md +48 -67
- data/blogs/ruby_plot/ruby_plot.pdf +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/dose_len.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_delivery.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_dose.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color2.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_decorations.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_jitter.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_points.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_box_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_violin_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot_files/figure-latex/violin_with_jitter.png +0 -0
- data/lib/R_interface/rdata_frame.rb +0 -12
- data/lib/R_interface/robject.rb +14 -14
- data/lib/R_interface/ruby_extensions.rb +3 -31
- data/lib/R_interface/rvector.rb +0 -12
- data/lib/gknit/knitr_engine.rb +5 -3
- data/lib/util/exec_ruby.rb +22 -61
- data/specs/tmp.rb +26 -12
- data/version.rb +1 -1
- metadata +22 -17
- data/bin/gknit_old_r +0 -236
- data/blogs/dev/dev.Rmd +0 -23
- data/blogs/dev/dev.md +0 -58
- data/blogs/dev/dev2.Rmd +0 -65
- data/blogs/dev/model.rb +0 -41
- data/blogs/dplyr/dplyr.Rmd +0 -29
- data/blogs/dplyr/dplyr.html +0 -433
- data/blogs/dplyr/dplyr.md +0 -58
- data/blogs/dplyr/dplyr.rb +0 -63
- data/blogs/galaaz_ggplot/galaaz_ggplot.log +0 -640
- data/blogs/galaaz_ggplot/galaaz_ggplot.md +0 -431
- data/blogs/galaaz_ggplot/galaaz_ggplot.tex +0 -481
- data/blogs/galaaz_ggplot/midwest.png +0 -0
- data/blogs/galaaz_ggplot/scatter_plot.png +0 -0
- data/blogs/ruby_plot/ruby_plot.tex +0 -1077
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5028519688d5197e29ea9198499c8093f96aa27e498a0eb974367187d7d151da
|
|
4
|
+
data.tar.gz: f5bad7debd953898f0335e04e83089137025c759a3910cf6d74061b53f4eb37e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b14427f32a5db4f2c9754c1ee7fea356c939727152a626c616c3dff1372cddb4fd4d982dc761c2a2e2ca1c211b8a0215d26c2b11eb162cd2f7ab5f0c1c9344e
|
|
7
|
+
data.tar.gz: 94c7da10fd04a9136b9a36582574ae04c9f3a4767f1a3dd04137a64f4e104cb8c3c0906752c627cef27ff81b7bbca0bde83aa58e9e5b742005079b30c46616a2
|
data/README.md
CHANGED
|
@@ -74,15 +74,13 @@ Panda, SciPy, SciKit-Learn and a couple more.
|
|
|
74
74
|
# gKnitting a Document
|
|
75
75
|
|
|
76
76
|
This manual has been formatted using gKnit. gKnit uses Knitr and R markdown to knit
|
|
77
|
-
a document in Ruby or R and output it in any of the available formats for R markdown.
|
|
77
|
+
a document in Ruby or R and output it in any of the available formats for R markdown.
|
|
78
78
|
gKnit runs atop GraalVM and Galaaz. In gKnit, Ruby variables are persisted between
|
|
79
|
-
chunks, making it an ideal solution for literate programming.
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
chunks, making it an ideal solution for literate programming. Also, since it is based
|
|
80
|
+
on Galaaz, Ruby chunks can have access to R variables and Polyglot Programming with
|
|
81
|
+
Ruby and R is quite natural.
|
|
82
82
|
|
|
83
|
-
gknit
|
|
84
|
-
|
|
85
|
-
* xxx.xxxx.xxx
|
|
83
|
+
[gknit is described in more detail here](https://towardsdatascience.com/how-to-do-reproducible-research-in-ruby-with-gknit-c26d2684d64e)
|
|
86
84
|
|
|
87
85
|
# Vector
|
|
88
86
|
|
|
@@ -110,15 +108,15 @@ To create a vector the 'c' (concatenate) method from the 'R' module should be us
|
|
|
110
108
|
|
|
111
109
|
|
|
112
110
|
```ruby
|
|
113
|
-
|
|
114
|
-
puts
|
|
111
|
+
vec = R.c(1, 2, 3)
|
|
112
|
+
puts vec
|
|
115
113
|
```
|
|
116
114
|
|
|
117
115
|
```
|
|
118
116
|
## [1] 1 2 3
|
|
119
117
|
```
|
|
120
118
|
|
|
121
|
-
Lets take a look at the type, mode and storage.mode of our vector
|
|
119
|
+
Let's take a look at the type, mode and storage.mode of our vector vec. In order to print
|
|
122
120
|
this out, we are creating a data frame 'df' and printing it out. A data frame, for those
|
|
123
121
|
not familiar with it, is basically a table. Here we create the data frame and add the
|
|
124
122
|
column name by passing named parameters for each column, such as 'typeof:', 'mode:' and
|
|
@@ -130,7 +128,7 @@ data frame is 'data.frame', in Galaaz we use 'data\_\_frame'.
|
|
|
130
128
|
|
|
131
129
|
|
|
132
130
|
```ruby
|
|
133
|
-
df = R.data__frame(typeof:
|
|
131
|
+
df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
|
|
134
132
|
puts df
|
|
135
133
|
```
|
|
136
134
|
|
|
@@ -146,8 +144,8 @@ follows normal Ruby rules and the number 1 is an integer and 1.0 is a float.
|
|
|
146
144
|
|
|
147
145
|
|
|
148
146
|
```ruby
|
|
149
|
-
|
|
150
|
-
puts
|
|
147
|
+
vec = R.c(1.0, 2, 3)
|
|
148
|
+
puts vec
|
|
151
149
|
```
|
|
152
150
|
|
|
153
151
|
```
|
|
@@ -156,7 +154,7 @@ puts @vec
|
|
|
156
154
|
|
|
157
155
|
|
|
158
156
|
```ruby
|
|
159
|
-
df = R.data__frame(typeof:
|
|
157
|
+
df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
|
|
160
158
|
outputs df.kable.kable_styling
|
|
161
159
|
```
|
|
162
160
|
|
|
@@ -189,14 +187,14 @@ vec = R.c(1, hello, 5)
|
|
|
189
187
|
|
|
190
188
|
```
|
|
191
189
|
## Message:
|
|
192
|
-
## undefined local variable or method `hello' for
|
|
190
|
+
## undefined local variable or method `hello' for #<RC:0x2e0 @out_list=nil>:RC
|
|
193
191
|
```
|
|
194
192
|
|
|
195
193
|
```
|
|
196
194
|
## Message:
|
|
197
|
-
##
|
|
198
|
-
## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:
|
|
199
|
-
## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:
|
|
195
|
+
## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:103:in `get_binding'
|
|
196
|
+
## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:102:in `eval'
|
|
197
|
+
## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:102:in `exec_ruby'
|
|
200
198
|
## /home/rbotafogo/desenv/galaaz/lib/gknit/knitr_engine.rb:650:in `block in initialize'
|
|
201
199
|
## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `call'
|
|
202
200
|
## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `callback'
|
|
@@ -221,8 +219,8 @@ Here is a vector with logical values
|
|
|
221
219
|
|
|
222
220
|
|
|
223
221
|
```ruby
|
|
224
|
-
|
|
225
|
-
puts
|
|
222
|
+
vec = R.c(true, true, false, false, true)
|
|
223
|
+
puts vec
|
|
226
224
|
```
|
|
227
225
|
|
|
228
226
|
```
|
|
@@ -235,26 +233,26 @@ The 'c' functions used to create vectors can also be used to combine two vectors
|
|
|
235
233
|
|
|
236
234
|
|
|
237
235
|
```ruby
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
puts
|
|
236
|
+
vec1 = R.c(10.0, 20.0, 30.0)
|
|
237
|
+
vec2 = R.c(4.0, 5.0, 6.0)
|
|
238
|
+
vec = R.c(vec1, vec2)
|
|
239
|
+
puts vec
|
|
242
240
|
```
|
|
243
241
|
|
|
244
242
|
```
|
|
245
243
|
## [1] 10 20 30 4 5 6
|
|
246
244
|
```
|
|
247
245
|
In galaaz, methods can be chained (somewhat like the pipe operator in R %>%, but more generic).
|
|
248
|
-
In this next example, method 'c' is chainned after '
|
|
246
|
+
In this next example, method 'c' is chained after 'vec1'. This also looks like 'c' is a
|
|
249
247
|
method of the vector, but in reality, this is actually closer to the pipe operator. When
|
|
250
248
|
Galaaz identifies that 'c' is not a method of 'vec' it actually tries to call 'R.c' with
|
|
251
|
-
'
|
|
249
|
+
'vec1' as the first argument concatenated with all the other available arguments. The code
|
|
252
250
|
below is automatically converted to the code above.
|
|
253
251
|
|
|
254
252
|
|
|
255
253
|
```ruby
|
|
256
|
-
|
|
257
|
-
puts
|
|
254
|
+
vec = vec1.c(vec2)
|
|
255
|
+
puts vec
|
|
258
256
|
```
|
|
259
257
|
|
|
260
258
|
```
|
|
@@ -267,7 +265,7 @@ Arithmetic operations on vectors are performed element by element:
|
|
|
267
265
|
|
|
268
266
|
|
|
269
267
|
```ruby
|
|
270
|
-
puts
|
|
268
|
+
puts vec1 + vec2
|
|
271
269
|
```
|
|
272
270
|
|
|
273
271
|
```
|
|
@@ -276,7 +274,7 @@ puts @vec1 + @vec2
|
|
|
276
274
|
|
|
277
275
|
|
|
278
276
|
```ruby
|
|
279
|
-
puts
|
|
277
|
+
puts vec1 * 5
|
|
280
278
|
```
|
|
281
279
|
|
|
282
280
|
```
|
|
@@ -287,8 +285,8 @@ When vectors have different length, a recycling rule is applied to the shorter v
|
|
|
287
285
|
|
|
288
286
|
|
|
289
287
|
```ruby
|
|
290
|
-
|
|
291
|
-
puts
|
|
288
|
+
vec3 = R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
|
|
289
|
+
puts vec4 = vec1 + vec3
|
|
292
290
|
```
|
|
293
291
|
|
|
294
292
|
```
|
|
@@ -301,7 +299,7 @@ Vectors can be indexed by using the '[]' operator:
|
|
|
301
299
|
|
|
302
300
|
|
|
303
301
|
```ruby
|
|
304
|
-
puts
|
|
302
|
+
puts vec4[3]
|
|
305
303
|
```
|
|
306
304
|
|
|
307
305
|
```
|
|
@@ -309,11 +307,11 @@ puts @vec4[3]
|
|
|
309
307
|
```
|
|
310
308
|
|
|
311
309
|
We can also index a vector with another vector. For example, in the code below, we take elements
|
|
312
|
-
1, 3, 5, and 7 from
|
|
310
|
+
1, 3, 5, and 7 from vec4:
|
|
313
311
|
|
|
314
312
|
|
|
315
313
|
```ruby
|
|
316
|
-
puts
|
|
314
|
+
puts vec4[R.c(1, 3, 5, 7)]
|
|
317
315
|
```
|
|
318
316
|
|
|
319
317
|
```
|
|
@@ -324,7 +322,7 @@ Repeating an index and having indices out of order is valid code:
|
|
|
324
322
|
|
|
325
323
|
|
|
326
324
|
```ruby
|
|
327
|
-
puts
|
|
325
|
+
puts vec4[R.c(1, 3, 3, 1)]
|
|
328
326
|
```
|
|
329
327
|
|
|
330
328
|
```
|
|
@@ -336,8 +334,8 @@ the indexed values are not returned:
|
|
|
336
334
|
|
|
337
335
|
|
|
338
336
|
```ruby
|
|
339
|
-
puts
|
|
340
|
-
puts
|
|
337
|
+
puts vec4[-3]
|
|
338
|
+
puts vec4[-R.c(1, 3, 5, 7)]
|
|
341
339
|
```
|
|
342
340
|
|
|
343
341
|
```
|
|
@@ -349,7 +347,7 @@ If an index is out of range, a missing value (NA) will be reported.
|
|
|
349
347
|
|
|
350
348
|
|
|
351
349
|
```ruby
|
|
352
|
-
puts
|
|
350
|
+
puts vec4[30]
|
|
353
351
|
```
|
|
354
352
|
|
|
355
353
|
```
|
|
@@ -360,7 +358,7 @@ It is also possible to index a vector by range:
|
|
|
360
358
|
|
|
361
359
|
|
|
362
360
|
```ruby
|
|
363
|
-
puts
|
|
361
|
+
puts vec4[(2..5)]
|
|
364
362
|
```
|
|
365
363
|
|
|
366
364
|
```
|
|
@@ -403,9 +401,9 @@ from the vector. In order to do this extraction the '>>' operator is used.
|
|
|
403
401
|
|
|
404
402
|
|
|
405
403
|
```ruby
|
|
406
|
-
puts
|
|
407
|
-
puts
|
|
408
|
-
puts
|
|
404
|
+
puts vec4
|
|
405
|
+
puts vec4 >> 0
|
|
406
|
+
puts vec4 >> 4
|
|
409
407
|
```
|
|
410
408
|
|
|
411
409
|
```
|
|
@@ -905,11 +903,11 @@ created by the 'matrix' function:
|
|
|
905
903
|
|
|
906
904
|
|
|
907
905
|
```ruby
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
906
|
+
mat = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
|
|
907
|
+
nrow: 3,
|
|
908
|
+
ncol: 3)
|
|
911
909
|
|
|
912
|
-
puts
|
|
910
|
+
puts mat
|
|
913
911
|
```
|
|
914
912
|
|
|
915
913
|
```
|
|
@@ -923,12 +921,12 @@ memory by row first passing an extra argument to the 'matrix' function:
|
|
|
923
921
|
|
|
924
922
|
|
|
925
923
|
```ruby
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
924
|
+
mat_row = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
|
|
925
|
+
nrow: 3,
|
|
926
|
+
ncol: 3,
|
|
927
|
+
byrow: true)
|
|
930
928
|
|
|
931
|
-
puts
|
|
929
|
+
puts mat_row
|
|
932
930
|
```
|
|
933
931
|
|
|
934
932
|
```
|
|
@@ -944,8 +942,8 @@ A matrix can be indexed by [row, column]:
|
|
|
944
942
|
|
|
945
943
|
|
|
946
944
|
```ruby
|
|
947
|
-
puts
|
|
948
|
-
puts
|
|
945
|
+
puts mat_row[1, 1]
|
|
946
|
+
puts mat_row[2, 3]
|
|
949
947
|
```
|
|
950
948
|
|
|
951
949
|
```
|
|
@@ -956,8 +954,8 @@ It is possible to index an entire row or column with the ':all' keyword
|
|
|
956
954
|
|
|
957
955
|
|
|
958
956
|
```ruby
|
|
959
|
-
puts
|
|
960
|
-
puts
|
|
957
|
+
puts mat_row[1, :all]
|
|
958
|
+
puts mat_row[:all, 2]
|
|
961
959
|
```
|
|
962
960
|
|
|
963
961
|
```
|
|
@@ -970,7 +968,7 @@ rows 1 and 3 and columns 2 and 3 building a 2 x 2 matrix.
|
|
|
970
968
|
|
|
971
969
|
|
|
972
970
|
```ruby
|
|
973
|
-
puts
|
|
971
|
+
puts mat_row[R.c(1, 3), R.c(2, 3)]
|
|
974
972
|
```
|
|
975
973
|
|
|
976
974
|
```
|
|
@@ -979,12 +977,11 @@ puts @mat_row[R.c(1, 3), R.c(2, 3)]
|
|
|
979
977
|
## [2,] 8 9
|
|
980
978
|
```
|
|
981
979
|
|
|
982
|
-
Matrices can be combined with functions 'rbind'
|
|
980
|
+
Matrices can be combined with functions 'rbind':
|
|
983
981
|
|
|
984
982
|
|
|
985
983
|
```ruby
|
|
986
|
-
puts
|
|
987
|
-
puts @mat_row.cbind(@mat)
|
|
984
|
+
puts mat_row.rbind(mat)
|
|
988
985
|
```
|
|
989
986
|
|
|
990
987
|
```
|
|
@@ -995,6 +992,16 @@ puts @mat_row.cbind(@mat)
|
|
|
995
992
|
## [4,] 1 4 7
|
|
996
993
|
## [5,] 2 5 8
|
|
997
994
|
## [6,] 3 6 9
|
|
995
|
+
```
|
|
996
|
+
|
|
997
|
+
and 'cbind':
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
```ruby
|
|
1001
|
+
puts mat_row.cbind(mat)
|
|
1002
|
+
```
|
|
1003
|
+
|
|
1004
|
+
```
|
|
998
1005
|
## [,1] [,2] [,3] [,4] [,5] [,6]
|
|
999
1006
|
## [1,] 1 2 3 1 4 7
|
|
1000
1007
|
## [2,] 4 5 6 2 5 8
|
|
@@ -1011,8 +1018,8 @@ can only hold one type of element.
|
|
|
1011
1018
|
nums = R.c(1.0, 2.0, 3.0)
|
|
1012
1019
|
strs = R.c("a", "b", "c", "d")
|
|
1013
1020
|
bool = R.c(true, true, false)
|
|
1014
|
-
|
|
1015
|
-
puts
|
|
1021
|
+
lst = R.list(nums: nums, strs: strs, bool: bool)
|
|
1022
|
+
puts lst
|
|
1016
1023
|
```
|
|
1017
1024
|
|
|
1018
1025
|
```
|
|
@@ -1026,7 +1033,7 @@ puts @lst
|
|
|
1026
1033
|
## [1] TRUE TRUE FALSE
|
|
1027
1034
|
```
|
|
1028
1035
|
|
|
1029
|
-
Note that '
|
|
1036
|
+
Note that 'lst' elements are named elements.
|
|
1030
1037
|
|
|
1031
1038
|
|
|
1032
1039
|
## List Indexing
|
|
@@ -1037,7 +1044,7 @@ return one of the sublists.
|
|
|
1037
1044
|
|
|
1038
1045
|
|
|
1039
1046
|
```ruby
|
|
1040
|
-
puts
|
|
1047
|
+
puts lst[1]
|
|
1041
1048
|
```
|
|
1042
1049
|
|
|
1043
1050
|
```
|
|
@@ -1052,18 +1059,18 @@ the original list
|
|
|
1052
1059
|
|
|
1053
1060
|
|
|
1054
1061
|
```ruby
|
|
1055
|
-
puts
|
|
1062
|
+
puts lst[[1]]
|
|
1056
1063
|
```
|
|
1057
1064
|
|
|
1058
1065
|
```
|
|
1059
1066
|
## [1] 1 2 3
|
|
1060
1067
|
```
|
|
1061
1068
|
|
|
1062
|
-
When elements are named, as dones with
|
|
1069
|
+
When elements are named, as done with lst, indexing can be done by name:
|
|
1063
1070
|
|
|
1064
1071
|
|
|
1065
1072
|
```ruby
|
|
1066
|
-
puts
|
|
1073
|
+
puts lst[['bool']][[1]] >> 0
|
|
1067
1074
|
```
|
|
1068
1075
|
|
|
1069
1076
|
```
|
|
@@ -1183,23 +1190,31 @@ puts (~:mtcars)[R.c('Datsun 710', 'Camaro Z28'), :all]
|
|
|
1183
1190
|
Finally, a data frame can also be indexed with a logical vector. In this next example, the
|
|
1184
1191
|
'am' column of :mtcars is compared with 0 (with method 'eq'). When 'am' is equal to 0 the
|
|
1185
1192
|
car is automatic. So, by doing '(~:mtcars).am.eq 0' a logical vector is created with
|
|
1186
|
-
'true' whenever 'am' is 0 and 'false' otherwise.
|
|
1187
|
-
is indexed, returning a new data frame in which all cars have automatic transmission.
|
|
1193
|
+
'true' whenever 'am' is 0 and 'false' otherwise.
|
|
1188
1194
|
|
|
1189
1195
|
|
|
1190
1196
|
```ruby
|
|
1191
1197
|
# obtain a vector with 'true' for cars with automatic transmission
|
|
1192
1198
|
automatic = (~:mtcars).am.eq 0
|
|
1193
1199
|
puts automatic
|
|
1194
|
-
|
|
1195
|
-
# slice the data frame by using this vector
|
|
1196
|
-
puts (~:mtcars)[automatic, :all]
|
|
1197
1200
|
```
|
|
1198
1201
|
|
|
1199
1202
|
```
|
|
1200
1203
|
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
|
|
1201
1204
|
## [12] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
|
|
1202
1205
|
## [23] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
|
|
1206
|
+
```
|
|
1207
|
+
|
|
1208
|
+
Using this logical vector, the data frame is indexed, returning a new data frame in
|
|
1209
|
+
which all cars have automatic transmission.
|
|
1210
|
+
|
|
1211
|
+
|
|
1212
|
+
```ruby
|
|
1213
|
+
# slice the data frame by using this vector
|
|
1214
|
+
puts (~:mtcars)[automatic, :all]
|
|
1215
|
+
```
|
|
1216
|
+
|
|
1217
|
+
```
|
|
1203
1218
|
## mpg cyl disp hp drat wt qsec vs am gear carb
|
|
1204
1219
|
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
|
|
1205
1220
|
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
|
|
@@ -1342,6 +1357,62 @@ puts exp7
|
|
|
1342
1357
|
## y <- sin(x)
|
|
1343
1358
|
```
|
|
1344
1359
|
|
|
1360
|
+
Expressions can also be written using '.' notation:
|
|
1361
|
+
|
|
1362
|
+
|
|
1363
|
+
```ruby
|
|
1364
|
+
exp8 = :y.assign :x.sin
|
|
1365
|
+
puts exp8
|
|
1366
|
+
```
|
|
1367
|
+
|
|
1368
|
+
```
|
|
1369
|
+
## y <- sin(x)
|
|
1370
|
+
```
|
|
1371
|
+
|
|
1372
|
+
When a function has multiple arguments, the first one can be used before the '.':
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
```ruby
|
|
1376
|
+
exp9 = :x.c(:y)
|
|
1377
|
+
puts exp9
|
|
1378
|
+
```
|
|
1379
|
+
|
|
1380
|
+
```
|
|
1381
|
+
## c(x, y)
|
|
1382
|
+
```
|
|
1383
|
+
|
|
1384
|
+
## Evaluating an Expression
|
|
1385
|
+
|
|
1386
|
+
Expressions can be evaluated by calling function 'eval' with a binding. A binding can be provided
|
|
1387
|
+
with a list:
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
```ruby
|
|
1391
|
+
exp = (:a + :b) * 2.0 + :c ** 2 / :z
|
|
1392
|
+
puts exp.eval(R.list(a: 10, b: 20, c: 30, z: 40))
|
|
1393
|
+
```
|
|
1394
|
+
|
|
1395
|
+
```
|
|
1396
|
+
## [1] 82.5
|
|
1397
|
+
```
|
|
1398
|
+
|
|
1399
|
+
... with a data frame:
|
|
1400
|
+
|
|
1401
|
+
|
|
1402
|
+
```ruby
|
|
1403
|
+
df = R.data__frame(
|
|
1404
|
+
a: R.c(1, 2, 3),
|
|
1405
|
+
b: R.c(10, 20, 30),
|
|
1406
|
+
c: R.c(100, 200, 300),
|
|
1407
|
+
z: R.c(1000, 2000, 3000))
|
|
1408
|
+
|
|
1409
|
+
puts exp.eval(df)
|
|
1410
|
+
```
|
|
1411
|
+
|
|
1412
|
+
```
|
|
1413
|
+
## [1] 32 64 96
|
|
1414
|
+
```
|
|
1415
|
+
|
|
1345
1416
|
# Manipulating Data
|
|
1346
1417
|
|
|
1347
1418
|
One of the major benefits of Galaaz is to bring strong data manipulation to Ruby. The following
|
|
@@ -1365,8 +1436,8 @@ R.library('dplyr')
|
|
|
1365
1436
|
|
|
1366
1437
|
|
|
1367
1438
|
```ruby
|
|
1368
|
-
|
|
1369
|
-
puts
|
|
1439
|
+
flights = ~:flights
|
|
1440
|
+
puts flights.head.as__data__frame
|
|
1370
1441
|
```
|
|
1371
1442
|
|
|
1372
1443
|
```
|
|
@@ -1400,7 +1471,7 @@ the first :month.eq 1
|
|
|
1400
1471
|
|
|
1401
1472
|
|
|
1402
1473
|
```ruby
|
|
1403
|
-
puts
|
|
1474
|
+
puts flights.filter((:month.eq 1), (:day.eq 1)).head.as__data__frame
|
|
1404
1475
|
```
|
|
1405
1476
|
|
|
1406
1477
|
```
|
|
@@ -1433,7 +1504,7 @@ All flights that departed in November of December
|
|
|
1433
1504
|
|
|
1434
1505
|
|
|
1435
1506
|
```ruby
|
|
1436
|
-
puts
|
|
1507
|
+
puts flights.filter((:month.eq 11) | (:month.eq 12)).head.as__data__frame
|
|
1437
1508
|
```
|
|
1438
1509
|
|
|
1439
1510
|
```
|
|
@@ -1467,7 +1538,7 @@ symbol, in this case ':in' and the second argument is the vector:
|
|
|
1467
1538
|
|
|
1468
1539
|
|
|
1469
1540
|
```ruby
|
|
1470
|
-
puts
|
|
1541
|
+
puts flights.filter(:month._ :in, R.c(11, 12)).head.as__data__frame
|
|
1471
1542
|
```
|
|
1472
1543
|
|
|
1473
1544
|
```
|
|
@@ -1503,8 +1574,8 @@ what is obtained from data frame.
|
|
|
1503
1574
|
|
|
1504
1575
|
|
|
1505
1576
|
```ruby
|
|
1506
|
-
|
|
1507
|
-
puts
|
|
1577
|
+
df = R.tibble(x: R.c(1, R::NA, 3))
|
|
1578
|
+
puts df.as__data__frame
|
|
1508
1579
|
```
|
|
1509
1580
|
|
|
1510
1581
|
```
|
|
@@ -1519,7 +1590,7 @@ not.
|
|
|
1519
1590
|
|
|
1520
1591
|
|
|
1521
1592
|
```ruby
|
|
1522
|
-
puts
|
|
1593
|
+
puts df.filter(:x > 1).as__data__frame
|
|
1523
1594
|
```
|
|
1524
1595
|
|
|
1525
1596
|
```
|
|
@@ -1531,7 +1602,7 @@ To match an NA use method 'is__na'
|
|
|
1531
1602
|
|
|
1532
1603
|
|
|
1533
1604
|
```ruby
|
|
1534
|
-
puts
|
|
1605
|
+
puts df.filter((:x.is__na) | (:x > 1)).as__data__frame
|
|
1535
1606
|
```
|
|
1536
1607
|
|
|
1537
1608
|
```
|
|
@@ -1546,7 +1617,7 @@ Arrange reorders the rows of a data frame by the given arguments.
|
|
|
1546
1617
|
|
|
1547
1618
|
|
|
1548
1619
|
```ruby
|
|
1549
|
-
puts
|
|
1620
|
+
puts flights.arrange(:year, :month, :day).head.as__data__frame
|
|
1550
1621
|
```
|
|
1551
1622
|
|
|
1552
1623
|
```
|
|
@@ -1577,7 +1648,7 @@ To arrange in descending order, use function 'desc'
|
|
|
1577
1648
|
|
|
1578
1649
|
|
|
1579
1650
|
```ruby
|
|
1580
|
-
puts
|
|
1651
|
+
puts flights.arrange(:dep_delay.desc).head.as__data__frame
|
|
1581
1652
|
```
|
|
1582
1653
|
|
|
1583
1654
|
```
|
|
@@ -1610,7 +1681,7 @@ To select specific columns from a dataset we use function 'select':
|
|
|
1610
1681
|
|
|
1611
1682
|
|
|
1612
1683
|
```ruby
|
|
1613
|
-
puts
|
|
1684
|
+
puts flights.select(:year, :month, :day).head.as__data__frame
|
|
1614
1685
|
```
|
|
1615
1686
|
|
|
1616
1687
|
```
|
|
@@ -1627,7 +1698,7 @@ It is also possible to select column in a given range
|
|
|
1627
1698
|
|
|
1628
1699
|
|
|
1629
1700
|
```ruby
|
|
1630
|
-
puts
|
|
1701
|
+
puts flights.select(:year.up_to :day).head.as__data__frame
|
|
1631
1702
|
```
|
|
1632
1703
|
|
|
1633
1704
|
```
|
|
@@ -1644,7 +1715,7 @@ Select all columns that start with a given name sequence
|
|
|
1644
1715
|
|
|
1645
1716
|
|
|
1646
1717
|
```ruby
|
|
1647
|
-
puts
|
|
1718
|
+
puts flights.select(E.starts_with('arr')).head.as__data__frame
|
|
1648
1719
|
```
|
|
1649
1720
|
|
|
1650
1721
|
```
|
|
@@ -1672,7 +1743,7 @@ A helper function that comes in handy when we just want to rearrange column orde
|
|
|
1672
1743
|
|
|
1673
1744
|
|
|
1674
1745
|
```ruby
|
|
1675
|
-
puts
|
|
1746
|
+
puts flights.select(:year, :month, :day, E.everything).head.as__data__frame
|
|
1676
1747
|
```
|
|
1677
1748
|
|
|
1678
1749
|
```
|
|
@@ -1703,13 +1774,13 @@ puts @flights.select(:year, :month, :day, E.everything).head.as__data__frame
|
|
|
1703
1774
|
|
|
1704
1775
|
|
|
1705
1776
|
```ruby
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1777
|
+
flights_sm = flights.
|
|
1778
|
+
select((:year.up_to :day),
|
|
1779
|
+
E.ends_with('delay'),
|
|
1780
|
+
:distance,
|
|
1781
|
+
:air_time)
|
|
1711
1782
|
|
|
1712
|
-
puts
|
|
1783
|
+
puts flights_sm.head.as__data__frame
|
|
1713
1784
|
```
|
|
1714
1785
|
|
|
1715
1786
|
```
|
|
@@ -1724,10 +1795,10 @@ puts @flights_sm.head.as__data__frame
|
|
|
1724
1795
|
|
|
1725
1796
|
|
|
1726
1797
|
```ruby
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
puts
|
|
1798
|
+
flights_sm = flights_sm.
|
|
1799
|
+
mutate(gain: :dep_delay - :arr_delay,
|
|
1800
|
+
speed: :distance / :air_time * 60)
|
|
1801
|
+
puts flights_sm.head.as__data__frame
|
|
1731
1802
|
```
|
|
1732
1803
|
|
|
1733
1804
|
```
|
|
@@ -1747,7 +1818,7 @@ a single value is obtained from the data frame:
|
|
|
1747
1818
|
|
|
1748
1819
|
|
|
1749
1820
|
```ruby
|
|
1750
|
-
puts
|
|
1821
|
+
puts flights.summarise(delay: E.mean(:dep_delay, na__rm: true)).as__data__frame
|
|
1751
1822
|
```
|
|
1752
1823
|
|
|
1753
1824
|
```
|
|
@@ -1759,7 +1830,7 @@ When a data frame is groupe with 'group_by' summaries apply to the given group:
|
|
|
1759
1830
|
|
|
1760
1831
|
|
|
1761
1832
|
```ruby
|
|
1762
|
-
by_day =
|
|
1833
|
+
by_day = flights.group_by(:year, :month, :day)
|
|
1763
1834
|
puts by_day.summarise(delay: :dep_delay.mean(na__rm: true)).head.as__data__frame
|
|
1764
1835
|
```
|
|
1765
1836
|
|
|
@@ -1777,7 +1848,7 @@ Next we put many operations together by pipping them one after the other:
|
|
|
1777
1848
|
|
|
1778
1849
|
|
|
1779
1850
|
```ruby
|
|
1780
|
-
delays =
|
|
1851
|
+
delays = flights.
|
|
1781
1852
|
group_by(:dest).
|
|
1782
1853
|
summarise(
|
|
1783
1854
|
count: E.n,
|
|
@@ -1785,108 +1856,17 @@ delays = @flights.
|
|
|
1785
1856
|
delay: :arr_delay.mean(na__rm: true)).
|
|
1786
1857
|
filter(:count > 20, :dest != "NHL")
|
|
1787
1858
|
|
|
1788
|
-
puts delays.as__data__frame
|
|
1789
|
-
```
|
|
1790
|
-
|
|
1791
|
-
```
|
|
1792
|
-
##
|
|
1793
|
-
## 1
|
|
1794
|
-
## 2
|
|
1795
|
-
## 3
|
|
1796
|
-
## 4
|
|
1797
|
-
## 5
|
|
1798
|
-
## 6
|
|
1799
|
-
## 7 BDL 443 116.00000 7.04854369
|
|
1800
|
-
## 8 BGR 375 378.00000 8.02793296
|
|
1801
|
-
## 9 BHM 297 865.99663 16.87732342
|
|
1802
|
-
## 10 BNA 6333 758.21348 11.81245891
|
|
1803
|
-
## 11 BOS 15508 190.63696 2.91439222
|
|
1804
|
-
## 12 BQN 896 1578.98326 8.24549550
|
|
1805
|
-
## 13 BTV 2589 265.09154 8.95099602
|
|
1806
|
-
## 14 BUF 4681 296.80837 8.94595186
|
|
1807
|
-
## 15 BUR 371 2465.00000 8.17567568
|
|
1808
|
-
## 16 BWI 1781 179.41830 10.72673385
|
|
1809
|
-
## 17 BZN 36 1882.00000 7.60000000
|
|
1810
|
-
## 18 CAE 116 603.55172 41.76415094
|
|
1811
|
-
## 19 CAK 864 397.00000 19.69833729
|
|
1812
|
-
## 20 CHO 52 305.00000 9.50000000
|
|
1813
|
-
## 21 CHS 2884 632.91678 10.59296847
|
|
1814
|
-
## 22 CLE 4573 414.17428 9.18161129
|
|
1815
|
-
## 23 CLT 14064 538.02730 7.36031885
|
|
1816
|
-
## 24 CMH 3524 476.55505 10.60132291
|
|
1817
|
-
## 25 CRW 138 444.00000 14.67164179
|
|
1818
|
-
## 26 CVG 3941 575.15986 15.36456376
|
|
1819
|
-
## 27 DAY 1525 537.10230 12.68048606
|
|
1820
|
-
## 28 DCA 9705 211.00618 9.06695204
|
|
1821
|
-
## 29 DEN 7266 1614.67836 8.60650021
|
|
1822
|
-
## 30 DFW 8738 1383.04303 0.32212685
|
|
1823
|
-
## 31 DSM 569 1020.88752 19.00573614
|
|
1824
|
-
## 32 DTW 9384 498.12852 5.42996346
|
|
1825
|
-
## 33 EGE 213 1735.70892 6.30434783
|
|
1826
|
-
## 34 FLL 12055 1070.06877 8.08212154
|
|
1827
|
-
## 35 GRR 765 605.78170 18.18956044
|
|
1828
|
-
## 36 GSO 1606 449.84184 14.11260054
|
|
1829
|
-
## 37 GSP 849 595.95995 15.93544304
|
|
1830
|
-
## 38 HNL 707 4972.67468 -1.36519258
|
|
1831
|
-
## 39 HOU 2115 1420.15508 7.17618819
|
|
1832
|
-
## 40 IAD 5700 224.84684 13.86420212
|
|
1833
|
-
## 41 IAH 7198 1407.20672 4.24079040
|
|
1834
|
-
## 42 ILM 110 500.00000 4.63551402
|
|
1835
|
-
## 43 IND 2077 652.26288 9.94043412
|
|
1836
|
-
## 44 JAC 25 1875.60000 28.09523810
|
|
1837
|
-
## 45 JAX 2720 824.67610 11.84483416
|
|
1838
|
-
## 46 LAS 5997 2240.96148 0.25772849
|
|
1839
|
-
## 47 LAX 16174 2468.62236 0.54711094
|
|
1840
|
-
## 48 LGB 668 2465.00000 -0.06202723
|
|
1841
|
-
## 49 MCI 2008 1097.69522 14.51405836
|
|
1842
|
-
## 50 MCO 14082 943.11057 5.45464309
|
|
1843
|
-
## 51 MDW 4113 718.04595 12.36422360
|
|
1844
|
-
## 52 MEM 1789 954.20123 10.64531435
|
|
1845
|
-
## 53 MHT 1009 207.02973 14.78755365
|
|
1846
|
-
## 54 MIA 11728 1091.55244 0.29905978
|
|
1847
|
-
## 55 MKE 2802 733.38151 14.16722038
|
|
1848
|
-
## 56 MSN 572 803.95455 20.19604317
|
|
1849
|
-
## 57 MSP 7185 1017.40167 7.27016886
|
|
1850
|
-
## 58 MSY 3799 1177.70571 6.49017497
|
|
1851
|
-
## 59 MVY 221 173.00000 -0.28571429
|
|
1852
|
-
## 60 MYR 59 550.66102 4.60344828
|
|
1853
|
-
## 61 OAK 312 2576.00000 3.07766990
|
|
1854
|
-
## 62 OKC 346 1325.00000 30.61904762
|
|
1855
|
-
## 63 OMA 849 1135.56655 14.69889841
|
|
1856
|
-
## 64 ORD 17283 729.00081 5.87661475
|
|
1857
|
-
## 65 ORF 1536 288.52344 10.94909344
|
|
1858
|
-
## 66 PBI 6554 1028.83811 8.56297210
|
|
1859
|
-
## 67 PDX 1354 2445.56573 5.14157973
|
|
1860
|
-
## 68 PHL 1632 94.32353 10.12719014
|
|
1861
|
-
## 69 PHX 4656 2141.30326 2.09704733
|
|
1862
|
-
## 70 PIT 2875 334.06122 7.68099053
|
|
1863
|
-
## 71 PSE 365 1617.00000 7.87150838
|
|
1864
|
-
## 72 PVD 376 160.00000 16.23463687
|
|
1865
|
-
## 73 PWM 2352 276.12840 11.66040210
|
|
1866
|
-
## 74 RDU 8163 426.75769 10.05238095
|
|
1867
|
-
## 75 RIC 2454 281.40465 20.11125320
|
|
1868
|
-
## 76 ROC 2416 259.25083 11.56064461
|
|
1869
|
-
## 77 RSW 3537 1072.85327 3.23814963
|
|
1870
|
-
## 78 SAN 2737 2437.29923 3.13916574
|
|
1871
|
-
## 79 SAT 686 1578.34111 6.94537178
|
|
1872
|
-
## 80 SAV 804 709.18408 15.12950601
|
|
1873
|
-
## 81 SDF 1157 645.98358 12.66938406
|
|
1874
|
-
## 82 SEA 3923 2412.66531 -1.09909910
|
|
1875
|
-
## 83 SFO 13331 2577.92356 2.67289152
|
|
1876
|
-
## 84 SJC 329 2569.00000 3.44817073
|
|
1877
|
-
## 85 SJU 5819 1599.83365 2.52052659
|
|
1878
|
-
## 86 SLC 2467 1986.98662 0.17625459
|
|
1879
|
-
## 87 SMF 284 2521.00000 12.10992908
|
|
1880
|
-
## 88 SNA 825 2434.00000 -7.86822660
|
|
1881
|
-
## 89 SRQ 1211 1044.65153 3.08243131
|
|
1882
|
-
## 90 STL 4339 878.72321 11.07846451
|
|
1883
|
-
## 91 STT 522 1626.98276 -3.83590734
|
|
1884
|
-
## 92 SYR 1761 205.92164 8.90392501
|
|
1885
|
-
## 93 TPA 7466 1003.93557 7.40852503
|
|
1886
|
-
## 94 TUL 315 1215.00000 33.65986395
|
|
1887
|
-
## 95 TVC 101 652.38614 12.96842105
|
|
1888
|
-
## 96 TYS 631 638.80983 24.06920415
|
|
1889
|
-
## 97 XNA 1036 1142.50579 7.46572581
|
|
1859
|
+
puts delays.as__data__frame.head
|
|
1860
|
+
```
|
|
1861
|
+
|
|
1862
|
+
```
|
|
1863
|
+
## dest count dist delay
|
|
1864
|
+
## 1 ABQ 254 1826.0000 4.381890
|
|
1865
|
+
## 2 ACK 265 199.0000 4.852273
|
|
1866
|
+
## 3 ALB 439 143.0000 14.397129
|
|
1867
|
+
## 4 ATL 17215 757.1082 11.300113
|
|
1868
|
+
## 5 AUS 2439 1514.2530 6.019909
|
|
1869
|
+
## 6 AVL 275 583.5818 8.003831
|
|
1890
1870
|
```
|
|
1891
1871
|
|
|
1892
1872
|
# Using Data Table
|
|
@@ -1897,9 +1877,9 @@ R.library('data.table')
|
|
|
1897
1877
|
R.install_and_loads('curl')
|
|
1898
1878
|
|
|
1899
1879
|
input = "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
|
|
1900
|
-
|
|
1901
|
-
puts
|
|
1902
|
-
puts
|
|
1880
|
+
flights = R.fread(input)
|
|
1881
|
+
puts flights
|
|
1882
|
+
puts flights.dim
|
|
1903
1883
|
```
|
|
1904
1884
|
|
|
1905
1885
|
```
|
|
@@ -1958,17 +1938,17 @@ puts data_table.ID
|
|
|
1958
1938
|
|
|
1959
1939
|
```ruby
|
|
1960
1940
|
# subset rows in i
|
|
1961
|
-
ans =
|
|
1941
|
+
ans = flights[(:origin.eq "JFK") & (:month.eq 6)]
|
|
1962
1942
|
puts ans.head
|
|
1963
1943
|
|
|
1964
1944
|
# Get the first two rows from flights.
|
|
1965
1945
|
|
|
1966
|
-
ans =
|
|
1946
|
+
ans = flights[(1..2)]
|
|
1967
1947
|
puts ans
|
|
1968
1948
|
|
|
1969
1949
|
# Sort flights first by column origin in ascending order, and then by dest in descending order:
|
|
1970
1950
|
|
|
1971
|
-
# ans =
|
|
1951
|
+
# ans = flights[E.order(:origin, -(:dest))]
|
|
1972
1952
|
# puts ans.head
|
|
1973
1953
|
```
|
|
1974
1954
|
|
|
@@ -2000,15 +1980,15 @@ puts ans
|
|
|
2000
1980
|
# Select column(s) in j
|
|
2001
1981
|
# select arr_delay column, but return it as a vector.
|
|
2002
1982
|
|
|
2003
|
-
ans =
|
|
1983
|
+
ans = flights[:all, :arr_delay]
|
|
2004
1984
|
puts ans.head
|
|
2005
1985
|
|
|
2006
1986
|
# Select arr_delay column, but return as a data.table instead.
|
|
2007
1987
|
|
|
2008
|
-
ans =
|
|
1988
|
+
ans = flights[:all, :arr_delay.list]
|
|
2009
1989
|
puts ans.head
|
|
2010
1990
|
|
|
2011
|
-
ans =
|
|
1991
|
+
ans = flights[:all, E.list(:arr_delay, :dep_delay)]
|
|
2012
1992
|
```
|
|
2013
1993
|
|
|
2014
1994
|
```
|
|
@@ -2033,68 +2013,42 @@ the data frame with the necessary data:
|
|
|
2033
2013
|
|
|
2034
2014
|
```ruby
|
|
2035
2015
|
# copy the R variable :mtcars to the Ruby mtcars variable
|
|
2036
|
-
|
|
2016
|
+
mtcars = ~:mtcars
|
|
2037
2017
|
|
|
2038
2018
|
# create a new column 'car_name' to store the car names so that it can be
|
|
2039
2019
|
# used for plotting. The 'rownames' of the data frame cannot be used as
|
|
2040
2020
|
# data for plotting
|
|
2041
|
-
|
|
2021
|
+
mtcars.car_name = R.rownames(:mtcars)
|
|
2042
2022
|
|
|
2043
2023
|
# compute normalized mpg and add it to a new column called mpg_z
|
|
2044
2024
|
# Note that the mean value for mpg can be obtained by calling the 'mean'
|
|
2045
2025
|
# function on the vector 'mtcars.mpg'. The same with the standard
|
|
2046
2026
|
# deviation 'sd'. The vector is then rounded to two digits with 'round 2'
|
|
2047
|
-
|
|
2027
|
+
mtcars.mpg_z = ((mtcars.mpg - mtcars.mpg.mean)/mtcars.mpg.sd).round 2
|
|
2048
2028
|
|
|
2049
2029
|
# create a new column 'mpg_type'. Function 'ifelse' is a vectorized function
|
|
2050
2030
|
# that looks at every element of the mpg_z vector and if the value is below
|
|
2051
2031
|
# 0, returns 'below', otherwise returns 'above'
|
|
2052
|
-
|
|
2032
|
+
mtcars.mpg_type = (mtcars.mpg_z < 0).ifelse("below", "above")
|
|
2053
2033
|
|
|
2054
2034
|
# order the mtcars data set by the mpg_z vector from smaller to larger values
|
|
2055
|
-
|
|
2035
|
+
mtcars = mtcars[mtcars.mpg_z.order, :all]
|
|
2056
2036
|
|
|
2057
2037
|
# convert the car_name column to a factor to retain sorted order in plot
|
|
2058
|
-
|
|
2038
|
+
mtcars.car_name = mtcars.car_name.factor levels: mtcars.car_name
|
|
2059
2039
|
|
|
2060
2040
|
# let's look at the final data frame
|
|
2061
|
-
puts
|
|
2041
|
+
puts mtcars.head
|
|
2062
2042
|
```
|
|
2063
2043
|
|
|
2064
2044
|
```
|
|
2065
|
-
## mpg cyl
|
|
2066
|
-
## Cadillac Fleetwood 10.4 8
|
|
2067
|
-
## Lincoln Continental 10.4 8
|
|
2068
|
-
## Camaro Z28 13.3 8
|
|
2069
|
-
## Duster 360 14.3 8
|
|
2070
|
-
## Chrysler Imperial 14.7 8
|
|
2071
|
-
## Maserati Bora 15.0 8
|
|
2072
|
-
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
|
|
2073
|
-
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
|
|
2074
|
-
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
|
|
2075
|
-
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
|
|
2076
|
-
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
|
|
2077
|
-
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
|
|
2078
|
-
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
|
|
2079
|
-
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
|
|
2080
|
-
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
|
|
2081
|
-
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
|
|
2082
|
-
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
|
|
2083
|
-
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
|
|
2084
|
-
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
|
|
2085
|
-
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
|
|
2086
|
-
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
|
|
2087
|
-
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
|
|
2088
|
-
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
|
|
2089
|
-
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
|
|
2090
|
-
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
|
|
2091
|
-
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
|
|
2092
|
-
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
|
|
2093
|
-
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
|
|
2094
|
-
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
|
|
2095
|
-
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
|
|
2096
|
-
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
|
|
2097
|
-
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
|
|
2045
|
+
## mpg cyl disp hp drat wt qsec vs am gear carb
|
|
2046
|
+
## Cadillac Fleetwood 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
|
|
2047
|
+
## Lincoln Continental 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
|
|
2048
|
+
## Camaro Z28 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
|
|
2049
|
+
## Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
|
|
2050
|
+
## Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
|
|
2051
|
+
## Maserati Bora 15.0 8 301 335 3.54 3.570 14.60 0 1 5 8
|
|
2098
2052
|
## car_name mpg_z mpg_type
|
|
2099
2053
|
## Cadillac Fleetwood Cadillac Fleetwood -1.61 below
|
|
2100
2054
|
## Lincoln Continental Lincoln Continental -1.61 below
|
|
@@ -2102,32 +2056,6 @@ puts @mtcars
|
|
|
2102
2056
|
## Duster 360 Duster 360 -0.96 below
|
|
2103
2057
|
## Chrysler Imperial Chrysler Imperial -0.89 below
|
|
2104
2058
|
## Maserati Bora Maserati Bora -0.84 below
|
|
2105
|
-
## Merc 450SLC Merc 450SLC -0.81 below
|
|
2106
|
-
## AMC Javelin AMC Javelin -0.81 below
|
|
2107
|
-
## Dodge Challenger Dodge Challenger -0.76 below
|
|
2108
|
-
## Ford Pantera L Ford Pantera L -0.71 below
|
|
2109
|
-
## Merc 450SE Merc 450SE -0.61 below
|
|
2110
|
-
## Merc 450SL Merc 450SL -0.46 below
|
|
2111
|
-
## Merc 280C Merc 280C -0.38 below
|
|
2112
|
-
## Valiant Valiant -0.33 below
|
|
2113
|
-
## Hornet Sportabout Hornet Sportabout -0.23 below
|
|
2114
|
-
## Merc 280 Merc 280 -0.15 below
|
|
2115
|
-
## Pontiac Firebird Pontiac Firebird -0.15 below
|
|
2116
|
-
## Ferrari Dino Ferrari Dino -0.06 below
|
|
2117
|
-
## Mazda RX4 Mazda RX4 0.15 above
|
|
2118
|
-
## Mazda RX4 Wag Mazda RX4 Wag 0.15 above
|
|
2119
|
-
## Hornet 4 Drive Hornet 4 Drive 0.22 above
|
|
2120
|
-
## Volvo 142E Volvo 142E 0.22 above
|
|
2121
|
-
## Toyota Corona Toyota Corona 0.23 above
|
|
2122
|
-
## Datsun 710 Datsun 710 0.45 above
|
|
2123
|
-
## Merc 230 Merc 230 0.45 above
|
|
2124
|
-
## Merc 240D Merc 240D 0.72 above
|
|
2125
|
-
## Porsche 914-2 Porsche 914-2 0.98 above
|
|
2126
|
-
## Fiat X1-9 Fiat X1-9 1.20 above
|
|
2127
|
-
## Honda Civic Honda Civic 1.71 above
|
|
2128
|
-
## Lotus Europa Lotus Europa 1.71 above
|
|
2129
|
-
## Fiat 128 Fiat 128 2.04 above
|
|
2130
|
-
## Toyota Corolla Toyota Corolla 2.29 above
|
|
2131
2059
|
```
|
|
2132
2060
|
Now, let's plot the diverging bar plot. When using gKnit, there is no need to call
|
|
2133
2061
|
'R.awt' to create a plotting device, since gKnit does take care of it. Galaaz
|
|
@@ -2149,19 +2077,604 @@ but in this graph we want the bars to be horizontally layed so we add 'coord\_fl
|
|
|
2149
2077
|
```ruby
|
|
2150
2078
|
require 'ggplot'
|
|
2151
2079
|
|
|
2152
|
-
puts
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2157
|
-
|
|
2158
|
-
|
|
2159
|
-
|
|
2080
|
+
puts mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
|
|
2081
|
+
R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
|
|
2082
|
+
R.scale_fill_manual(name: 'Mileage',
|
|
2083
|
+
labels: R.c('Above Average', 'Below Average'),
|
|
2084
|
+
values: R.c('above': '#00ba38', 'below': '#f8766d')) +
|
|
2085
|
+
R.labs(subtitle: "Normalised mileage from 'mtcars'",
|
|
2086
|
+
title: "Diverging Bars") +
|
|
2087
|
+
R.coord_flip
|
|
2160
2088
|
```
|
|
2161
2089
|
|
|
2162
2090
|
|
|
2163
2091
|
<!-- -->
|
|
2164
2092
|
|
|
2093
|
+
# Coding with Tidyverse
|
|
2094
|
+
|
|
2095
|
+
In R, and when coding with 'tidyverse', arguments to a function are usually not
|
|
2096
|
+
*referentially transparent*. That is, you can’t replace a value with a seemingly equivalent
|
|
2097
|
+
object that you’ve defined elsewhere. To see the problem, let's first define a data frame:
|
|
2098
|
+
|
|
2099
|
+
|
|
2100
|
+
```ruby
|
|
2101
|
+
df = R.data__frame(x: (1..3), y: (3..1))
|
|
2102
|
+
puts df
|
|
2103
|
+
```
|
|
2104
|
+
|
|
2105
|
+
```
|
|
2106
|
+
## x y
|
|
2107
|
+
## 1 1 3
|
|
2108
|
+
## 2 2 2
|
|
2109
|
+
## 3 3 1
|
|
2110
|
+
```
|
|
2111
|
+
|
|
2112
|
+
and now, let's look at this code:
|
|
2113
|
+
|
|
2114
|
+
|
|
2115
|
+
```r
|
|
2116
|
+
my_var <- x
|
|
2117
|
+
filter(df, my_var == 1)
|
|
2118
|
+
```
|
|
2119
|
+
It generates the following error: "object 'x' not found".
|
|
2120
|
+
|
|
2121
|
+
However, in Galaaz, arguments are referentially transparent as can be seen by the
|
|
2122
|
+
code below. Note initially that 'my_var = :x' will not give the error "object 'x' not found"
|
|
2123
|
+
since ':x' is treated as an expression and assigned to my\_var. Then when doing (my\_var.eq 1),
|
|
2124
|
+
my\_var is a variable that resolves to ':x' and it becomes equivalent to (:x.eq 1) which is
|
|
2125
|
+
what we want.
|
|
2126
|
+
|
|
2127
|
+
|
|
2128
|
+
```ruby
|
|
2129
|
+
my_var = :x
|
|
2130
|
+
puts df.filter(my_var.eq 1)
|
|
2131
|
+
```
|
|
2132
|
+
|
|
2133
|
+
```
|
|
2134
|
+
## x y
|
|
2135
|
+
## 1 1 3
|
|
2136
|
+
```
|
|
2137
|
+
As stated by Hadley:
|
|
2138
|
+
|
|
2139
|
+
> dplyr code is ambiguous. Depending on what variables are defined where,
|
|
2140
|
+
> filter(df, x == y) could be equivalent to any of:
|
|
2141
|
+
|
|
2142
|
+
```
|
|
2143
|
+
df[df$x == df$y, ]
|
|
2144
|
+
df[df$x == y, ]
|
|
2145
|
+
df[x == df$y, ]
|
|
2146
|
+
df[x == y, ]
|
|
2147
|
+
```
|
|
2148
|
+
In galaaz this ambiguity does not exist, filter(df, x.eq y) is not a valid expression as
|
|
2149
|
+
expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements
|
|
2150
|
+
of the 'x' column that are equal to a previously defined y variable. Finally in
|
|
2151
|
+
filter(df, :x.eq :y) we are looking for elements in which the 'x' column value is equal to
|
|
2152
|
+
the 'y' column value. This can be seen in the following two chunks of code:
|
|
2153
|
+
|
|
2154
|
+
|
|
2155
|
+
```ruby
|
|
2156
|
+
y = 1
|
|
2157
|
+
x = 2
|
|
2158
|
+
|
|
2159
|
+
# looking for values where the 'x' column is equal to the 'y' column
|
|
2160
|
+
puts df.filter(:x.eq :y)
|
|
2161
|
+
```
|
|
2162
|
+
|
|
2163
|
+
```
|
|
2164
|
+
## x y
|
|
2165
|
+
## 1 2 2
|
|
2166
|
+
```
|
|
2167
|
+
|
|
2168
|
+
|
|
2169
|
+
```ruby
|
|
2170
|
+
# looking for values where the 'x' column is equal to the 'y' variable
|
|
2171
|
+
# in this case, the number 1
|
|
2172
|
+
puts df.filter(:x.eq y)
|
|
2173
|
+
```
|
|
2174
|
+
|
|
2175
|
+
```
|
|
2176
|
+
## x y
|
|
2177
|
+
## 1 1 3
|
|
2178
|
+
```
|
|
2179
|
+
## Writing a function that applies to different data sets
|
|
2180
|
+
|
|
2181
|
+
Let's suppose that we want to write a function that receives as the first argument a data frame
|
|
2182
|
+
and as second argument an expression that adds a column to the data frame that is equal to the
|
|
2183
|
+
sum of elements in column 'a' plus 'x'.
|
|
2184
|
+
|
|
2185
|
+
Here is the intended behaviour using the 'mutate' function of 'dplyr':
|
|
2186
|
+
|
|
2187
|
+
```
|
|
2188
|
+
mutate(df1, y = a + x)
|
|
2189
|
+
mutate(df2, y = a + x)
|
|
2190
|
+
mutate(df3, y = a + x)
|
|
2191
|
+
mutate(df4, y = a + x)
|
|
2192
|
+
```
|
|
2193
|
+
The naive approach to writing an R function to solve this problem is:
|
|
2194
|
+
|
|
2195
|
+
```
|
|
2196
|
+
mutate_y <- function(df) {
|
|
2197
|
+
mutate(df, y = a + x)
|
|
2198
|
+
}
|
|
2199
|
+
```
|
|
2200
|
+
Unfortunately, in R, this function can fail silently if one of the variables isn’t present
|
|
2201
|
+
in the data frame, but is present in the global environment. We will not go through here how
|
|
2202
|
+
to solve this problem in R.
|
|
2203
|
+
|
|
2204
|
+
In Galaaz the method mutate_y below will work fine and will never fail silently.
|
|
2205
|
+
|
|
2206
|
+
|
|
2207
|
+
```ruby
|
|
2208
|
+
def mutate_y(df)
|
|
2209
|
+
df.mutate(:y.assign :a + :x)
|
|
2210
|
+
end
|
|
2211
|
+
```
|
|
2212
|
+
Here we create a data frame that has only one column named 'x':
|
|
2213
|
+
|
|
2214
|
+
|
|
2215
|
+
```ruby
|
|
2216
|
+
df1 = R.data__frame(x: (1..3))
|
|
2217
|
+
puts df1
|
|
2218
|
+
```
|
|
2219
|
+
|
|
2220
|
+
```
|
|
2221
|
+
## x
|
|
2222
|
+
## 1 1
|
|
2223
|
+
## 2 2
|
|
2224
|
+
## 3 3
|
|
2225
|
+
```
|
|
2226
|
+
|
|
2227
|
+
Note that method mutate_y will fail independently of the fact that variable 'a' is defined and
|
|
2228
|
+
in the scope of the method. Variable 'a' has no relationship with the symbol ':a' used in the
|
|
2229
|
+
definition of 'mutate\_y' above:
|
|
2230
|
+
|
|
2231
|
+
|
|
2232
|
+
```ruby
|
|
2233
|
+
a = 10
|
|
2234
|
+
mutate_y(df1)
|
|
2235
|
+
```
|
|
2236
|
+
|
|
2237
|
+
```
|
|
2238
|
+
## Message:
|
|
2239
|
+
## Error in mutate_impl(.data, dots) :
|
|
2240
|
+
## Evaluation error: object 'a' not found.
|
|
2241
|
+
## In addition: Warning message:
|
|
2242
|
+
## In mutate_impl(.data, dots) :
|
|
2243
|
+
## mismatched protect/unprotect (unprotect with empty protect stack) (RError)
|
|
2244
|
+
## Translated to internal error
|
|
2245
|
+
```
|
|
2246
|
+
## Different expressions
|
|
2247
|
+
|
|
2248
|
+
Let's move to the next problem as presented by Hadley where trying to write a function in R
|
|
2249
|
+
that will receive two arguments, the first a variable and the second an expression, is not trivial.
|
|
2250
|
+
Below we create a data frame and we want to write a function that groups data by a variable and
|
|
2251
|
+
summarises it by an expression:
|
|
2252
|
+
|
|
2253
|
+
|
|
2254
|
+
```r
|
|
2255
|
+
set.seed(123)
|
|
2256
|
+
|
|
2257
|
+
df <- data.frame(
|
|
2258
|
+
g1 = c(1, 1, 2, 2, 2),
|
|
2259
|
+
g2 = c(1, 2, 1, 2, 1),
|
|
2260
|
+
a = sample(5),
|
|
2261
|
+
b = sample(5)
|
|
2262
|
+
)
|
|
2263
|
+
|
|
2264
|
+
as.data.frame(df)
|
|
2265
|
+
```
|
|
2266
|
+
|
|
2267
|
+
```
|
|
2268
|
+
## g1 g2 a b
|
|
2269
|
+
## 1 1 1 2 1
|
|
2270
|
+
## 2 1 2 4 3
|
|
2271
|
+
## 3 2 1 5 4
|
|
2272
|
+
## 4 2 2 3 2
|
|
2273
|
+
## 5 2 1 1 5
|
|
2274
|
+
```
|
|
2275
|
+
|
|
2276
|
+
```r
|
|
2277
|
+
d2 <- df %>%
|
|
2278
|
+
group_by(g1) %>%
|
|
2279
|
+
summarise(a = mean(a))
|
|
2280
|
+
|
|
2281
|
+
as.data.frame(d2)
|
|
2282
|
+
```
|
|
2283
|
+
|
|
2284
|
+
```
|
|
2285
|
+
## g1 a
|
|
2286
|
+
## 1 1 3
|
|
2287
|
+
## 2 2 3
|
|
2288
|
+
```
|
|
2289
|
+
|
|
2290
|
+
```r
|
|
2291
|
+
d2 <- df %>%
|
|
2292
|
+
group_by(g2) %>%
|
|
2293
|
+
summarise(a = mean(a))
|
|
2294
|
+
|
|
2295
|
+
as.data.frame(d2)
|
|
2296
|
+
```
|
|
2297
|
+
|
|
2298
|
+
```
|
|
2299
|
+
## g2 a
|
|
2300
|
+
## 1 1 2.666667
|
|
2301
|
+
## 2 2 3.500000
|
|
2302
|
+
```
|
|
2303
|
+
|
|
2304
|
+
As shown by Hadley, one might expect this function to do the trick:
|
|
2305
|
+
|
|
2306
|
+
|
|
2307
|
+
```r
|
|
2308
|
+
my_summarise <- function(df, group_var) {
|
|
2309
|
+
df %>%
|
|
2310
|
+
group_by(group_var) %>%
|
|
2311
|
+
summarise(a = mean(a))
|
|
2312
|
+
}
|
|
2313
|
+
|
|
2314
|
+
# my_summarise(df, g1)
|
|
2315
|
+
#> Error: Column `group_var` is unknown
|
|
2316
|
+
```
|
|
2317
|
+
|
|
2318
|
+
In order to solve this problem, coding with dplyr requires the introduction of many new concepts
|
|
2319
|
+
and functions such as 'quo', 'quos', 'enquo', 'enquos', '!!' (bang bang), '!!!' (triple bang).
|
|
2320
|
+
Again, we'll leave to Hadley the explanation of how to use all those functions.
|
|
2321
|
+
|
|
2322
|
+
Now, let's try to implement the same function in galaaz. The next code block first prints the
|
|
2323
|
+
'df' data frame defined previously in R (to access an R variable from Galaaz, we use the tilde
|
|
2324
|
+
operator '~' applied to the R variable name as symbol, i.e., ':df').
|
|
2325
|
+
|
|
2326
|
+
|
|
2327
|
+
```ruby
|
|
2328
|
+
puts ~:df
|
|
2329
|
+
```
|
|
2330
|
+
|
|
2331
|
+
```
|
|
2332
|
+
## g1 g2 a b
|
|
2333
|
+
## 1 1 1 2 1
|
|
2334
|
+
## 2 1 2 4 3
|
|
2335
|
+
## 3 2 1 5 4
|
|
2336
|
+
## 4 2 2 3 2
|
|
2337
|
+
## 5 2 1 1 5
|
|
2338
|
+
```
|
|
2339
|
+
|
|
2340
|
+
We then create the 'my_summarize' method and call it passing the R data frame and
|
|
2341
|
+
the group by variable ':g1':
|
|
2342
|
+
|
|
2343
|
+
|
|
2344
|
+
```ruby
|
|
2345
|
+
def my_summarize(df, group_var)
|
|
2346
|
+
df.group_by(group_var).
|
|
2347
|
+
summarize(a: :a.mean)
|
|
2348
|
+
end
|
|
2349
|
+
|
|
2350
|
+
puts my_summarize(:df, :g1).as__data__frame
|
|
2351
|
+
```
|
|
2352
|
+
|
|
2353
|
+
```
|
|
2354
|
+
## g1 a
|
|
2355
|
+
## 1 1 3
|
|
2356
|
+
## 2 2 3
|
|
2357
|
+
```
|
|
2358
|
+
|
|
2359
|
+
It works!!! Well, let's make sure this was not just some coincidence
|
|
2360
|
+
|
|
2361
|
+
|
|
2362
|
+
```ruby
|
|
2363
|
+
puts my_summarize(:df, :g2).as__data__frame
|
|
2364
|
+
```
|
|
2365
|
+
|
|
2366
|
+
```
|
|
2367
|
+
## g2 a
|
|
2368
|
+
## 1 1 2.666667
|
|
2369
|
+
## 2 2 3.500000
|
|
2370
|
+
```
|
|
2371
|
+
|
|
2372
|
+
Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby
|
|
2373
|
+
code. If you've ever done NSE in R, this certainly feels much safer and easier to implement.
|
|
2374
|
+
|
|
2375
|
+
## Different input variables
|
|
2376
|
+
|
|
2377
|
+
In the previous section we've managed to get rid of all NSE formulation for a simple example, but
|
|
2378
|
+
does this remain true for more complex examples, or will the Galaaz way prove impractical for
|
|
2379
|
+
more complex code?
|
|
2380
|
+
|
|
2381
|
+
In the next example Hadley proposes that we write a function that given an expression such as 'a'
|
|
2382
|
+
or 'a * b', calculates three summaries. What we want is a function that does the same as these R
|
|
2383
|
+
statements:
|
|
2384
|
+
|
|
2385
|
+
```
|
|
2386
|
+
summarise(df, mean = mean(a), sum = sum(a), n = n())
|
|
2387
|
+
#> # A tibble: 1 x 3
|
|
2388
|
+
#> mean sum n
|
|
2389
|
+
#> <dbl> <int> <int>
|
|
2390
|
+
#> 1 3 15 5
|
|
2391
|
+
|
|
2392
|
+
summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
|
|
2393
|
+
#> # A tibble: 1 x 3
|
|
2394
|
+
#> mean sum n
|
|
2395
|
+
#> <dbl> <int> <int>
|
|
2396
|
+
#> 1 9 45 5
|
|
2397
|
+
```
|
|
2398
|
+
|
|
2399
|
+
Let's try it in galaaz:
|
|
2400
|
+
|
|
2401
|
+
|
|
2402
|
+
```ruby
|
|
2403
|
+
def my_summarise2(df, expr)
|
|
2404
|
+
df.summarize(
|
|
2405
|
+
mean: E.mean(expr),
|
|
2406
|
+
sum: E.sum(expr),
|
|
2407
|
+
n: E.n
|
|
2408
|
+
)
|
|
2409
|
+
end
|
|
2410
|
+
|
|
2411
|
+
puts my_summarise2((~:df), :a)
|
|
2412
|
+
puts "\n"
|
|
2413
|
+
puts my_summarise2((~:df), :a * :b)
|
|
2414
|
+
```
|
|
2415
|
+
|
|
2416
|
+
```
|
|
2417
|
+
## mean sum n
|
|
2418
|
+
## 1 3 15 5
|
|
2419
|
+
##
|
|
2420
|
+
## mean sum n
|
|
2421
|
+
## 1 9 45 5
|
|
2422
|
+
```
|
|
2423
|
+
|
|
2424
|
+
Once again, there is no need to use any special theory or functions. The only point to be
|
|
2425
|
+
careful about is the use of 'E' to build expressions from functions 'mean', 'sum' and 'n'.
|
|
2426
|
+
|
|
2427
|
+
## Different input and output variable
|
|
2428
|
+
|
|
2429
|
+
Now the next challenge presented by Hadley is to vary the name of the output variables based on
|
|
2430
|
+
the received expression. So, if the input expression is 'a', we want our data frame columns to
|
|
2431
|
+
be named 'mean\_a' and 'sum\_a'. Now, if the input expression is 'b', columns
|
|
2432
|
+
should be named 'mean\_b' and 'sum\_b'.
|
|
2433
|
+
|
|
2434
|
+
```
|
|
2435
|
+
mutate(df, mean_a = mean(a), sum_a = sum(a))
|
|
2436
|
+
#> # A tibble: 5 x 6
|
|
2437
|
+
#> g1 g2 a b mean_a sum_a
|
|
2438
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
|
2439
|
+
#> 1 1 1 1 3 3 15
|
|
2440
|
+
#> 2 1 2 4 2 3 15
|
|
2441
|
+
#> 3 2 1 2 1 3 15
|
|
2442
|
+
#> 4 2 2 5 4 3 15
|
|
2443
|
+
#> # … with 1 more row
|
|
2444
|
+
|
|
2445
|
+
mutate(df, mean_b = mean(b), sum_b = sum(b))
|
|
2446
|
+
#> # A tibble: 5 x 6
|
|
2447
|
+
#> g1 g2 a b mean_b sum_b
|
|
2448
|
+
#> <dbl> <dbl> <int> <int> <dbl> <int>
|
|
2449
|
+
#> 1 1 1 1 3 3 15
|
|
2450
|
+
#> 2 1 2 4 2 3 15
|
|
2451
|
+
#> 3 2 1 2 1 3 15
|
|
2452
|
+
#> 4 2 2 5 4 3 15
|
|
2453
|
+
#> # … with 1 more row
|
|
2454
|
+
```
|
|
2455
|
+
In order to solve this problem in R, Hadley needs to introduce some more new functions and notations:
|
|
2456
|
+
'quo_name' and the ':=' operator from package 'rlang'
|
|
2457
|
+
|
|
2458
|
+
Here is our Ruby code:
|
|
2459
|
+
|
|
2460
|
+
|
|
2461
|
+
```ruby
|
|
2462
|
+
def my_mutate(df, expr)
|
|
2463
|
+
mean_name = "mean_#{expr.to_s}"
|
|
2464
|
+
sum_name = "sum_#{expr.to_s}"
|
|
2465
|
+
|
|
2466
|
+
df.mutate(mean_name => E.mean(expr),
|
|
2467
|
+
sum_name => E.sum(expr))
|
|
2468
|
+
end
|
|
2469
|
+
|
|
2470
|
+
puts my_mutate((~:df), :a)
|
|
2471
|
+
puts "\n"
|
|
2472
|
+
puts my_mutate((~:df), :b)
|
|
2473
|
+
```
|
|
2474
|
+
|
|
2475
|
+
```
|
|
2476
|
+
## g1 g2 a b mean_a sum_a
|
|
2477
|
+
## 1 1 1 2 1 3 15
|
|
2478
|
+
## 2 1 2 4 3 3 15
|
|
2479
|
+
## 3 2 1 5 4 3 15
|
|
2480
|
+
## 4 2 2 3 2 3 15
|
|
2481
|
+
## 5 2 1 1 5 3 15
|
|
2482
|
+
##
|
|
2483
|
+
## g1 g2 a b mean_b sum_b
|
|
2484
|
+
## 1 1 1 2 1 3 15
|
|
2485
|
+
## 2 1 2 4 3 3 15
|
|
2486
|
+
## 3 2 1 5 4 3 15
|
|
2487
|
+
## 4 2 2 3 2 3 15
|
|
2488
|
+
## 5 2 1 1 5 3 15
|
|
2489
|
+
```
|
|
2490
|
+
It really seems that "Non Standard Evaluation" is actually quite standard in Galaaz! But, you
|
|
2491
|
+
might have noticed a small change in the way the arguments to the mutate method were called.
|
|
2492
|
+
In a previous example we used df.summarise(mean: E.mean(:a), ...) where the column name was
|
|
2493
|
+
followed by a ':' colon. In this example, we have df.mutate(mean_name => E.mean(expr), ...)
|
|
2494
|
+
and variable mean\_name is not followed by ':' but by '=>'. This is standard Ruby notation.
|
|
2495
|
+
|
|
2496
|
+
[explain....]
|
|
2497
|
+
|
|
2498
|
+
## Capturing multiple variables
|
|
2499
|
+
|
|
2500
|
+
Moving on with new complexities, Hadley proposes that we solve the problem in which the
|
|
2501
|
+
summarise function will receive any number of grouping variables.
|
|
2502
|
+
|
|
2503
|
+
This again is quite standard Ruby. In order to receive an undefined number of parameters
|
|
2504
|
+
the parameter is preceded by '*':
|
|
2505
|
+
|
|
2506
|
+
|
|
2507
|
+
```ruby
|
|
2508
|
+
def my_summarise3(df, *group_vars)
|
|
2509
|
+
df.group_by(*group_vars).
|
|
2510
|
+
summarise(a: E.mean(:a))
|
|
2511
|
+
end
|
|
2512
|
+
|
|
2513
|
+
puts my_summarise3((~:df), :g1, :g2).as__data__frame
|
|
2514
|
+
```
|
|
2515
|
+
|
|
2516
|
+
```
|
|
2517
|
+
## g1 g2 a
|
|
2518
|
+
## 1 1 1 2
|
|
2519
|
+
## 2 1 2 4
|
|
2520
|
+
## 3 2 1 3
|
|
2521
|
+
## 4 2 2 3
|
|
2522
|
+
```
|
|
2523
|
+
|
|
2524
|
+
## Why does R require NSE and Galaaz does not?
|
|
2525
|
+
|
|
2526
|
+
NSE introduces a number of new concepts, such as 'quoting', 'quasiquotation', 'unquoting' and
|
|
2527
|
+
'unquote-splicing', while in Galaaz none of those concepts are needed. What gives?
|
|
2528
|
+
|
|
2529
|
+
R is an extremely flexible language and it has lazy evaluation of parameters. When in R a
|
|
2530
|
+
function is called as 'summarise(df, a = b)', the summarise function receives the literal
|
|
2531
|
+
'a = b' parameter and can work with this as if it were a string. In R, it is not clear what
|
|
2532
|
+
a and b are, they can be expressions or they can be variables, it is up to the function to
|
|
2533
|
+
decide what 'a = b' means.
|
|
2534
|
+
|
|
2535
|
+
In Ruby, there is no lazy evaluation of parameters and 'a' is always a variable and so is 'b'.
|
|
2536
|
+
Variables assume their value as soon as they are used, so 'x = a' is immediately evaluated and
|
|
2537
|
+
variable 'x' will receive the value of variable 'a' as soon as the Ruby statement is executed.
|
|
2538
|
+
Ruby also provides the notion of a symbol; ':a' is a symbol and does not evaluate to anything.
|
|
2539
|
+
Galaaz uses Ruby symbols to build expressions that are not bound to anything: ':a.eq :b' is
|
|
2540
|
+
clearly an expression and has no relationship whatsoever with the statement 'a = b'. By using
|
|
2541
|
+
symbols, variables and expressions all the possible ambiguities that are found in R are
|
|
2542
|
+
eliminated in Galaaz.
|
|
2543
|
+
|
|
2544
|
+
The main problem that remains, is that in R, functions are not clearly documented as what type
|
|
2545
|
+
of input they are expecting, they might be expecting regular variables or they might be
|
|
2546
|
+
expecting expressions and the R function will know how to deal with an input of the form
|
|
2547
|
+
'a = b', now for the Ruby developer it might not be immediately clear if it should call the
|
|
2548
|
+
function passing the value 'true' if variable 'a' is equal to variable 'b' or if it should
|
|
2549
|
+
call the function passing the expression ':a.eq :b'.
|
|
2550
|
+
|
|
2551
|
+
|
|
2552
|
+
## Advanced dplyr features
|
|
2553
|
+
|
|
2554
|
+
In the blog: Programming with dplyr by using dplyr (https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/) Iñaki Úcar shows surprise that some R users are trying to code in dplyr avoiding
|
|
2555
|
+
the use of NSE. For instance he says:
|
|
2556
|
+
|
|
2557
|
+
> Take the example of seplyr. It stands for standard evaluation dplyr, and enables us to
|
|
2558
|
+
> program over dplyr without having “to bring in (or study) any deep-theory or
|
|
2559
|
+
> heavy-weight tools such as rlang/tidyeval”.
|
|
2560
|
+
|
|
2561
|
+
For me, there isn't really any surprise that users are trying to avoid dplyr deep-theory. R
|
|
2562
|
+
users frequently are not programmers and learning to code is already hard business, on top
|
|
2563
|
+
of that, having to learn how to 'quote' or 'enquo' or 'quos' or 'enquos' is not necessarily
|
|
2564
|
+
a 'piece of cake'. So much so, that 'tidyeval' has some more advanced functions that instead
|
|
2565
|
+
of using quoted expressions, uses strings as arguments.
|
|
2566
|
+
|
|
2567
|
+
In the following examples, we show the use of functions 'group\_by\_at', 'summarise\_at' and
|
|
2568
|
+
'rename\_at' that receive strings as argument. The data frame used is 'starwars', which describes
|
|
2569
|
+
features of characters in the Starwars movies:
|
|
2570
|
+
|
|
2571
|
+
|
|
2572
|
+
```ruby
|
|
2573
|
+
puts (~:starwars).head.as__data__frame
|
|
2574
|
+
```
|
|
2575
|
+
|
|
2576
|
+
```
|
|
2577
|
+
## name height mass hair_color skin_color eye_color birth_year
|
|
2578
|
+
## 1 Luke Skywalker 172 77 blond fair blue 19.0
|
|
2579
|
+
## 2 C-3PO 167 75 <NA> gold yellow 112.0
|
|
2580
|
+
## 3 R2-D2 96 32 <NA> white, blue red 33.0
|
|
2581
|
+
## 4 Darth Vader 202 136 none white yellow 41.9
|
|
2582
|
+
## 5 Leia Organa 150 49 brown light brown 19.0
|
|
2583
|
+
## 6 Owen Lars 178 120 brown, grey light blue 52.0
|
|
2584
|
+
## gender homeworld species
|
|
2585
|
+
## 1 male Tatooine Human
|
|
2586
|
+
## 2 <NA> Tatooine Droid
|
|
2587
|
+
## 3 <NA> Naboo Droid
|
|
2588
|
+
## 4 male Tatooine Human
|
|
2589
|
+
## 5 female Alderaan Human
|
|
2590
|
+
## 6 male Tatooine Human
|
|
2591
|
+
## films
|
|
2592
|
+
## 1 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
|
2593
|
+
## 2 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
|
|
2594
|
+
## 3 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
|
2595
|
+
## 4 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
|
|
2596
|
+
## 5 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
|
|
2597
|
+
## 6 Attack of the Clones, Revenge of the Sith, A New Hope
|
|
2598
|
+
## vehicles starships
|
|
2599
|
+
## 1 Snowspeeder, Imperial Speeder Bike X-wing, Imperial shuttle
|
|
2600
|
+
## 2
|
|
2601
|
+
## 3
|
|
2602
|
+
## 4 TIE Advanced x1
|
|
2603
|
+
## 5 Imperial Speeder Bike
|
|
2604
|
+
## 6
|
|
2605
|
+
```
|
|
2606
|
+
The grouped_mean function below will receive a grouping variable and calculate summaries for
|
|
2607
|
+
the value\_variables given:
|
|
2608
|
+
|
|
2609
|
+
|
|
2610
|
+
```r
|
|
2611
|
+
grouped_mean <- function(data, grouping_variables, value_variables) {
|
|
2612
|
+
data %>%
|
|
2613
|
+
group_by_at(grouping_variables) %>%
|
|
2614
|
+
mutate(count = n()) %>%
|
|
2615
|
+
summarise_at(c(value_variables, "count"), mean, na.rm = TRUE) %>%
|
|
2616
|
+
rename_at(value_variables, funs(paste0("mean_", .)))
|
|
2617
|
+
}
|
|
2618
|
+
|
|
2619
|
+
gm = starwars %>%
|
|
2620
|
+
grouped_mean("eye_color", c("mass", "birth_year"))
|
|
2621
|
+
|
|
2622
|
+
as.data.frame(gm)
|
|
2623
|
+
```
|
|
2624
|
+
|
|
2625
|
+
```
|
|
2626
|
+
## eye_color mean_mass mean_birth_year count
|
|
2627
|
+
## 1 black 76.28571 33.00000 10
|
|
2628
|
+
## 2 blue 86.51667 67.06923 19
|
|
2629
|
+
## 3 blue-gray 77.00000 57.00000 1
|
|
2630
|
+
## 4 brown 66.09231 108.96429 21
|
|
2631
|
+
## 5 dark NaN NaN 1
|
|
2632
|
+
## 6 gold NaN NaN 1
|
|
2633
|
+
## 7 green, yellow 159.00000 NaN 1
|
|
2634
|
+
## 8 hazel 66.00000 34.50000 3
|
|
2635
|
+
## 9 orange 282.33333 231.00000 8
|
|
2636
|
+
## 10 pink NaN NaN 1
|
|
2637
|
+
## 11 red 81.40000 33.66667 5
|
|
2638
|
+
## 12 red, blue NaN NaN 1
|
|
2639
|
+
## 13 unknown 31.50000 NaN 3
|
|
2640
|
+
## 14 white 48.00000 NaN 1
|
|
2641
|
+
## 15 yellow 81.11111 76.38000 11
|
|
2642
|
+
```
|
|
2643
|
+
|
|
2644
|
+
The same code with Galaaz, becomes:
|
|
2645
|
+
|
|
2646
|
+
|
|
2647
|
+
```ruby
|
|
2648
|
+
def grouped_mean(data, grouping_variables, value_variables)
|
|
2649
|
+
data.
|
|
2650
|
+
group_by_at(grouping_variables).
|
|
2651
|
+
mutate(count: E.n).
|
|
2652
|
+
summarise_at(E.c(value_variables, "count"), ~:mean, na__rm: true).
|
|
2653
|
+
rename_at(value_variables, E.funs(E.paste0("mean_", value_variables)))
|
|
2654
|
+
end
|
|
2655
|
+
|
|
2656
|
+
puts grouped_mean((~:starwars), "eye_color", E.c("mass", "birth_year")).as__data__frame
|
|
2657
|
+
```
|
|
2658
|
+
|
|
2659
|
+
```
|
|
2660
|
+
## eye_color mean_mass mean_birth_year count
|
|
2661
|
+
## 1 black 76.28571 33.00000 10
|
|
2662
|
+
## 2 blue 86.51667 67.06923 19
|
|
2663
|
+
## 3 blue-gray 77.00000 57.00000 1
|
|
2664
|
+
## 4 brown 66.09231 108.96429 21
|
|
2665
|
+
## 5 dark NaN NaN 1
|
|
2666
|
+
## 6 gold NaN NaN 1
|
|
2667
|
+
## 7 green, yellow 159.00000 NaN 1
|
|
2668
|
+
## 8 hazel 66.00000 34.50000 3
|
|
2669
|
+
## 9 orange 282.33333 231.00000 8
|
|
2670
|
+
## 10 pink NaN NaN 1
|
|
2671
|
+
## 11 red 81.40000 33.66667 5
|
|
2672
|
+
## 12 red, blue NaN NaN 1
|
|
2673
|
+
## 13 unknown 31.50000 NaN 3
|
|
2674
|
+
## 14 white 48.00000 NaN 1
|
|
2675
|
+
## 15 yellow 81.11111 76.38000 11
|
|
2676
|
+
```
|
|
2677
|
+
|
|
2165
2678
|
|
|
2166
2679
|
[TO BE CONTINUED...]
|
|
2167
2680
|
|