galaaz 0.4.9 → 0.4.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +798 -285
  3. data/blogs/galaaz_ggplot/galaaz_ggplot.Rmd +3 -12
  4. data/blogs/galaaz_ggplot/galaaz_ggplot.aux +5 -7
  5. data/blogs/galaaz_ggplot/galaaz_ggplot.html +69 -29
  6. data/blogs/galaaz_ggplot/galaaz_ggplot.pdf +0 -0
  7. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/midwest_rb.png +0 -0
  8. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/scatter_plot_rb.png +0 -0
  9. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/midwest_rb.pdf +0 -0
  10. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-latex/scatter_plot_rb.pdf +0 -0
  11. data/blogs/galaaz_ggplot/midwest.Rmd +1 -9
  12. data/blogs/gknit/gknit.Rmd +37 -40
  13. data/blogs/gknit/gknit.html +32 -30
  14. data/blogs/gknit/gknit.md +36 -37
  15. data/blogs/gknit/gknit.pdf +0 -0
  16. data/blogs/gknit/gknit.tex +35 -37
  17. data/blogs/manual/manual.Rmd +548 -125
  18. data/blogs/manual/manual.html +509 -286
  19. data/blogs/manual/manual.md +798 -285
  20. data/blogs/manual/manual.pdf +0 -0
  21. data/blogs/manual/manual.tex +2816 -0
  22. data/blogs/manual/manual_files/figure-latex/diverging_bar.pdf +0 -0
  23. data/blogs/nse_dplyr/nse_dplyr.Rmd +240 -74
  24. data/blogs/nse_dplyr/nse_dplyr.html +191 -87
  25. data/blogs/nse_dplyr/nse_dplyr.md +361 -107
  26. data/blogs/nse_dplyr/nse_dplyr.pdf +0 -0
  27. data/blogs/nse_dplyr/nse_dplyr.tex +1373 -0
  28. data/blogs/ruby_plot/ruby_plot.Rmd +61 -81
  29. data/blogs/ruby_plot/ruby_plot.html +54 -57
  30. data/blogs/ruby_plot/ruby_plot.md +48 -67
  31. data/blogs/ruby_plot/ruby_plot.pdf +0 -0
  32. data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
  33. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
  34. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
  35. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
  36. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
  37. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
  38. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
  39. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
  40. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
  41. data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
  42. data/blogs/ruby_plot/ruby_plot_files/figure-latex/dose_len.png +0 -0
  43. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_delivery.png +0 -0
  44. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facet_by_dose.png +0 -0
  45. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color.png +0 -0
  46. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_by_delivery_color2.png +0 -0
  47. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_decorations.png +0 -0
  48. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_jitter.png +0 -0
  49. data/blogs/ruby_plot/ruby_plot_files/figure-latex/facets_with_points.png +0 -0
  50. data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_box_plot.png +0 -0
  51. data/blogs/ruby_plot/ruby_plot_files/figure-latex/final_violin_plot.png +0 -0
  52. data/blogs/ruby_plot/ruby_plot_files/figure-latex/violin_with_jitter.png +0 -0
  53. data/lib/R_interface/rdata_frame.rb +0 -12
  54. data/lib/R_interface/robject.rb +14 -14
  55. data/lib/R_interface/ruby_extensions.rb +3 -31
  56. data/lib/R_interface/rvector.rb +0 -12
  57. data/lib/gknit/knitr_engine.rb +5 -3
  58. data/lib/util/exec_ruby.rb +22 -61
  59. data/specs/tmp.rb +26 -12
  60. data/version.rb +1 -1
  61. metadata +22 -17
  62. data/bin/gknit_old_r +0 -236
  63. data/blogs/dev/dev.Rmd +0 -23
  64. data/blogs/dev/dev.md +0 -58
  65. data/blogs/dev/dev2.Rmd +0 -65
  66. data/blogs/dev/model.rb +0 -41
  67. data/blogs/dplyr/dplyr.Rmd +0 -29
  68. data/blogs/dplyr/dplyr.html +0 -433
  69. data/blogs/dplyr/dplyr.md +0 -58
  70. data/blogs/dplyr/dplyr.rb +0 -63
  71. data/blogs/galaaz_ggplot/galaaz_ggplot.log +0 -640
  72. data/blogs/galaaz_ggplot/galaaz_ggplot.md +0 -431
  73. data/blogs/galaaz_ggplot/galaaz_ggplot.tex +0 -481
  74. data/blogs/galaaz_ggplot/midwest.png +0 -0
  75. data/blogs/galaaz_ggplot/scatter_plot.png +0 -0
  76. data/blogs/ruby_plot/ruby_plot.tex +0 -1077
@@ -98,15 +98,13 @@ Panda, SciPy, SciKit-Learn and a couple more.
98
98
  # gKnitting a Document
99
99
 
100
100
  This manual has been formatted using gKnit. gKnit uses Knitr and R markdown to knit
101
- a document in Ruby or R and output it in any of the available formats for R markdown.
101
+ a document in Ruby or R and output it in any of the available formats for R markdown.
102
102
  gKnit runs atop of GraalVM, and Galaaz. In gKnit, Ruby variables are persisted between
103
- chunks, making it an ideal solution for literate programming.
104
- Also, since it is based on Galaaz, Ruby chunks can have access to R variables and Polyglot
105
- Programming with Ruby and R is quite natural.
103
+ chunks, making it an ideal solution for literate programming. Also, since it is based
104
+ on Galaaz, Ruby chunks can have access to R variables and Polyglot Programming with
105
+ Ruby and R is quite natural.
106
106
 
107
- gknit was describe in more depth in:
108
-
109
- * xxx.xxxx.xxx
107
+ [gknit is described in more detail here](https://towardsdatascience.com/how-to-do-reproducible-research-in-ruby-with-gknit-c26d2684d64e)
110
108
 
111
109
  # Vector
112
110
 
@@ -134,15 +132,15 @@ To create a vector the 'c' (concatenate) method from the 'R' module should be us
134
132
 
135
133
 
136
134
  ```ruby
137
- @vec = R.c(1, 2, 3)
138
- puts @vec
135
+ vec = R.c(1, 2, 3)
136
+ puts vec
139
137
  ```
140
138
 
141
139
  ```
142
140
  ## [1] 1 2 3
143
141
  ```
144
142
 
145
- Lets take a look at the type, mode and storage.mode of our vector @vec. In order to print
143
+ Let's take a look at the type, mode and storage.mode of our vector vec. In order to print
146
144
  this out, we are creating a data frame 'df' and printing it out. A data frame, for those
147
145
  not familiar with it, is basically a table. Here we create the data frame and add the
148
146
  column name by passing named parameters for each column, such as 'typeof:', 'mode:' and
@@ -154,7 +152,7 @@ data frame is 'data.frame', in Galaaz we use 'data\_\_frame'.
154
152
 
155
153
 
156
154
  ```ruby
157
- df = R.data__frame(typeof: @vec.typeof, mode: @vec.mode, storage__mode: @vec.storage__mode)
155
+ df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
158
156
  puts df
159
157
  ```
160
158
 
@@ -170,8 +168,8 @@ follows normal Ruby rules and the number 1 is an integer and 1.0 is a float.
170
168
 
171
169
 
172
170
  ```ruby
173
- @vec = R.c(1.0, 2, 3)
174
- puts @vec
171
+ vec = R.c(1.0, 2, 3)
172
+ puts vec
175
173
  ```
176
174
 
177
175
  ```
@@ -180,7 +178,7 @@ puts @vec
180
178
 
181
179
 
182
180
  ```ruby
183
- df = R.data__frame(typeof: @vec.typeof, mode: @vec.mode, storage__mode: @vec.storage__mode)
181
+ df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
184
182
  outputs df.kable.kable_styling
185
183
  ```
186
184
 
@@ -213,14 +211,14 @@ vec = R.c(1, hello, 5)
213
211
 
214
212
  ```
215
213
  ## Message:
216
- ## undefined local variable or method `hello' for RubyChunk:Class
214
+ ## undefined local variable or method `hello' for #<RC:0x2e0 @out_list=nil>:RC
217
215
  ```
218
216
 
219
217
  ```
220
218
  ## Message:
221
- ## (eval):1:in `exec_ruby'
222
- ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:141:in `instance_eval'
223
- ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:141:in `exec_ruby'
219
+ ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:103:in `get_binding'
220
+ ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:102:in `eval'
221
+ ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:102:in `exec_ruby'
224
222
  ## /home/rbotafogo/desenv/galaaz/lib/gknit/knitr_engine.rb:650:in `block in initialize'
225
223
  ## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `call'
226
224
  ## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `callback'
@@ -245,8 +243,8 @@ Here is a vector with logical values
245
243
 
246
244
 
247
245
  ```ruby
248
- @vec = R.c(true, true, false, false, true)
249
- puts @vec
246
+ vec = R.c(true, true, false, false, true)
247
+ puts vec
250
248
  ```
251
249
 
252
250
  ```
@@ -259,26 +257,26 @@ The 'c' functions used to create vectors can also be used to combine two vectors
259
257
 
260
258
 
261
259
  ```ruby
262
- @vec1 = R.c(10.0, 20.0, 30.0)
263
- @vec2 = R.c(4.0, 5.0, 6.0)
264
- @vec = R.c(@vec1, @vec2)
265
- puts @vec
260
+ vec1 = R.c(10.0, 20.0, 30.0)
261
+ vec2 = R.c(4.0, 5.0, 6.0)
262
+ vec = R.c(vec1, vec2)
263
+ puts vec
266
264
  ```
267
265
 
268
266
  ```
269
267
  ## [1] 10 20 30 4 5 6
270
268
  ```
271
269
  In galaaz, methods can be chained (somewhat like the pipe operator in R %>%, but more generic).
272
- In this next example, method 'c' is chainned after '@vec1'. This also looks like 'c' is a
270
+ In this next example, method 'c' is chained after 'vec1'. This also looks like 'c' is a
273
271
  method of the vector, but in reality, this is actually closer to the pipe operator. When
274
272
  Galaaz identifies that 'c' is not a method of 'vec' it actually tries to call 'R.c' with
275
- '@vec1' as the first argument concatenated with all the other available arguments. The code
273
+ 'vec1' as the first argument concatenated with all the other available arguments. The code
276
274
  below is automatically converted to the code above.
277
275
 
278
276
 
279
277
  ```ruby
280
- @vec = @vec1.c(@vec2)
281
- puts @vec
278
+ vec = vec1.c(vec2)
279
+ puts vec
282
280
  ```
283
281
 
284
282
  ```
@@ -291,7 +289,7 @@ Arithmetic operations on vectors are performed element by element:
291
289
 
292
290
 
293
291
  ```ruby
294
- puts @vec1 + @vec2
292
+ puts vec1 + vec2
295
293
  ```
296
294
 
297
295
  ```
@@ -300,7 +298,7 @@ puts @vec1 + @vec2
300
298
 
301
299
 
302
300
  ```ruby
303
- puts @vec1 * 5
301
+ puts vec1 * 5
304
302
  ```
305
303
 
306
304
  ```
@@ -311,8 +309,8 @@ When vectors have different length, a recycling rule is applied to the shorter v
311
309
 
312
310
 
313
311
  ```ruby
314
- @vec3 = R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
315
- puts @vec4 = @vec1 + @vec3
312
+ vec3 = R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
313
+ puts vec4 = vec1 + vec3
316
314
  ```
317
315
 
318
316
  ```
@@ -325,7 +323,7 @@ Vectors can be indexed by using the '[]' operator:
325
323
 
326
324
 
327
325
  ```ruby
328
- puts @vec4[3]
326
+ puts vec4[3]
329
327
  ```
330
328
 
331
329
  ```
@@ -333,11 +331,11 @@ puts @vec4[3]
333
331
  ```
334
332
 
335
333
  We can also index a vector with another vector. For example, in the code below, we take elements
336
- 1, 3, 5, and 7 from @vec3:
334
+ 1, 3, 5, and 7 from vec3:
337
335
 
338
336
 
339
337
  ```ruby
340
- puts @vec4[R.c(1, 3, 5, 7)]
338
+ puts vec4[R.c(1, 3, 5, 7)]
341
339
  ```
342
340
 
343
341
  ```
@@ -348,7 +346,7 @@ Repeating an index and having indices out of order is valid code:
348
346
 
349
347
 
350
348
  ```ruby
351
- puts @vec4[R.c(1, 3, 3, 1)]
349
+ puts vec4[R.c(1, 3, 3, 1)]
352
350
  ```
353
351
 
354
352
  ```
@@ -360,8 +358,8 @@ the indexed values are not returned:
360
358
 
361
359
 
362
360
  ```ruby
363
- puts @vec4[-3]
364
- puts @vec4[-R.c(1, 3, 5, 7)]
361
+ puts vec4[-3]
362
+ puts vec4[-R.c(1, 3, 5, 7)]
365
363
  ```
366
364
 
367
365
  ```
@@ -373,7 +371,7 @@ If an index is out of range, a missing value (NA) will be reported.
373
371
 
374
372
 
375
373
  ```ruby
376
- puts @vec4[30]
374
+ puts vec4[30]
377
375
  ```
378
376
 
379
377
  ```
@@ -384,7 +382,7 @@ It is also possible to index a vector by range:
384
382
 
385
383
 
386
384
  ```ruby
387
- puts @vec4[(2..5)]
385
+ puts vec4[(2..5)]
388
386
  ```
389
387
 
390
388
  ```
@@ -427,9 +425,9 @@ from the vector. In order to do this extraction the '>>' operator is used.
427
425
 
428
426
 
429
427
  ```ruby
430
- puts @vec4
431
- puts @vec4 >> 0
432
- puts @vec4 >> 4
428
+ puts vec4
429
+ puts vec4 >> 0
430
+ puts vec4 >> 4
433
431
  ```
434
432
 
435
433
  ```
@@ -929,11 +927,11 @@ created by the 'matrix' function:
929
927
 
930
928
 
931
929
  ```ruby
932
- @mat = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
933
- nrow: 3,
934
- ncol: 3)
930
+ mat = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
931
+ nrow: 3,
932
+ ncol: 3)
935
933
 
936
- puts @mat
934
+ puts mat
937
935
  ```
938
936
 
939
937
  ```
@@ -947,12 +945,12 @@ memory by row first passing an extra argument to the 'matrix' function:
947
945
 
948
946
 
949
947
  ```ruby
950
- @mat_row = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
951
- nrow: 3,
952
- ncol: 3,
953
- byrow: true)
948
+ mat_row = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
949
+ nrow: 3,
950
+ ncol: 3,
951
+ byrow: true)
954
952
 
955
- puts @mat_row
953
+ puts mat_row
956
954
  ```
957
955
 
958
956
  ```
@@ -968,8 +966,8 @@ A matrix can be indexed by [row, column]:
968
966
 
969
967
 
970
968
  ```ruby
971
- puts @mat_row[1, 1]
972
- puts @mat_row[2, 3]
969
+ puts mat_row[1, 1]
970
+ puts mat_row[2, 3]
973
971
  ```
974
972
 
975
973
  ```
@@ -980,8 +978,8 @@ It is possible to index an entire row or column with the ':all' keyword
980
978
 
981
979
 
982
980
  ```ruby
983
- puts @mat_row[1, :all]
984
- puts @mat_row[:all, 2]
981
+ puts mat_row[1, :all]
982
+ puts mat_row[:all, 2]
985
983
  ```
986
984
 
987
985
  ```
@@ -994,7 +992,7 @@ rows 1 and 3 and columns 2 and 3 building a 2 x 2 matrix.
994
992
 
995
993
 
996
994
  ```ruby
997
- puts @mat_row[R.c(1, 3), R.c(2, 3)]
995
+ puts mat_row[R.c(1, 3), R.c(2, 3)]
998
996
  ```
999
997
 
1000
998
  ```
@@ -1003,12 +1001,11 @@ puts @mat_row[R.c(1, 3), R.c(2, 3)]
1003
1001
  ## [2,] 8 9
1004
1002
  ```
1005
1003
 
1006
- Matrices can be combined with functions 'rbind' and 'cbind'
1004
+ Matrices can be combined with functions 'rbind':
1007
1005
 
1008
1006
 
1009
1007
  ```ruby
1010
- puts @mat_row.rbind(@mat)
1011
- puts @mat_row.cbind(@mat)
1008
+ puts mat_row.rbind(mat)
1012
1009
  ```
1013
1010
 
1014
1011
  ```
@@ -1019,6 +1016,16 @@ puts @mat_row.cbind(@mat)
1019
1016
  ## [4,] 1 4 7
1020
1017
  ## [5,] 2 5 8
1021
1018
  ## [6,] 3 6 9
1019
+ ```
1020
+
1021
+ and 'cbind':
1022
+
1023
+
1024
+ ```ruby
1025
+ puts mat_row.cbind(mat)
1026
+ ```
1027
+
1028
+ ```
1022
1029
  ## [,1] [,2] [,3] [,4] [,5] [,6]
1023
1030
  ## [1,] 1 2 3 1 4 7
1024
1031
  ## [2,] 4 5 6 2 5 8
@@ -1035,8 +1042,8 @@ can only hold one type of element.
1035
1042
  nums = R.c(1.0, 2.0, 3.0)
1036
1043
  strs = R.c("a", "b", "c", "d")
1037
1044
  bool = R.c(true, true, false)
1038
- @lst = R.list(nums: nums, strs: strs, bool: bool)
1039
- puts @lst
1045
+ lst = R.list(nums: nums, strs: strs, bool: bool)
1046
+ puts lst
1040
1047
  ```
1041
1048
 
1042
1049
  ```
@@ -1050,7 +1057,7 @@ puts @lst
1050
1057
  ## [1] TRUE TRUE FALSE
1051
1058
  ```
1052
1059
 
1053
- Note that '@lst' elements are named elements.
1060
+ Note that 'lst' elements are named elements.
1054
1061
 
1055
1062
 
1056
1063
  ## List Indexing
@@ -1061,7 +1068,7 @@ return one of the sublists.
1061
1068
 
1062
1069
 
1063
1070
  ```ruby
1064
- puts @lst[1]
1071
+ puts lst[1]
1065
1072
  ```
1066
1073
 
1067
1074
  ```
@@ -1076,18 +1083,18 @@ the original list
1076
1083
 
1077
1084
 
1078
1085
  ```ruby
1079
- puts @lst[[1]]
1086
+ puts lst[[1]]
1080
1087
  ```
1081
1088
 
1082
1089
  ```
1083
1090
  ## [1] 1 2 3
1084
1091
  ```
1085
1092
 
1086
- When elements are named, as dones with @lst, indexing can be done by name:
1093
+ When elements are named, as done with lst, indexing can be done by name:
1087
1094
 
1088
1095
 
1089
1096
  ```ruby
1090
- puts @lst[['bool']][[1]] >> 0
1097
+ puts lst[['bool']][[1]] >> 0
1091
1098
  ```
1092
1099
 
1093
1100
  ```
@@ -1207,23 +1214,31 @@ puts (~:mtcars)[R.c('Datsun 710', 'Camaro Z28'), :all]
1207
1214
  Finally, a data frame can also be indexed with a logical vector. In this next example, the
1208
1215
  'am' column of :mtcars is compared with 0 (with method 'eq'). When 'am' is equal to 0 the
1209
1216
  car is automatic. So, by doing '(~:mtcars).am.eq 0' a logical vector is created with
1210
- 'true' whenever 'am' is 0 and 'false' otherwise. Using this logical vector, the data frame
1211
- is indexed, returning a new data frame in which all cars have automatic transmission.
1217
+ 'true' whenever 'am' is 0 and 'false' otherwise.
1212
1218
 
1213
1219
 
1214
1220
  ```ruby
1215
1221
  # obtain a vector with 'true' for cars with automatic transmission
1216
1222
  automatic = (~:mtcars).am.eq 0
1217
1223
  puts automatic
1218
-
1219
- # slice the data frame by using this vector
1220
- puts (~:mtcars)[automatic, :all]
1221
1224
  ```
1222
1225
 
1223
1226
  ```
1224
1227
  ## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
1225
1228
  ## [12] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
1226
1229
  ## [23] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
1230
+ ```
1231
+
1232
+ Using this logical vector, the data frame is indexed, returning a new data frame in
1233
+ which all cars have automatic transmission.
1234
+
1235
+
1236
+ ```ruby
1237
+ # slice the data frame by using this vector
1238
+ puts (~:mtcars)[automatic, :all]
1239
+ ```
1240
+
1241
+ ```
1227
1242
  ## mpg cyl disp hp drat wt qsec vs am gear carb
1228
1243
  ## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
1229
1244
  ## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
@@ -1366,6 +1381,62 @@ puts exp7
1366
1381
  ## y <- sin(x)
1367
1382
  ```
1368
1383
 
1384
+ Expressions can also be written using '.' notation:
1385
+
1386
+
1387
+ ```ruby
1388
+ exp8 = :y.assign :x.sin
1389
+ puts exp8
1390
+ ```
1391
+
1392
+ ```
1393
+ ## y <- sin(x)
1394
+ ```
1395
+
1396
+ When a function has multiple arguments, the first one can be used before the '.':
1397
+
1398
+
1399
+ ```ruby
1400
+ exp9 = :x.c(:y)
1401
+ puts exp9
1402
+ ```
1403
+
1404
+ ```
1405
+ ## c(x, y)
1406
+ ```
1407
+
1408
+ ## Evaluating an Expression
1409
+
1410
+ Expressions can be evaluated by calling function 'eval' with a binding. A binding can be provided
1411
+ with a list:
1412
+
1413
+
1414
+ ```ruby
1415
+ exp = (:a + :b) * 2.0 + :c ** 2 / :z
1416
+ puts exp.eval(R.list(a: 10, b: 20, c: 30, z: 40))
1417
+ ```
1418
+
1419
+ ```
1420
+ ## [1] 82.5
1421
+ ```
1422
+
1423
+ ... with a data frame:
1424
+
1425
+
1426
+ ```ruby
1427
+ df = R.data__frame(
1428
+ a: R.c(1, 2, 3),
1429
+ b: R.c(10, 20, 30),
1430
+ c: R.c(100, 200, 300),
1431
+ z: R.c(1000, 2000, 3000))
1432
+
1433
+ puts exp.eval(df)
1434
+ ```
1435
+
1436
+ ```
1437
+ ## [1] 32 64 96
1438
+ ```
1439
+
1369
1440
  # Manipulating Data
1370
1441
 
1371
1442
  One of the major benefits of Galaaz is to bring strong data manipulation to Ruby. The following
@@ -1389,8 +1460,8 @@ R.library('dplyr')
1389
1460
 
1390
1461
 
1391
1462
  ```ruby
1392
- @flights = ~:flights
1393
- puts @flights.head.as__data__frame
1463
+ flights = ~:flights
1464
+ puts flights.head.as__data__frame
1394
1465
  ```
1395
1466
 
1396
1467
  ```
@@ -1424,7 +1495,7 @@ the first :month.eq 1
1424
1495
 
1425
1496
 
1426
1497
  ```ruby
1427
- puts @flights.filter((:month.eq 1), (:day.eq 1)).head.as__data__frame
1498
+ puts flights.filter((:month.eq 1), (:day.eq 1)).head.as__data__frame
1428
1499
  ```
1429
1500
 
1430
1501
  ```
@@ -1457,7 +1528,7 @@ All flights that departed in November of December
1457
1528
 
1458
1529
 
1459
1530
  ```ruby
1460
- puts @flights.filter((:month.eq 11) | (:month.eq 12)).head.as__data__frame
1531
+ puts flights.filter((:month.eq 11) | (:month.eq 12)).head.as__data__frame
1461
1532
  ```
1462
1533
 
1463
1534
  ```
@@ -1491,7 +1562,7 @@ symbol, in this case ':in' and the second argument is the vector:
1491
1562
 
1492
1563
 
1493
1564
  ```ruby
1494
- puts @flights.filter(:month._ :in, R.c(11, 12)).head.as__data__frame
1565
+ puts flights.filter(:month._ :in, R.c(11, 12)).head.as__data__frame
1495
1566
  ```
1496
1567
 
1497
1568
  ```
@@ -1527,8 +1598,8 @@ what is obtained from data frame.
1527
1598
 
1528
1599
 
1529
1600
  ```ruby
1530
- @df = R.tibble(x: R.c(1, R::NA, 3))
1531
- puts @df.as__data__frame
1601
+ df = R.tibble(x: R.c(1, R::NA, 3))
1602
+ puts df.as__data__frame
1532
1603
  ```
1533
1604
 
1534
1605
  ```
@@ -1543,7 +1614,7 @@ not.
1543
1614
 
1544
1615
 
1545
1616
  ```ruby
1546
- puts @df.filter(:x > 1).as__data__frame
1617
+ puts df.filter(:x > 1).as__data__frame
1547
1618
  ```
1548
1619
 
1549
1620
  ```
@@ -1555,7 +1626,7 @@ To match an NA use method 'is__na'
1555
1626
 
1556
1627
 
1557
1628
  ```ruby
1558
- puts @df.filter((:x.is__na) | (:x > 1)).as__data__frame
1629
+ puts df.filter((:x.is__na) | (:x > 1)).as__data__frame
1559
1630
  ```
1560
1631
 
1561
1632
  ```
@@ -1570,7 +1641,7 @@ Arrange reorders the rows of a data frame by the given arguments.
1570
1641
 
1571
1642
 
1572
1643
  ```ruby
1573
- puts @flights.arrange(:year, :month, :day).head.as__data__frame
1644
+ puts flights.arrange(:year, :month, :day).head.as__data__frame
1574
1645
  ```
1575
1646
 
1576
1647
  ```
@@ -1601,7 +1672,7 @@ To arrange in descending order, use function 'desc'
1601
1672
 
1602
1673
 
1603
1674
  ```ruby
1604
- puts @flights.arrange(:dep_delay.desc).head.as__data__frame
1675
+ puts flights.arrange(:dep_delay.desc).head.as__data__frame
1605
1676
  ```
1606
1677
 
1607
1678
  ```
@@ -1634,7 +1705,7 @@ To select specific columns from a dataset we use function 'select':
1634
1705
 
1635
1706
 
1636
1707
  ```ruby
1637
- puts @flights.select(:year, :month, :day).head.as__data__frame
1708
+ puts flights.select(:year, :month, :day).head.as__data__frame
1638
1709
  ```
1639
1710
 
1640
1711
  ```
@@ -1651,7 +1722,7 @@ It is also possible to select column in a given range
1651
1722
 
1652
1723
 
1653
1724
  ```ruby
1654
- puts @flights.select(:year.up_to :day).head.as__data__frame
1725
+ puts flights.select(:year.up_to :day).head.as__data__frame
1655
1726
  ```
1656
1727
 
1657
1728
  ```
@@ -1668,7 +1739,7 @@ Select all columns that start with a given name sequence
1668
1739
 
1669
1740
 
1670
1741
  ```ruby
1671
- puts @flights.select(E.starts_with('arr')).head.as__data__frame
1742
+ puts flights.select(E.starts_with('arr')).head.as__data__frame
1672
1743
  ```
1673
1744
 
1674
1745
  ```
@@ -1696,7 +1767,7 @@ A helper function that comes in handy when we just want to rearrange column orde
1696
1767
 
1697
1768
 
1698
1769
  ```ruby
1699
- puts @flights.select(:year, :month, :day, E.everything).head.as__data__frame
1770
+ puts flights.select(:year, :month, :day, E.everything).head.as__data__frame
1700
1771
  ```
1701
1772
 
1702
1773
  ```
@@ -1727,13 +1798,13 @@ puts @flights.select(:year, :month, :day, E.everything).head.as__data__frame
1727
1798
 
1728
1799
 
1729
1800
  ```ruby
1730
- @flights_sm = @flights.
1731
- select((:year.up_to :day),
1732
- E.ends_with('delay'),
1733
- :distance,
1734
- :air_time)
1801
+ flights_sm = flights.
1802
+ select((:year.up_to :day),
1803
+ E.ends_with('delay'),
1804
+ :distance,
1805
+ :air_time)
1735
1806
 
1736
- puts @flights_sm.head.as__data__frame
1807
+ puts flights_sm.head.as__data__frame
1737
1808
  ```
1738
1809
 
1739
1810
  ```
@@ -1748,10 +1819,10 @@ puts @flights_sm.head.as__data__frame
1748
1819
 
1749
1820
 
1750
1821
  ```ruby
1751
- @flights_sm = @flights_sm.
1752
- mutate(gain: :dep_delay - :arr_delay,
1753
- speed: :distance / :air_time * 60)
1754
- puts @flights_sm.head.as__data__frame
1822
+ flights_sm = flights_sm.
1823
+ mutate(gain: :dep_delay - :arr_delay,
1824
+ speed: :distance / :air_time * 60)
1825
+ puts flights_sm.head.as__data__frame
1755
1826
  ```
1756
1827
 
1757
1828
  ```
@@ -1771,7 +1842,7 @@ a single value is obtained from the data frame:
1771
1842
 
1772
1843
 
1773
1844
  ```ruby
1774
- puts @flights.summarise(delay: E.mean(:dep_delay, na__rm: true)).as__data__frame
1845
+ puts flights.summarise(delay: E.mean(:dep_delay, na__rm: true)).as__data__frame
1775
1846
  ```
1776
1847
 
1777
1848
  ```
@@ -1783,7 +1854,7 @@ When a data frame is groupe with 'group_by' summaries apply to the given group:
1783
1854
 
1784
1855
 
1785
1856
  ```ruby
1786
- by_day = @flights.group_by(:year, :month, :day)
1857
+ by_day = flights.group_by(:year, :month, :day)
1787
1858
  puts by_day.summarise(delay: :dep_delay.mean(na__rm: true)).head.as__data__frame
1788
1859
  ```
1789
1860
 
@@ -1801,7 +1872,7 @@ Next we put many operations together by pipping them one after the other:
1801
1872
 
1802
1873
 
1803
1874
  ```ruby
1804
- delays = @flights.
1875
+ delays = flights.
1805
1876
  group_by(:dest).
1806
1877
  summarise(
1807
1878
  count: E.n,
@@ -1809,108 +1880,17 @@ delays = @flights.
1809
1880
  delay: :arr_delay.mean(na__rm: true)).
1810
1881
  filter(:count > 20, :dest != "NHL")
1811
1882
 
1812
- puts delays.as__data__frame
1813
- ```
1814
-
1815
- ```
1816
- ## dest count dist delay
1817
- ## 1 ABQ 254 1826.00000 4.38188976
1818
- ## 2 ACK 265 199.00000 4.85227273
1819
- ## 3 ALB 439 143.00000 14.39712919
1820
- ## 4 ATL 17215 757.10822 11.30011285
1821
- ## 5 AUS 2439 1514.25297 6.01990875
1822
- ## 6 AVL 275 583.58182 8.00383142
1823
- ## 7 BDL 443 116.00000 7.04854369
1824
- ## 8 BGR 375 378.00000 8.02793296
1825
- ## 9 BHM 297 865.99663 16.87732342
1826
- ## 10 BNA 6333 758.21348 11.81245891
1827
- ## 11 BOS 15508 190.63696 2.91439222
1828
- ## 12 BQN 896 1578.98326 8.24549550
1829
- ## 13 BTV 2589 265.09154 8.95099602
1830
- ## 14 BUF 4681 296.80837 8.94595186
1831
- ## 15 BUR 371 2465.00000 8.17567568
1832
- ## 16 BWI 1781 179.41830 10.72673385
1833
- ## 17 BZN 36 1882.00000 7.60000000
1834
- ## 18 CAE 116 603.55172 41.76415094
1835
- ## 19 CAK 864 397.00000 19.69833729
1836
- ## 20 CHO 52 305.00000 9.50000000
1837
- ## 21 CHS 2884 632.91678 10.59296847
1838
- ## 22 CLE 4573 414.17428 9.18161129
1839
- ## 23 CLT 14064 538.02730 7.36031885
1840
- ## 24 CMH 3524 476.55505 10.60132291
1841
- ## 25 CRW 138 444.00000 14.67164179
1842
- ## 26 CVG 3941 575.15986 15.36456376
1843
- ## 27 DAY 1525 537.10230 12.68048606
1844
- ## 28 DCA 9705 211.00618 9.06695204
1845
- ## 29 DEN 7266 1614.67836 8.60650021
1846
- ## 30 DFW 8738 1383.04303 0.32212685
1847
- ## 31 DSM 569 1020.88752 19.00573614
1848
- ## 32 DTW 9384 498.12852 5.42996346
1849
- ## 33 EGE 213 1735.70892 6.30434783
1850
- ## 34 FLL 12055 1070.06877 8.08212154
1851
- ## 35 GRR 765 605.78170 18.18956044
1852
- ## 36 GSO 1606 449.84184 14.11260054
1853
- ## 37 GSP 849 595.95995 15.93544304
1854
- ## 38 HNL 707 4972.67468 -1.36519258
1855
- ## 39 HOU 2115 1420.15508 7.17618819
1856
- ## 40 IAD 5700 224.84684 13.86420212
1857
- ## 41 IAH 7198 1407.20672 4.24079040
1858
- ## 42 ILM 110 500.00000 4.63551402
1859
- ## 43 IND 2077 652.26288 9.94043412
1860
- ## 44 JAC 25 1875.60000 28.09523810
1861
- ## 45 JAX 2720 824.67610 11.84483416
1862
- ## 46 LAS 5997 2240.96148 0.25772849
1863
- ## 47 LAX 16174 2468.62236 0.54711094
1864
- ## 48 LGB 668 2465.00000 -0.06202723
1865
- ## 49 MCI 2008 1097.69522 14.51405836
1866
- ## 50 MCO 14082 943.11057 5.45464309
1867
- ## 51 MDW 4113 718.04595 12.36422360
1868
- ## 52 MEM 1789 954.20123 10.64531435
1869
- ## 53 MHT 1009 207.02973 14.78755365
1870
- ## 54 MIA 11728 1091.55244 0.29905978
1871
- ## 55 MKE 2802 733.38151 14.16722038
1872
- ## 56 MSN 572 803.95455 20.19604317
1873
- ## 57 MSP 7185 1017.40167 7.27016886
1874
- ## 58 MSY 3799 1177.70571 6.49017497
1875
- ## 59 MVY 221 173.00000 -0.28571429
1876
- ## 60 MYR 59 550.66102 4.60344828
1877
- ## 61 OAK 312 2576.00000 3.07766990
1878
- ## 62 OKC 346 1325.00000 30.61904762
1879
- ## 63 OMA 849 1135.56655 14.69889841
1880
- ## 64 ORD 17283 729.00081 5.87661475
1881
- ## 65 ORF 1536 288.52344 10.94909344
1882
- ## 66 PBI 6554 1028.83811 8.56297210
1883
- ## 67 PDX 1354 2445.56573 5.14157973
1884
- ## 68 PHL 1632 94.32353 10.12719014
1885
- ## 69 PHX 4656 2141.30326 2.09704733
1886
- ## 70 PIT 2875 334.06122 7.68099053
1887
- ## 71 PSE 365 1617.00000 7.87150838
1888
- ## 72 PVD 376 160.00000 16.23463687
1889
- ## 73 PWM 2352 276.12840 11.66040210
1890
- ## 74 RDU 8163 426.75769 10.05238095
1891
- ## 75 RIC 2454 281.40465 20.11125320
1892
- ## 76 ROC 2416 259.25083 11.56064461
1893
- ## 77 RSW 3537 1072.85327 3.23814963
1894
- ## 78 SAN 2737 2437.29923 3.13916574
1895
- ## 79 SAT 686 1578.34111 6.94537178
1896
- ## 80 SAV 804 709.18408 15.12950601
1897
- ## 81 SDF 1157 645.98358 12.66938406
1898
- ## 82 SEA 3923 2412.66531 -1.09909910
1899
- ## 83 SFO 13331 2577.92356 2.67289152
1900
- ## 84 SJC 329 2569.00000 3.44817073
1901
- ## 85 SJU 5819 1599.83365 2.52052659
1902
- ## 86 SLC 2467 1986.98662 0.17625459
1903
- ## 87 SMF 284 2521.00000 12.10992908
1904
- ## 88 SNA 825 2434.00000 -7.86822660
1905
- ## 89 SRQ 1211 1044.65153 3.08243131
1906
- ## 90 STL 4339 878.72321 11.07846451
1907
- ## 91 STT 522 1626.98276 -3.83590734
1908
- ## 92 SYR 1761 205.92164 8.90392501
1909
- ## 93 TPA 7466 1003.93557 7.40852503
1910
- ## 94 TUL 315 1215.00000 33.65986395
1911
- ## 95 TVC 101 652.38614 12.96842105
1912
- ## 96 TYS 631 638.80983 24.06920415
1913
- ## 97 XNA 1036 1142.50579 7.46572581
1883
+ puts delays.as__data__frame.head
1884
+ ```
1885
+
1886
+ ```
1887
+ ## dest count dist delay
1888
+ ## 1 ABQ 254 1826.0000 4.381890
1889
+ ## 2 ACK 265 199.0000 4.852273
1890
+ ## 3 ALB 439 143.0000 14.397129
1891
+ ## 4 ATL 17215 757.1082 11.300113
1892
+ ## 5 AUS 2439 1514.2530 6.019909
1893
+ ## 6 AVL 275 583.5818 8.003831
1914
1894
  ```
1915
1895
 
1916
1896
  # Using Data Table
@@ -1921,9 +1901,9 @@ R.library('data.table')
1921
1901
  R.install_and_loads('curl')
1922
1902
 
1923
1903
  input = "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
1924
- @flights = R.fread(input)
1925
- puts @flights
1926
- puts @flights.dim
1904
+ flights = R.fread(input)
1905
+ puts flights
1906
+ puts flights.dim
1927
1907
  ```
1928
1908
 
1929
1909
  ```
@@ -1982,17 +1962,17 @@ puts data_table.ID
1982
1962
 
1983
1963
  ```ruby
1984
1964
  # subset rows in i
1985
- ans = @flights[(:origin.eq "JFK") & (:month.eq 6)]
1965
+ ans = flights[(:origin.eq "JFK") & (:month.eq 6)]
1986
1966
  puts ans.head
1987
1967
 
1988
1968
  # Get the first two rows from flights.
1989
1969
 
1990
- ans = @flights[(1..2)]
1970
+ ans = flights[(1..2)]
1991
1971
  puts ans
1992
1972
 
1993
1973
  # Sort flights first by column origin in ascending order, and then by dest in descending order:
1994
1974
 
1995
- # ans = @flights[E.order(:origin, -(:dest))]
1975
+ # ans = flights[E.order(:origin, -(:dest))]
1996
1976
  # puts ans.head
1997
1977
  ```
1998
1978
 
@@ -2024,15 +2004,15 @@ puts ans
2024
2004
  # Select column(s) in j
2025
2005
  # select arr_delay column, but return it as a vector.
2026
2006
 
2027
- ans = @flights[:all, :arr_delay]
2007
+ ans = flights[:all, :arr_delay]
2028
2008
  puts ans.head
2029
2009
 
2030
2010
  # Select arr_delay column, but return as a data.table instead.
2031
2011
 
2032
- ans = @flights[:all, :arr_delay.list]
2012
+ ans = flights[:all, :arr_delay.list]
2033
2013
  puts ans.head
2034
2014
 
2035
- ans = @flights[:all, E.list(:arr_delay, :dep_delay)]
2015
+ ans = flights[:all, E.list(:arr_delay, :dep_delay)]
2036
2016
  ```
2037
2017
 
2038
2018
  ```
@@ -2057,68 +2037,42 @@ the data frame with the necessary data:
2057
2037
 
2058
2038
  ```ruby
2059
2039
  # copy the R variable :mtcars to the Ruby mtcars variable
2060
- @mtcars = ~:mtcars
2040
+ mtcars = ~:mtcars
2061
2041
 
2062
2042
  # create a new column 'car_name' to store the car names so that it can be
2063
2043
  # used for plotting. The 'rownames' of the data frame cannot be used as
2064
2044
  # data for plotting
2065
- @mtcars.car_name = R.rownames(:mtcars)
2045
+ mtcars.car_name = R.rownames(:mtcars)
2066
2046
 
2067
2047
  # compute normalized mpg and add it to a new column called mpg_z
2068
2048
  # Note that the mean value for mpg can be obtained by calling the 'mean'
2069
2049
  # function on the vector 'mtcars.mpg'. The same with the standard
2070
2050
  # deviation 'sd'. The vector is then rounded to two digits with 'round 2'
2071
- @mtcars.mpg_z = ((@mtcars.mpg - @mtcars.mpg.mean)/@mtcars.mpg.sd).round 2
2051
+ mtcars.mpg_z = ((mtcars.mpg - mtcars.mpg.mean)/mtcars.mpg.sd).round 2
2072
2052
 
2073
2053
  # create a new column 'mpg_type'. Function 'ifelse' is a vectorized function
2074
2054
  # that looks at every element of the mpg_z vector and if the value is below
2075
2055
  # 0, returns 'below', otherwise returns 'above'
2076
- @mtcars.mpg_type = (@mtcars.mpg_z < 0).ifelse("below", "above")
2056
+ mtcars.mpg_type = (mtcars.mpg_z < 0).ifelse("below", "above")
2077
2057
 
2078
2058
  # order the mtcar data set by the mpg_z vector from smaler to larger values
2079
- @mtcars = @mtcars[@mtcars.mpg_z.order, :all]
2059
+ mtcars = mtcars[mtcars.mpg_z.order, :all]
2080
2060
 
2081
2061
  # convert the car_name column to a factor to retain sorted order in plot
2082
- @mtcars.car_name = @mtcars.car_name.factor levels: @mtcars.car_name
2062
+ mtcars.car_name = mtcars.car_name.factor levels: mtcars.car_name
2083
2063
 
2084
2064
  # let's look at the final data frame
2085
- puts @mtcars
2065
+ puts mtcars.head
2086
2066
  ```
2087
2067
 
2088
2068
  ```
2089
- ## mpg cyl disp hp drat wt qsec vs am gear carb
2090
- ## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
2091
- ## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
2092
- ## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
2093
- ## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
2094
- ## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
2095
- ## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
2096
- ## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
2097
- ## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
2098
- ## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
2099
- ## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
2100
- ## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
2101
- ## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
2102
- ## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
2103
- ## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
2104
- ## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
2105
- ## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
2106
- ## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
2107
- ## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
2108
- ## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
2109
- ## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2110
- ## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
2111
- ## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
2112
- ## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
2113
- ## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
2114
- ## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
2115
- ## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
2116
- ## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
2117
- ## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
2118
- ## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
2119
- ## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
2120
- ## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
2121
- ## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
2069
+ ## mpg cyl disp hp drat wt qsec vs am gear carb
2070
+ ## Cadillac Fleetwood 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
2071
+ ## Lincoln Continental 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
2072
+ ## Camaro Z28 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
2073
+ ## Duster 360 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
2074
+ ## Chrysler Imperial 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
2075
+ ## Maserati Bora 15.0 8 301 335 3.54 3.570 14.60 0 1 5 8
2122
2076
  ## car_name mpg_z mpg_type
2123
2077
  ## Cadillac Fleetwood Cadillac Fleetwood -1.61 below
2124
2078
  ## Lincoln Continental Lincoln Continental -1.61 below
@@ -2126,32 +2080,6 @@ puts @mtcars
2126
2080
  ## Duster 360 Duster 360 -0.96 below
2127
2081
  ## Chrysler Imperial Chrysler Imperial -0.89 below
2128
2082
  ## Maserati Bora Maserati Bora -0.84 below
2129
- ## Merc 450SLC Merc 450SLC -0.81 below
2130
- ## AMC Javelin AMC Javelin -0.81 below
2131
- ## Dodge Challenger Dodge Challenger -0.76 below
2132
- ## Ford Pantera L Ford Pantera L -0.71 below
2133
- ## Merc 450SE Merc 450SE -0.61 below
2134
- ## Merc 450SL Merc 450SL -0.46 below
2135
- ## Merc 280C Merc 280C -0.38 below
2136
- ## Valiant Valiant -0.33 below
2137
- ## Hornet Sportabout Hornet Sportabout -0.23 below
2138
- ## Merc 280 Merc 280 -0.15 below
2139
- ## Pontiac Firebird Pontiac Firebird -0.15 below
2140
- ## Ferrari Dino Ferrari Dino -0.06 below
2141
- ## Mazda RX4 Mazda RX4 0.15 above
2142
- ## Mazda RX4 Wag Mazda RX4 Wag 0.15 above
2143
- ## Hornet 4 Drive Hornet 4 Drive 0.22 above
2144
- ## Volvo 142E Volvo 142E 0.22 above
2145
- ## Toyota Corona Toyota Corona 0.23 above
2146
- ## Datsun 710 Datsun 710 0.45 above
2147
- ## Merc 230 Merc 230 0.45 above
2148
- ## Merc 240D Merc 240D 0.72 above
2149
- ## Porsche 914-2 Porsche 914-2 0.98 above
2150
- ## Fiat X1-9 Fiat X1-9 1.20 above
2151
- ## Honda Civic Honda Civic 1.71 above
2152
- ## Lotus Europa Lotus Europa 1.71 above
2153
- ## Fiat 128 Fiat 128 2.04 above
2154
- ## Toyota Corolla Toyota Corolla 2.29 above
2155
2083
  ```
2156
2084
  Now, lets plot the diverging bar plot. When using gKnit, there is no need to call
2157
2085
  'R.awt' to create a plotting device, since gKnit does take care of it. Galaaz
@@ -2173,19 +2101,604 @@ but in this graph we want the bars to be horizontally layed so we add 'coord\_fl
2173
2101
  ```ruby
2174
2102
  require 'ggplot'
2175
2103
 
2176
- puts @mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
2177
- R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
2178
- R.scale_fill_manual(name: 'Mileage',
2179
- labels: R.c('Above Average', 'Below Average'),
2180
- values: R.c('above': '#00ba38', 'below': '#f8766d')) +
2181
- R.labs(subtitle: "Normalised mileage from 'mtcars'",
2182
- title: "Diverging Bars") +
2183
- R.coord_flip
2104
+ puts mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
2105
+ R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
2106
+ R.scale_fill_manual(name: 'Mileage',
2107
+ labels: R.c('Above Average', 'Below Average'),
2108
+ values: R.c('above': '#00ba38', 'below': '#f8766d')) +
2109
+ R.labs(subtitle: "Normalised mileage from 'mtcars'",
2110
+ title: "Diverging Bars") +
2111
+ R.coord_flip
2184
2112
  ```
2185
2113
 
2186
2114
 
2187
2115
  ![](/home/rbotafogo/desenv/galaaz/blogs/manual/manual_files/figure-html/diverging_bar.png)<!-- -->
2188
2116
 
2117
+ # Coding with Tidyverse
2118
+
2119
+ In R, and when coding with 'tidyverse', arguments to a function are usually not
2120
+ *referentially transparent*. That is, you can’t replace a value with a seemingly equivalent
2121
+ object that you’ve defined elsewhere. To see the problem, let's first define a data frame:
2122
+
2123
+
2124
+ ```ruby
2125
+ df = R.data__frame(x: (1..3), y: (3..1))
2126
+ puts df
2127
+ ```
2128
+
2129
+ ```
2130
+ ## x y
2131
+ ## 1 1 3
2132
+ ## 2 2 2
2133
+ ## 3 3 1
2134
+ ```
2135
+
2136
+ and now, let's look at this code:
2137
+
2138
+
2139
+ ```r
2140
+ my_var <- x
2141
+ filter(df, my_var == 1)
2142
+ ```
2143
+ It generates the following error: "object 'x' not found".
2144
+
2145
+ However, in Galaaz, arguments are referentially transparent as can be seen by the
2146
+ code below. Note initially that 'my_var = :x' will not give the error "object 'x' not found"
2147
+ since ':x' is treated as an expression and assigned to my\_var. Then when doing (my\_var.eq 1),
2148
+ my\_var is a variable that resolves to ':x' and it becomes equivalent to (:x.eq 1) which is
2149
+ what we want.
2150
+
2151
+
2152
+ ```ruby
2153
+ my_var = :x
2154
+ puts df.filter(my_var.eq 1)
2155
+ ```
2156
+
2157
+ ```
2158
+ ## x y
2159
+ ## 1 1 3
2160
+ ```
2161
+ As stated by Hadley
2162
+
2163
+ > dplyr code is ambiguous. Depending on what variables are defined where,
2164
+ > filter(df, x == y) could be equivalent to any of:
2165
+
2166
+ ```
2167
+ df[df$x == df$y, ]
2168
+ df[df$x == y, ]
2169
+ df[x == df$y, ]
2170
+ df[x == y, ]
2171
+ ```
2172
+ In galaaz this ambiguity does not exist, filter(df, x.eq y) is not a valid expression as
2173
+ expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements
2174
+ of the 'x' column that are equal to a previously defined y variable. Finally in
2175
+ filter(df, :x.eq :y) we are looking for elements in which the 'x' column value is equal to
2176
+ the 'y' column value. This can be seen in the following two chunks of code:
2177
+
2178
+
2179
+ ```ruby
2180
+ y = 1
2181
+ x = 2
2182
+
2183
+ # looking for values where the 'x' column is equal to the 'y' column
2184
+ puts df.filter(:x.eq :y)
2185
+ ```
2186
+
2187
+ ```
2188
+ ## x y
2189
+ ## 1 2 2
2190
+ ```
2191
+
2192
+
2193
+ ```ruby
2194
+ # looking for values where the 'x' column is equal to the 'y' variable
2195
+ # in this case, the number 1
2196
+ puts df.filter(:x.eq y)
2197
+ ```
2198
+
2199
+ ```
2200
+ ## x y
2201
+ ## 1 1 3
2202
+ ```
2203
+ ## Writing a function that applies to different data sets
2204
+
2205
+ Let's suppose that we want to write a function that receives as the first argument a data frame
2206
+ and as second argument an expression that adds a column to the data frame that is equal to the
2207
+ sum of elements in column 'a' plus 'x'.
2208
+
2209
+ Here is the intended behaviour using the 'mutate' function of 'dplyr':
2210
+
2211
+ ```
2212
+ mutate(df1, y = a + x)
2213
+ mutate(df2, y = a + x)
2214
+ mutate(df3, y = a + x)
2215
+ mutate(df4, y = a + x)
2216
+ ```
2217
+ The naive approach to writing an R function to solve this problem is:
2218
+
2219
+ ```
2220
+ mutate_y <- function(df) {
2221
+ mutate(df, y = a + x)
2222
+ }
2223
+ ```
2224
+ Unfortunately, in R, this function can fail silently if one of the variables isn’t present
2225
+ in the data frame, but is present in the global environment. We will not go through here how
2226
+ to solve this problem in R.
2227
+
2228
+ In Galaaz the method mutate_y below will work fine and will never fail silently.
2229
+
2230
+
2231
+ ```ruby
2232
+ def mutate_y(df)
2233
+ df.mutate(:y.assign :a + :x)
2234
+ end
2235
+ ```
2236
+ Here we create a data frame that has only one column named 'x':
2237
+
2238
+
2239
+ ```ruby
2240
+ df1 = R.data__frame(x: (1..3))
2241
+ puts df1
2242
+ ```
2243
+
2244
+ ```
2245
+ ## x
2246
+ ## 1 1
2247
+ ## 2 2
2248
+ ## 3 3
2249
+ ```
2250
+
2251
+ Note that method mutate_y will fail independently from the fact that variable 'a' is defined and
2252
+ in the scope of the method. Variable 'a' has no relationship with the symbol ':a' used in the
2253
+ definition of 'mutate\_y' above:
2254
+
2255
+
2256
+ ```ruby
2257
+ a = 10
2258
+ mutate_y(df1)
2259
+ ```
2260
+
2261
+ ```
2262
+ ## Message:
2263
+ ## Error in mutate_impl(.data, dots) :
2264
+ ## Evaluation error: object 'a' not found.
2265
+ ## In addition: Warning message:
2266
+ ## In mutate_impl(.data, dots) :
2267
+ ## mismatched protect/unprotect (unprotect with empty protect stack) (RError)
2268
+ ## Translated to internal error
2269
+ ```
2270
+ ## Different expressions
2271
+
2272
+ Let's move to the next problem as presented by Hadley where trying to write a function in R
2273
+ that will receive two arguments, the first a variable and the second an expression is not trivial.
2274
+ Below we create a data frame and we want to write a function that groups data by a variable and
2275
+ summarises it by an expression:
2276
+
2277
+
2278
+ ```r
2279
+ set.seed(123)
2280
+
2281
+ df <- data.frame(
2282
+ g1 = c(1, 1, 2, 2, 2),
2283
+ g2 = c(1, 2, 1, 2, 1),
2284
+ a = sample(5),
2285
+ b = sample(5)
2286
+ )
2287
+
2288
+ as.data.frame(df)
2289
+ ```
2290
+
2291
+ ```
2292
+ ## g1 g2 a b
2293
+ ## 1 1 1 2 1
2294
+ ## 2 1 2 4 3
2295
+ ## 3 2 1 5 4
2296
+ ## 4 2 2 3 2
2297
+ ## 5 2 1 1 5
2298
+ ```
2299
+
2300
+ ```r
2301
+ d2 <- df %>%
2302
+ group_by(g1) %>%
2303
+ summarise(a = mean(a))
2304
+
2305
+ as.data.frame(d2)
2306
+ ```
2307
+
2308
+ ```
2309
+ ## g1 a
2310
+ ## 1 1 3
2311
+ ## 2 2 3
2312
+ ```
2313
+
2314
+ ```r
2315
+ d2 <- df %>%
2316
+ group_by(g2) %>%
2317
+ summarise(a = mean(a))
2318
+
2319
+ as.data.frame(d2)
2320
+ ```
2321
+
2322
+ ```
2323
+ ## g2 a
2324
+ ## 1 1 2.666667
2325
+ ## 2 2 3.500000
2326
+ ```
2327
+
2328
+ As shown by Hadley, one might expect this function to do the trick:
2329
+
2330
+
2331
+ ```r
2332
+ my_summarise <- function(df, group_var) {
2333
+ df %>%
2334
+ group_by(group_var) %>%
2335
+ summarise(a = mean(a))
2336
+ }
2337
+
2338
+ # my_summarise(df, g1)
2339
+ #> Error: Column `group_var` is unknown
2340
+ ```
2341
+
2342
+ In order to solve this problem, coding with dplyr requires the introduction of many new concepts
2343
+ and functions such as 'quo', 'quos', 'enquo', 'enquos', '!!' (bang bang), '!!!' (triple bang).
2344
+ Again, we'll leave to Hadley the explanation on how to use all those functions.
2345
+
2346
+ Now, let's try to implement the same function in galaaz. The next code block first prints the
2347
+ 'df' data frame defined previously in R (to access an R variable from Galaaz, we use the tilde
2348
+ operator '~' applied to the R variable name as symbol, i.e., ':df').
2349
+
2350
+
2351
+ ```ruby
2352
+ puts ~:df
2353
+ ```
2354
+
2355
+ ```
2356
+ ## g1 g2 a b
2357
+ ## 1 1 1 2 1
2358
+ ## 2 1 2 4 3
2359
+ ## 3 2 1 5 4
2360
+ ## 4 2 2 3 2
2361
+ ## 5 2 1 1 5
2362
+ ```
2363
+
2364
+ We then create the 'my_summarize' method and call it passing the R data frame and
2365
+ the group by variable ':g1':
2366
+
2367
+
2368
+ ```ruby
2369
+ def my_summarize(df, group_var)
2370
+ df.group_by(group_var).
2371
+ summarize(a: :a.mean)
2372
+ end
2373
+
2374
+ puts my_summarize(:df, :g1).as__data__frame
2375
+ ```
2376
+
2377
+ ```
2378
+ ## g1 a
2379
+ ## 1 1 3
2380
+ ## 2 2 3
2381
+ ```
2382
+
2383
+ It works!!! Well, let's make sure this was not just some coincidence
2384
+
2385
+
2386
+ ```ruby
2387
+ puts my_summarize(:df, :g2).as__data__frame
2388
+ ```
2389
+
2390
+ ```
2391
+ ## g2 a
2392
+ ## 1 1 2.666667
2393
+ ## 2 2 3.500000
2394
+ ```
2395
+
2396
+ Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby
2397
+ code. If you've ever done NSE in R, this certainly feels much safer and easier to implement.
2398
+
2399
+ ## Different input variables
2400
+
2401
+ In the previous section we've managed to get rid of all NSE formulation for a simple example, but
2402
+ does this remain true for more complex examples, or will the Galaaz way prove impractical for
2403
+ more complex code?
2404
+
2405
+ In the next example Hadley asks us to write a function that, given an expression such as 'a'
2406
+ or 'a * b', calculates three summaries. What we want is a function that does the same as these R
2407
+ statements:
2408
+
2409
+ ```
2410
+ summarise(df, mean = mean(a), sum = sum(a), n = n())
2411
+ #> # A tibble: 1 x 3
2412
+ #> mean sum n
2413
+ #> <dbl> <int> <int>
2414
+ #> 1 3 15 5
2415
+
2416
+ summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
2417
+ #> # A tibble: 1 x 3
2418
+ #> mean sum n
2419
+ #> <dbl> <int> <int>
2420
+ #> 1 9 45 5
2421
+ ```
2422
+
2423
+ Let's try it in galaaz:
2424
+
2425
+
2426
+ ```ruby
2427
+ def my_summarise2(df, expr)
2428
+ df.summarize(
2429
+ mean: E.mean(expr),
2430
+ sum: E.sum(expr),
2431
+ n: E.n
2432
+ )
2433
+ end
2434
+
2435
+ puts my_summarise2((~:df), :a)
2436
+ puts "\n"
2437
+ puts my_summarise2((~:df), :a * :b)
2438
+ ```
2439
+
2440
+ ```
2441
+ ## mean sum n
2442
+ ## 1 3 15 5
2443
+ ##
2444
+ ## mean sum n
2445
+ ## 1 9 45 5
2446
+ ```
2447
+
2448
+ Once again, there is no need to use any special theory or functions. The only point to be
2449
+ careful about is the use of 'E' to build expressions from functions 'mean', 'sum' and 'n'.
2450
+
2451
+ ## Different input and output variable
2452
+
2453
+ Now the next challenge presented by Hadley is to vary the name of the output variables based on
2454
+ the received expression. So, if the input expression is 'a', we want our data frame columns to
2455
+ be named 'mean\_a' and 'sum\_a'. Now, if the input expression is 'b', columns
2456
+ should be named 'mean\_b' and 'sum\_b'.
2457
+
2458
+ ```
2459
+ mutate(df, mean_a = mean(a), sum_a = sum(a))
2460
+ #> # A tibble: 5 x 6
2461
+ #> g1 g2 a b mean_a sum_a
2462
+ #> <dbl> <dbl> <int> <int> <dbl> <int>
2463
+ #> 1 1 1 1 3 3 15
2464
+ #> 2 1 2 4 2 3 15
2465
+ #> 3 2 1 2 1 3 15
2466
+ #> 4 2 2 5 4 3 15
2467
+ #> # … with 1 more row
2468
+
2469
+ mutate(df, mean_b = mean(b), sum_b = sum(b))
2470
+ #> # A tibble: 5 x 6
2471
+ #> g1 g2 a b mean_b sum_b
2472
+ #> <dbl> <dbl> <int> <int> <dbl> <int>
2473
+ #> 1 1 1 1 3 3 15
2474
+ #> 2 1 2 4 2 3 15
2475
+ #> 3 2 1 2 1 3 15
2476
+ #> 4 2 2 5 4 3 15
2477
+ #> # … with 1 more row
2478
+ ```
2479
+ In order to solve this problem in R, Hadley needs to introduce some more new functions and notations:
2480
+ 'quo_name' and the ':=' operator from package 'rlang'
2481
+
2482
+ Here is our Ruby code:
2483
+
2484
+
2485
+ ```ruby
2486
+ def my_mutate(df, expr)
2487
+ mean_name = "mean_#{expr.to_s}"
2488
+ sum_name = "sum_#{expr.to_s}"
2489
+
2490
+ df.mutate(mean_name => E.mean(expr),
2491
+ sum_name => E.sum(expr))
2492
+ end
2493
+
2494
+ puts my_mutate((~:df), :a)
2495
+ puts "\n"
2496
+ puts my_mutate((~:df), :b)
2497
+ ```
2498
+
2499
+ ```
2500
+ ## g1 g2 a b mean_a sum_a
2501
+ ## 1 1 1 2 1 3 15
2502
+ ## 2 1 2 4 3 3 15
2503
+ ## 3 2 1 5 4 3 15
2504
+ ## 4 2 2 3 2 3 15
2505
+ ## 5 2 1 1 5 3 15
2506
+ ##
2507
+ ## g1 g2 a b mean_b sum_b
2508
+ ## 1 1 1 2 1 3 15
2509
+ ## 2 1 2 4 3 3 15
2510
+ ## 3 2 1 5 4 3 15
2511
+ ## 4 2 2 3 2 3 15
2512
+ ## 5 2 1 1 5 3 15
2513
+ ```
2514
+ It really seems that "Non Standard Evaluation" is actually quite standard in Galaaz! But, you
2515
+ might have noticed a small change in the way the arguments to the mutate method were called.
2516
+ In a previous example we used df.summarise(mean: E.mean(:a), ...) where the column name was
2517
+ followed by a ':' colon. In this example, we have df.mutate(mean_name => E.mean(expr), ...)
2518
+ and variable mean\_name is not followed by ':' but by '=>'. This is standard Ruby notation.
2519
+
2520
+ [explain....]
2521
+
2522
+ ## Capturing multiple variables
2523
+
2524
+ Moving on with new complexities, Hadley asks us to solve the problem in which the
2525
+ summarise function will receive any number of grouping variables.
2526
+
2527
+ This again is quite standard Ruby. In order to receive an undefined number of parameters
2528
+ the parameter is preceded by '*':
2529
+
2530
+
2531
+ ```ruby
2532
+ def my_summarise3(df, *group_vars)
2533
+ df.group_by(*group_vars).
2534
+ summarise(a: E.mean(:a))
2535
+ end
2536
+
2537
+ puts my_summarise3((~:df), :g1, :g2).as__data__frame
2538
+ ```
2539
+
2540
+ ```
2541
+ ## g1 g2 a
2542
+ ## 1 1 1 2
2543
+ ## 2 1 2 4
2544
+ ## 3 2 1 3
2545
+ ## 4 2 2 3
2546
+ ```
2547
+
2548
+ ## Why does R require NSE and Galaaz does not?
2549
+
2550
+ NSE introduces a number of new concepts, such as 'quoting', 'quasiquotation', 'unquoting' and
2551
+ 'unquote-splicing', while in Galaaz none of those concepts are needed. What gives?
2552
+
2553
+ R is an extremely flexible language and it has lazy evaluation of parameters. When in R a
2554
+ function is called as 'summarise(df, a = b)', the summarise function receives the literal
2555
+ 'a = b' parameter and can work with this as if it were a string. In R, it is not clear what
2556
+ a and b are, they can be expressions or they can be variables, it is up to the function to
2557
+ decide what 'a = b' means.
2558
+
2559
+ In Ruby, there is no lazy evaluation of parameters and 'a' is always a variable and so is 'b'.
2560
+ Variables assume their value as soon as they are used, so 'x = a' is immediately evaluated and
2561
+ variable 'x' will receive the value of variable 'a' as soon as the Ruby statement is executed.
2562
+ Ruby also provides the notion of a symbol; ':a' is a symbol and does not evaluate to anything.
2563
+ Galaaz uses Ruby symbols to build expressions that are not bound to anything: ':a.eq :b' is
2564
+ clearly an expression and has no relationship whatsoever with the statement 'a = b'. By using
2565
+ symbols, variables and expressions all the possible ambiguities that are found in R are
2566
+ eliminated in Galaaz.
2567
+
2568
+ The main problem that remains, is that in R, functions are not clearly documented as what type
2569
+ of input they are expecting, they might be expecting regular variables or they might be
2570
+ expecting expressions and the R function will know how to deal with an input of the form
2571
+ 'a = b', now for the Ruby developer it might not be immediately clear if it should call the
2572
+ function passing the value 'true' if variable 'a' is equal to variable 'b' or if it should
2573
+ call the function passing the expression ':a.eq :b'.
2574
+
2575
+
2576
+ ## Advanced dplyr features
2577
+
2578
+ In the blog: Programming with dplyr by using dplyr (https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/) Iñaki Úcar shows surprise that some R users are trying to code in dplyr avoiding
2579
+ the use of NSE. For instance he says:
2580
+
2581
+ > Take the example of seplyr. It stands for standard evaluation dplyr, and enables us to
2582
+ > program over dplyr without having “to bring in (or study) any deep-theory or
2583
+ > heavy-weight tools such as rlang/tidyeval”.
2584
+
2585
+ For me, there isn't really any surprise that users are trying to avoid dplyr deep-theory. R
2586
+ users frequently are not programmers and learning to code is already hard business, on top
2587
+ of that, having to learn how to 'quote' or 'enquo' or 'quos' or 'enquos' is not necessarily
2588
+ a 'piece of cake'. So much so, that 'tidyeval' has some more advanced functions that instead
2589
+ of using quoted expressions, uses strings as arguments.
2590
+
2591
+ In the following examples, we show the use of functions 'group\_by\_at', 'summarise\_at' and
2592
+ 'rename\_at' that receive strings as argument. The data frame used is 'starwars', which describes
2593
+ features of characters in the Starwars movies:
2594
+
2595
+
2596
+ ```ruby
2597
+ puts (~:starwars).head.as__data__frame
2598
+ ```
2599
+
2600
+ ```
2601
+ ## name height mass hair_color skin_color eye_color birth_year
2602
+ ## 1 Luke Skywalker 172 77 blond fair blue 19.0
2603
+ ## 2 C-3PO 167 75 <NA> gold yellow 112.0
2604
+ ## 3 R2-D2 96 32 <NA> white, blue red 33.0
2605
+ ## 4 Darth Vader 202 136 none white yellow 41.9
2606
+ ## 5 Leia Organa 150 49 brown light brown 19.0
2607
+ ## 6 Owen Lars 178 120 brown, grey light blue 52.0
2608
+ ## gender homeworld species
2609
+ ## 1 male Tatooine Human
2610
+ ## 2 <NA> Tatooine Droid
2611
+ ## 3 <NA> Naboo Droid
2612
+ ## 4 male Tatooine Human
2613
+ ## 5 female Alderaan Human
2614
+ ## 6 male Tatooine Human
2615
+ ## films
2616
+ ## 1 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
2617
+ ## 2 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
2618
+ ## 3 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
2619
+ ## 4 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
2620
+ ## 5 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
2621
+ ## 6 Attack of the Clones, Revenge of the Sith, A New Hope
2622
+ ## vehicles starships
2623
+ ## 1 Snowspeeder, Imperial Speeder Bike X-wing, Imperial shuttle
2624
+ ## 2
2625
+ ## 3
2626
+ ## 4 TIE Advanced x1
2627
+ ## 5 Imperial Speeder Bike
2628
+ ## 6
2629
+ ```
2630
+ The grouped_mean function below will receive a grouping variable and calculate summaries for
2631
+ the value\_variables given:
2632
+
2633
+
2634
+ ```r
2635
+ grouped_mean <- function(data, grouping_variables, value_variables) {
2636
+ data %>%
2637
+ group_by_at(grouping_variables) %>%
2638
+ mutate(count = n()) %>%
2639
+ summarise_at(c(value_variables, "count"), mean, na.rm = TRUE) %>%
2640
+ rename_at(value_variables, funs(paste0("mean_", .)))
2641
+ }
2642
+
2643
+ gm = starwars %>%
2644
+ grouped_mean("eye_color", c("mass", "birth_year"))
2645
+
2646
+ as.data.frame(gm)
2647
+ ```
2648
+
2649
+ ```
2650
+ ## eye_color mean_mass mean_birth_year count
2651
+ ## 1 black 76.28571 33.00000 10
2652
+ ## 2 blue 86.51667 67.06923 19
2653
+ ## 3 blue-gray 77.00000 57.00000 1
2654
+ ## 4 brown 66.09231 108.96429 21
2655
+ ## 5 dark NaN NaN 1
2656
+ ## 6 gold NaN NaN 1
2657
+ ## 7 green, yellow 159.00000 NaN 1
2658
+ ## 8 hazel 66.00000 34.50000 3
2659
+ ## 9 orange 282.33333 231.00000 8
2660
+ ## 10 pink NaN NaN 1
2661
+ ## 11 red 81.40000 33.66667 5
2662
+ ## 12 red, blue NaN NaN 1
2663
+ ## 13 unknown 31.50000 NaN 3
2664
+ ## 14 white 48.00000 NaN 1
2665
+ ## 15 yellow 81.11111 76.38000 11
2666
+ ```
2667
+
2668
+ The same code with Galaaz, becomes:
2669
+
2670
+
2671
+ ```ruby
2672
+ def grouped_mean(data, grouping_variables, value_variables)
2673
+ data.
2674
+ group_by_at(grouping_variables).
2675
+ mutate(count: E.n).
2676
+ summarise_at(E.c(value_variables, "count"), ~:mean, na__rm: true).
2677
+ rename_at(value_variables, E.funs(E.paste0("mean_", value_variables)))
2678
+ end
2679
+
2680
+ puts grouped_mean((~:starwars), "eye_color", E.c("mass", "birth_year")).as__data__frame
2681
+ ```
2682
+
2683
+ ```
2684
+ ## eye_color mean_mass mean_birth_year count
2685
+ ## 1 black 76.28571 33.00000 10
2686
+ ## 2 blue 86.51667 67.06923 19
2687
+ ## 3 blue-gray 77.00000 57.00000 1
2688
+ ## 4 brown 66.09231 108.96429 21
2689
+ ## 5 dark NaN NaN 1
2690
+ ## 6 gold NaN NaN 1
2691
+ ## 7 green, yellow 159.00000 NaN 1
2692
+ ## 8 hazel 66.00000 34.50000 3
2693
+ ## 9 orange 282.33333 231.00000 8
2694
+ ## 10 pink NaN NaN 1
2695
+ ## 11 red 81.40000 33.66667 5
2696
+ ## 12 red, blue NaN NaN 1
2697
+ ## 13 unknown 31.50000 NaN 3
2698
+ ## 14 white 48.00000 NaN 1
2699
+ ## 15 yellow 81.11111 76.38000 11
2700
+ ```
2701
+
2189
2702
 
2190
2703
  [TO BE CONTINUED...]
2191
2704