galaaz 0.4.6 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +3575 -118
  3. data/Rakefile +21 -4
  4. data/bin/gknit +152 -6
  5. data/bin/gknit-draft +105 -0
  6. data/bin/gknit-draft.rb +28 -0
  7. data/bin/gknit_Rscript +127 -0
  8. data/bin/grun +27 -1
  9. data/bin/gstudio +47 -4
  10. data/bin/{gstudio.rb → gstudio_irb.rb} +0 -0
  11. data/bin/gstudio_pry.rb +7 -0
  12. data/blogs/galaaz_ggplot/galaaz_ggplot.Rmd +3 -12
  13. data/blogs/galaaz_ggplot/galaaz_ggplot.html +77 -222
  14. data/blogs/galaaz_ggplot/galaaz_ggplot.md +4 -31
  15. data/blogs/galaaz_ggplot/galaaz_ggplot.pdf +0 -0
  16. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/midwest_rb.png +0 -0
  17. data/blogs/galaaz_ggplot/galaaz_ggplot_files/figure-html/scatter_plot_rb.png +0 -0
  18. data/blogs/galaaz_ggplot/midwest.Rmd +1 -9
  19. data/blogs/gknit/gknit.Rmd +232 -123
  20. data/blogs/{dev/dev.html → gknit/gknit.html} +1897 -33
  21. data/blogs/gknit/gknit.pdf +0 -0
  22. data/blogs/gknit/lst.rds +0 -0
  23. data/blogs/gknit/stats.bib +27 -0
  24. data/blogs/manual/lst.rds +0 -0
  25. data/blogs/manual/manual.Rmd +1893 -47
  26. data/blogs/manual/manual.html +3153 -347
  27. data/blogs/manual/manual.md +3575 -118
  28. data/blogs/manual/manual.pdf +0 -0
  29. data/blogs/manual/manual.tex +4026 -0
  30. data/blogs/manual/manual_files/figure-html/bubble-1.png +0 -0
  31. data/blogs/manual/manual_files/figure-html/diverging_bar.png +0 -0
  32. data/blogs/manual/manual_files/figure-latex/bubble-1.png +0 -0
  33. data/blogs/manual/manual_files/figure-latex/diverging_bar.pdf +0 -0
  34. data/blogs/{dev → manual}/model.rb +0 -0
  35. data/blogs/nse_dplyr/nse_dplyr.Rmd +849 -0
  36. data/blogs/nse_dplyr/nse_dplyr.html +878 -0
  37. data/blogs/nse_dplyr/nse_dplyr.md +1198 -0
  38. data/blogs/nse_dplyr/nse_dplyr.pdf +0 -0
  39. data/blogs/oh_my/oh_my.html +274 -386
  40. data/blogs/oh_my/oh_my.md +208 -205
  41. data/blogs/ruby_plot/ruby_plot.Rmd +64 -84
  42. data/blogs/ruby_plot/ruby_plot.html +235 -208
  43. data/blogs/ruby_plot/ruby_plot.md +239 -34
  44. data/blogs/ruby_plot/ruby_plot.pdf +0 -0
  45. data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
  46. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
  47. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
  48. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
  49. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
  50. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_decorations.png +0 -0
  51. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
  52. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
  53. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
  54. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
  55. data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
  56. data/examples/Bibliography/master.bib +50 -0
  57. data/examples/Bibliography/stats.bib +72 -0
  58. data/examples/islr/ch2.spec.rb +1 -1
  59. data/examples/islr/ch3_boston.rb +4 -4
  60. data/examples/islr/x_y_rnorm.jpg +0 -0
  61. data/examples/latex_templates/Test-acm_article/Makefile +16 -0
  62. data/examples/latex_templates/Test-acm_article/Test-acm_article.Rmd +65 -0
  63. data/examples/latex_templates/Test-acm_article/acm_proc_article-sp.cls +1670 -0
  64. data/examples/latex_templates/Test-acm_article/sensys-abstract.cls +703 -0
  65. data/examples/latex_templates/Test-acm_article/sigproc.bib +59 -0
  66. data/examples/latex_templates/Test-acs_article/Test-acs_article.Rmd +260 -0
  67. data/examples/latex_templates/Test-acs_article/Test-acs_article.pdf +0 -0
  68. data/examples/latex_templates/Test-acs_article/acs-Test-acs_article.bib +11 -0
  69. data/examples/latex_templates/Test-acs_article/acs-my_output.bib +11 -0
  70. data/examples/latex_templates/Test-acs_article/acstest.bib +17 -0
  71. data/examples/latex_templates/Test-aea_article/AEA.cls +1414 -0
  72. data/examples/latex_templates/Test-aea_article/BibFile.bib +0 -0
  73. data/examples/latex_templates/Test-aea_article/Test-aea_article.Rmd +108 -0
  74. data/examples/latex_templates/Test-aea_article/Test-aea_article.pdf +0 -0
  75. data/examples/latex_templates/Test-aea_article/aea.bst +1269 -0
  76. data/examples/latex_templates/Test-aea_article/multicol.sty +853 -0
  77. data/examples/latex_templates/Test-aea_article/references.bib +0 -0
  78. data/examples/latex_templates/Test-aea_article/setspace.sty +546 -0
  79. data/examples/latex_templates/Test-amq_article/Test-amq_article.Rmd +256 -0
  80. data/examples/latex_templates/Test-amq_article/Test-amq_article.pdf +0 -0
  81. data/examples/latex_templates/Test-amq_article/Test-amq_article.pdfsync +3397 -0
  82. data/examples/latex_templates/Test-amq_article/pics/Figure2.pdf +0 -0
  83. data/examples/latex_templates/Test-ams_article/Test-ams_article.Rmd +215 -0
  84. data/examples/latex_templates/Test-ams_article/amstest.bib +436 -0
  85. data/examples/latex_templates/Test-asa_article/Test-asa_article.Rmd +153 -0
  86. data/examples/latex_templates/Test-asa_article/Test-asa_article.pdf +0 -0
  87. data/examples/latex_templates/Test-asa_article/agsm.bst +1353 -0
  88. data/examples/latex_templates/Test-asa_article/bibliography.bib +233 -0
  89. data/examples/latex_templates/Test-ieee_article/IEEEtran.bst +2409 -0
  90. data/examples/latex_templates/Test-ieee_article/IEEEtran.cls +6346 -0
  91. data/examples/latex_templates/Test-ieee_article/Test-ieee_article.Rmd +175 -0
  92. data/examples/latex_templates/Test-ieee_article/Test-ieee_article.pdf +0 -0
  93. data/examples/latex_templates/Test-ieee_article/mybibfile.bib +20 -0
  94. data/examples/latex_templates/Test-rjournal_article/RJournal.sty +335 -0
  95. data/examples/latex_templates/Test-rjournal_article/RJreferences.bib +18 -0
  96. data/examples/latex_templates/Test-rjournal_article/RJwrapper.pdf +0 -0
  97. data/examples/latex_templates/Test-rjournal_article/Test-rjournal_article.Rmd +52 -0
  98. data/examples/latex_templates/Test-springer_article/Test-springer_article.Rmd +65 -0
  99. data/examples/latex_templates/Test-springer_article/Test-springer_article.pdf +0 -0
  100. data/examples/latex_templates/Test-springer_article/bibliography.bib +26 -0
  101. data/examples/latex_templates/Test-springer_article/spbasic.bst +1658 -0
  102. data/examples/latex_templates/Test-springer_article/spmpsci.bst +1512 -0
  103. data/examples/latex_templates/Test-springer_article/spphys.bst +1443 -0
  104. data/examples/latex_templates/Test-springer_article/svglov3.clo +113 -0
  105. data/examples/latex_templates/Test-springer_article/svjour3.cls +1431 -0
  106. data/examples/misc/moneyball.rb +1 -1
  107. data/examples/misc/subsetting.rb +37 -37
  108. data/examples/rmarkdown/svm-rmarkdown-anon-ms-example/svm-rmarkdown-anon-ms-example.Rmd +73 -0
  109. data/examples/rmarkdown/svm-rmarkdown-anon-ms-example/svm-rmarkdown-anon-ms-example.pdf +0 -0
  110. data/examples/rmarkdown/svm-rmarkdown-article-example/svm-rmarkdown-article-example.Rmd +382 -0
  111. data/examples/rmarkdown/svm-rmarkdown-article-example/svm-rmarkdown-article-example.pdf +0 -0
  112. data/examples/rmarkdown/svm-rmarkdown-beamer-example/svm-rmarkdown-beamer-example.Rmd +164 -0
  113. data/examples/rmarkdown/svm-rmarkdown-beamer-example/svm-rmarkdown-beamer-example.pdf +0 -0
  114. data/examples/rmarkdown/svm-rmarkdown-cv/svm-rmarkdown-cv.Rmd +92 -0
  115. data/examples/rmarkdown/svm-rmarkdown-cv/svm-rmarkdown-cv.pdf +0 -0
  116. data/examples/rmarkdown/svm-rmarkdown-syllabus-example/attend-grade-relationships.csv +482 -0
  117. data/examples/rmarkdown/svm-rmarkdown-syllabus-example/svm-rmarkdown-syllabus-example.Rmd +280 -0
  118. data/examples/rmarkdown/svm-rmarkdown-syllabus-example/svm-rmarkdown-syllabus-example.pdf +0 -0
  119. data/examples/rmarkdown/svm-xaringan-example/svm-xaringan-example.Rmd +386 -0
  120. data/lib/R_interface/r.rb +2 -2
  121. data/lib/R_interface/r_libs.R +6 -1
  122. data/lib/R_interface/r_methods.rb +12 -2
  123. data/lib/R_interface/rdata_frame.rb +8 -17
  124. data/lib/R_interface/rindexed_object.rb +1 -2
  125. data/lib/R_interface/rlist.rb +1 -0
  126. data/lib/R_interface/robject.rb +20 -23
  127. data/lib/R_interface/rpkg.rb +15 -6
  128. data/lib/R_interface/rsupport.rb +13 -19
  129. data/lib/R_interface/ruby_extensions.rb +14 -18
  130. data/lib/R_interface/rvector.rb +0 -12
  131. data/lib/gknit.rb +2 -0
  132. data/lib/gknit/draft.rb +105 -0
  133. data/lib/gknit/knitr_engine.rb +6 -37
  134. data/lib/util/exec_ruby.rb +22 -84
  135. data/lib/util/inline_file.rb +7 -3
  136. data/specs/figures/bg.jpeg +0 -0
  137. data/specs/figures/bg.png +0 -0
  138. data/specs/figures/bg.svg +2 -2
  139. data/specs/figures/dose_len.png +0 -0
  140. data/specs/figures/no_args.jpeg +0 -0
  141. data/specs/figures/no_args.png +0 -0
  142. data/specs/figures/no_args.svg +2 -2
  143. data/specs/figures/width_height.jpeg +0 -0
  144. data/specs/figures/width_height.png +0 -0
  145. data/specs/figures/width_height_units1.jpeg +0 -0
  146. data/specs/figures/width_height_units1.png +0 -0
  147. data/specs/figures/width_height_units2.jpeg +0 -0
  148. data/specs/figures/width_height_units2.png +0 -0
  149. data/specs/r_dataframe.spec.rb +184 -11
  150. data/specs/r_list.spec.rb +4 -4
  151. data/specs/r_list_apply.spec.rb +11 -10
  152. data/specs/ruby_expression.spec.rb +3 -11
  153. data/specs/tmp.rb +106 -34
  154. data/version.rb +1 -1
  155. metadata +96 -33
  156. data/bin/gknit_old_r +0 -236
  157. data/blogs/dev/dev.Rmd +0 -77
  158. data/blogs/dev/dev.md +0 -87
  159. data/blogs/dev/dev_files/figure-html/bubble-1.png +0 -0
  160. data/blogs/dev/dev_files/figure-html/diverging_bar. +0 -0
  161. data/blogs/dev/dev_files/figure-html/diverging_bar.png +0 -0
  162. data/blogs/dplyr/dplyr.rb +0 -63
  163. data/blogs/galaaz_ggplot/galaaz_ggplot.aux +0 -43
  164. data/blogs/galaaz_ggplot/galaaz_ggplot.log +0 -640
  165. data/blogs/galaaz_ggplot/galaaz_ggplot.out +0 -10
  166. data/blogs/galaaz_ggplot/galaaz_ggplot.tex +0 -481
  167. data/blogs/galaaz_ggplot/midwest.png +0 -0
  168. data/blogs/galaaz_ggplot/scatter_plot.png +0 -0
  169. data/blogs/ruby_plot/ruby_plot.Rmd_external_figs +0 -662
  170. data/blogs/ruby_plot/ruby_plot.tex +0 -1077
  171. data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.svg +0 -57
  172. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.svg +0 -106
  173. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.svg +0 -110
  174. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.svg +0 -174
  175. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.svg +0 -236
  176. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.svg +0 -296
  177. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.svg +0 -236
  178. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.svg +0 -218
  179. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.svg +0 -128
  180. data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.svg +0 -150
  181. data/examples/paper/paper.rb +0 -36
Binary file
Binary file
@@ -0,0 +1,27 @@
1
+ @book{Wilkinson:grammar_of_graphics,
2
+ author = {Wilkinson, Leland},
3
+ title = {The Grammar of Graphics (Statistics and Computing)},
4
+ year = {2005},
5
+ isbn = {0387245448},
6
+ publisher = {Springer-Verlag},
7
+ address = {Berlin, Heidelberg},
8
+ }
9
+
10
+ @article{Knuth:literate_programming,
11
+ author = {Knuth, Donald E.},
12
+ title = {Literate Programming},
13
+ journal = {Comput. J.},
14
+ issue_date = {May 1984},
15
+ volume = {27},
16
+ number = {2},
17
+ month = may,
18
+ year = {1984},
19
+ issn = {0010-4620},
20
+ pages = {97--111},
21
+ numpages = {15},
22
+ url = {http://dx.doi.org/10.1093/comjnl/27.2.97},
23
+ doi = {10.1093/comjnl/27.2.97},
24
+ acmid = {479},
25
+ publisher = {Oxford University Press},
26
+ address = {Oxford, UK},
27
+ }
Binary file
@@ -4,27 +4,28 @@ subtitle: "How to tightly couple Ruby and R in GraalVM"
4
4
  author: "Rodrigo Botafogo"
5
5
  tags: [Galaaz, Ruby, R, TruffleRuby, FastR, GraalVM, ggplot2]
6
6
  date: "2019"
7
+ bibliography: "/home/rbotafogo/Bibliography/stats.bib"
7
8
  output:
8
- html_document:
9
- self_contained: true
10
- keep_md: true
11
- md_document:
12
- variant: markdown_github
13
9
  pdf_document:
14
10
  includes:
15
11
  in_header: "../../sty/galaaz.sty"
16
12
  keep_tex: yes
17
13
  number_sections: yes
18
14
  toc: true
19
- toc_depth: 2
15
+ toc_depth: 3
16
+ html_document:
17
+ self_contained: true
18
+ keep_md: true
19
+ md_document:
20
+ variant: markdown_github
20
21
  fontsize: 11pt
21
22
  ---
22
23
 
23
24
  ```{ruby setup, echo=FALSE}
25
+ R.options(crayon__enabled: false)
24
26
  R.install_and_loads('kableExtra')
25
27
  ```
26
28
 
27
-
28
29
  # Introduction
29
30
 
30
31
  Galaaz is a system for tightly coupling Ruby and R. Ruby is a powerful language, with a large
@@ -34,6 +35,92 @@ other hand, R is considered one of the most powerful languages for solving all o
34
35
  problems. Maybe the strongest competitor to R is Python with libraries such as NumPy,
35
36
  Panda, SciPy, SciKit-Learn and a couple more.
36
37
 
38
+ With Galaaz we do not intend to re-implement any of the scientific libraries in R, we allow
39
+ for very tight coupling between the two languages to the point that the Ruby developer does
40
+ not need to know that there is an R engine running.
41
+
42
+ According to Wikipedia "Ruby is a dynamic, interpreted, reflective, object-oriented,
43
+ general-purpose programming language. It was designed and developed in the mid-1990s by Yukihiro
44
+ "Matz" Matsumoto in Japan." It reached high popularity with the development of Ruby on Rails
45
+ (RoR) by David Heinemeier Hansson. RoR is a web application framework first released
46
+ around 2005. It makes extensive use of Ruby's metaprogramming features. With RoR,
47
+ Ruby became very popular. According to [Ruby's Tiobe index](https://www.tiobe.com/tiobe-index/ruby/)
48
+ it peaked in popularity around 2008, then declined until 2015 when it started picking up again.
49
+ At the time of this writing (November 2018), the Tiobe index puts Ruby in 16th position as
50
+ most popular language.
51
+
52
+ Python, a language similar to Ruby, ranks 4th in the index. Java, C and C++ take the
53
+ first three positions. Ruby is often criticized for its focus on web applications.
54
+ But Ruby can do [much more](https://github.com/markets/awesome-ruby) than just web applications.
55
+ Yet, for scientific computing, Ruby lags way behind Python and R. Python has
56
+ Django framework for web, NumPy for numerical arrays, Pandas for data analysis.
57
+ R is a free software environment for statistical computing and graphics with thousands
58
+ of libraries for data analysis.
59
+
60
+ Until recently, there was no real perspective for Ruby to bridge this gap.
61
+ Implementing a complete scientific computing infrastructure would take too long.
62
+ Enter [Oracle's GraalVM](https://www.graalvm.org/):
63
+
64
+ > GraalVM is a universal virtual machine for running applications written in
65
+ > JavaScript, Python 3, Ruby, R, JVM-based languages like Java, Scala, Kotlin,
66
+ > and LLVM-based languages such as C and C++.
67
+ >
68
+ > GraalVM removes the isolation between programming languages and enables
69
+ > interoperability in a shared runtime. It can run either standalone or in the
70
+ > context of OpenJDK, Node.js, Oracle Database, or MySQL.
71
+ >
72
+ > GraalVM allows you to write polyglot applications with a seamless way to pass
73
+ > values from one language to another. With GraalVM there is no copying or
74
+ > marshaling necessary as it is with other polyglot systems. This lets you
75
+ > achieve high performance when language boundaries are crossed. Most of the time
76
+ > there is no additional cost for crossing a language boundary at all.
77
+ >
78
+ > Often developers have to make uncomfortable compromises that require them
79
+ > to rewrite their software in other languages. For example:
80
+ >
81
+ > * That library is not available in my language. I need to rewrite it.
82
+ > * That language would be the perfect fit for my problem, but we cannot
83
+ > run it in our environment.
84
+ > * That problem is already solved in my language, but the language is
85
+ > too slow.
86
+ >
87
+ > With GraalVM we aim to allow developers to freely choose the right language for
88
+ > the task at hand without making compromises.
89
+
90
+ As stated above, GraalVM is a _universal_ virtual machine that allows Ruby and R (and other
91
+ languages) to run on the same environment. GraalVM allows polyglot applications to
92
+ _seamlessly_ interact with one another and pass values from one language to the other.
93
+ Although a great idea, GraalVM still requires application writers to know several languages.
94
+ To eliminate that requirement, we built Galaaz, a gem for Ruby, to tightly couple
95
+ Ruby and R and allow those languages to interact in a way that the user will be unaware
96
+ of such interaction. In other words, a Ruby programmer will be able to use all
97
+ the capabilities of R without knowing the R syntax.
98
+
99
+ Library wrapping is a usual way of bringing features from one language into another.
100
+ To improve performance, Python often wraps more efficient C libraries. For the
101
+ Python developer, the existence of such C libraries is hidden. The problem with
102
+ library wrapping is that for any new library, there is the need to handcraft a new
103
+ wrapper.
104
+
105
+ Galaaz, instead of wrapping a single C or R library, wraps the whole R language
106
+ in Ruby. Doing so, all thousands of R libraries are available immediately
107
+ to Ruby developers without any new wrapping effort.
108
+
109
+ ## What does Galaaz mean
110
+
111
+ Galaaz is the Portuguese name for "Galahad". From Wikipedia:
112
+
113
+ Sir Galahad (sometimes referred to as Galeas or Galath),
114
+ in Arthurian legend, is a knight of King Arthur's Round Table and one
115
+ of the three achievers of the Holy Grail. He is the illegitimate son
116
+ of Sir Lancelot and Elaine of Corbenic, and is renowned for his
117
+ gallantry and purity as the most perfect of all knights. Emerging quite
118
+ late in the medieval Arthurian tradition, Sir Galahad first appears in the
119
+ Lancelot–Grail cycle, and his story is taken up in later works such as
120
+ the Post-Vulgate Cycle and Sir Thomas Malory's Le Morte d'Arthur.
121
+ His name should not be mistaken with Galehaut, a different knight from
122
+ Arthurian legend.
123
+
37
124
  # System Compatibility
38
125
 
39
126
  * Oracle Linux 7
@@ -84,7 +171,7 @@ Panda, SciPy, SciKit-Learn and a couple more.
84
171
  > galaaz -T
85
172
 
86
173
  Shows a list with all available executable tasks. To execute a task, substitute the
87
- 'rake' word in the list with 'galaaz'. For instance, the following line shows up
174
+ 'rake' word in the list with 'galaaz'. For instance, the following line shows up
88
175
  after 'galaaz -T'
89
176
 
90
177
  rake master_list:scatter_plot # scatter_plot from:....
@@ -93,9 +180,711 @@ Panda, SciPy, SciKit-Learn and a couple more.
93
180
 
94
181
  > galaaz master_list:scatter_plot
95
182
 
96
- # Basic Types
97
183
 
98
- ## Vectors
184
+ # Accessing R from Ruby
185
+
186
+ One of the nice aspects of Galaaz on GraalVM, is that variables and functions defined in R, can
187
+ be easily accessed from Ruby. For instance, to access the 'mtcars' data frame from R
188
+ in Ruby, we use the ':mtcars' symbol preceded by the '~' operator; thus '~:mtcars' retrieves the
189
+ value of the 'mtcars' variable.
190
+
191
+ ```{ruby access_r}
192
+ puts ~:mtcars
193
+ ```
194
+
195
+ To access an R function from Ruby, the R function needs to be preceded by 'R.' scoping.
196
+ Below we see an example of creating an R::Vector by calling the 'c' R function
197
+
198
+ ```{ruby call_r_func}
199
+ puts vec = R.c(1.0, 2.0, 3.0, 4.0)
200
+ ```
201
+ Note that 'vec' is an object of type R::Vector:
202
+
203
+ ```{ruby r_object}
204
+ puts vec.class
205
+ ```
206
+ Every object created by a call to an R function will be of a type that inherits from
207
+ R::Object. In R, there is also a function 'class'. In order to access that function we
208
+ can call method 'rclass' in the R::Object:
209
+
210
+ ```{ruby rclass}
211
+ puts vec.rclass
212
+ ```
213
+ When working with R::Object(s), it is possible to use the '.' operator to pipe operations.
214
+ When using '.', the object to which the '.' is applied becomes the first argument of the
215
+ corresponding R function. For instance, function 'c' in R, can be used to concatenate
216
+ two or more vectors (in R, there are no scalar values; scalars are converted to
217
+ vectors of size 1. Within Galaaz, a scalar parameter is converted to a size-one vector):
218
+
219
+ ```{ruby concat}
220
+ puts R.c(vec, 10, 20, 30)
221
+ ```
222
+ The call above to the 'c' function can also be done using '.' notation:
223
+
224
+ ```{ruby concat_with_dot}
225
+ puts vec.c(10, 20, 30)
226
+ ```
227
+ We will talk about vector indexing in a later section. But notice here that indexing
228
+ an R::Vector will return another R::Vector:
229
+
230
+ ```{ruby indexing}
231
+ puts vec[1]
232
+ ```
233
+ Sometimes we want to index an R::Object and get back a Ruby object that is not wrapped
234
+ in an R::Object, but the native Ruby object. For this, we can index the R object with
235
+ the '>>' operator:
236
+
237
+ ```{ruby native_value}
238
+ puts vec >> 0
239
+ puts vec >> 2
240
+ ```
241
+
242
+ It is also possible to call an R function with named arguments, by creating the function
243
+ in Galaaz with named parameters. For instance, here is an example of creating a 'list'
244
+ with named elements:
245
+
246
+ ```{ruby named_parameters}
247
+ puts R.list(first_name: "Rodrigo", last_name: "Botafogo")
248
+ ```
249
+
250
+ Many R functions receive another function as argument. For instance, method 'map' applies
251
+ a function to every element of a vector. With Galaaz, it is possible to pass a Proc,
252
+ Method or Lambda in place of the expected R function. In this next example, we will
253
+ add 2 to every element of our previously created vector:
254
+
255
+ ```{ruby proc_as_param}
256
+ puts vec.map { |x| x + 2 }
257
+ ```
258
+
259
+ # gKnitting a Document
260
+
261
+ This manual has been formatted using gKnit. gKnit uses Knitr and R markdown to knit
262
+ a document in Ruby or R and output it in any of the available formats for R markdown.
263
+ gKnit runs atop GraalVM and Galaaz. In gKnit, Ruby variables are persisted between
264
+ chunks, making it an ideal solution for literate programming. Also, since it is based
265
+ on Galaaz, Ruby chunks can have access to R variables and Polyglot Programming with
266
+ Ruby and R is quite natural.
267
+
268
+ The idea of "literate programming" was first introduced by Donald Knuth in the
269
+ 1980's [@Knuth:literate_programming].
270
+ The main intention of this approach was to develop software interspersing macro snippets,
271
+ traditional source code, and a natural language such as English in a document
272
+ that could be compiled into
273
+ executable code and at the same time easily read by a human developer. According to Knuth
274
+ "The practitioner of
275
+ literate programming can be regarded as an essayist, whose main concern is with exposition
276
+ and excellence of style."
277
+
278
+ The idea of literate programming evolved into the idea of reproducible research, in which
279
+ all the data, software code, documentation, graphics etc. needed to reproduce the research
280
+ and its reports could be included in a
281
+ single document or set of documents that when distributed to peers could be rerun generating
282
+ the same output and reports.
283
+
284
+ The R community has put a great deal of effort in reproducible research. In 2002, Sweave was
285
+ introduced and it allowed mixing R code with Latex generating high quality PDF documents. A
286
+ Sweave document could include code, the results of executing the code, graphics and text
287
+ such that it contained the whole narrative to reproduce the research. In
288
+ 2012, Knitr, developed by Yihui Xie from RStudio was released to replace Sweave and to
289
+ consolidate in one single package the many extensions and add-on packages that
290
+ were necessary for Sweave.
291
+
292
+ With Knitr, __R markdown__ was also developed, an extension to the
293
+ Markdown format. With __R markdown__ and Knitr it is possible to generate reports in a multitude
294
+ of formats such as HTML, markdown, Latex, PDF, dvi, etc. __R markdown__ also allows the use of
295
+ multiple programming languages such as R, Ruby, Python, etc. in the same document.
296
+
297
+ In __R markdown__, text is interspersed with
298
+ code chunks that can be executed and both the code and its results can become
299
+ part of the final report. Although __R markdown__ allows multiple programming languages in the
300
+ same document, only R and Python (with
301
+ the reticulate package) can persist variables between chunks. For other languages, such as
302
+ Ruby, every chunk will start a new process and thus all data is lost between chunks, unless it
303
+ is somehow stored in a data file that is read by the next chunk.
304
+
305
+ Being able to persist data
306
+ between chunks is critical for literate programming otherwise the flow of the narrative is lost
307
+ by all the effort of having to save data and then reload it. Although this might, at first, seem like
308
+ a small nuisance, not being able to persist data between chunks is a major issue. For example, let's
309
+ take a look at the following simple example in which we want to show how to create a list and then
310
+ use it. Let's first assume that data cannot be persisted between chunks. In the next chunk we
311
+ create a list, then we would need to save it to file, but to save it, we need somehow to marshal the
312
+ data into a binary format:
313
+
314
+ ```{ruby no_persistence}
315
+ lst = R.list(a: 1, b: 2, c: 3)
316
+ lst.saveRDS("lst.rds")
317
+ ```
318
+ then, on the next chunk, where variable 'lst' is used, we need to read back its value
319
+
320
+ ```{ruby load_persisted_data}
321
+ lst = R.readRDS("lst.rds")
322
+ puts lst
323
+ ```
324
+
325
+ Now, any single piece of code has dozens of variables that we might want to use and reuse between chunks.
326
+ Clearly, such an approach becomes quickly unmanageable. Probably, because of
327
+ this problem, it is very rare to see any __R markdown__ document in the Ruby community.
328
+
329
+ When variables can be used across chunks, then no overhead is needed:
330
+
331
+ ```{ruby persistence}
332
+ lst = R.list(a: 1, b: 2, c: 3)
333
+ # any other code can be added here
334
+ ```
335
+
336
+ ```{ruby use_var}
337
+ puts lst
338
+ ```
339
+
340
+ In the Python community, the same effort to have code and text in an integrated environment
341
+ started around the first decade of 2000. In 2006 iPython 0.7.2 was released. In 2014,
342
+ Fernando Pérez, spun off project Jupyter from iPython creating a web-based interactive
343
+ computation environment. Jupyter can now be used with many languages, including Ruby with the
344
+ iruby gem (https://github.com/SciRuby/iruby). In order to have multiple languages in a Jupyter
345
+ notebook the SoS kernel was developed (https://vatlab.github.io/sos-docs/).
346
+
347
+ ## gKnit and __R markdown__
348
+
349
+ gKnit is based on knitr and __R markdown__ and can knit a document
350
+ written both in Ruby and/or R and output it in any of the available formats of __R markdown__. gKnit
351
+ allows ruby developers to do literate programming and reproducible research by allowing them to
352
+ have in a single document, text and code.
353
+
354
+ In gKnit, Ruby variables are persisted between
355
+ chunks, making it an ideal solution for literate programming in this language. Also,
356
+ since it is based on Galaaz, Ruby chunks can have access to R variables and Polyglot Programming
357
+ with Ruby and R is quite natural.
358
+
359
+ This is not a blog post on __R markdown__, and the interested user is directed to the following links
360
+ for detailed information on its capabilities and use.
361
+
362
+ * https://rmarkdown.rstudio.com/ or
363
+ * https://bookdown.org/yihui/rmarkdown/
364
+
365
+ In this post, we will describe just the main aspects of __R markdown__, so the user can start
366
+ gKnitting Ruby and R documents quickly.
367
+
368
+ ## The Yaml header
369
+
370
+ An __R markdown__ document should start with a Yaml header and be stored in a file with
371
+ '.Rmd' extension. This document has the following header for gKnitting an HTML document.
372
+
373
+ ```
374
+ ---
375
+ title: "How to do reproducible research in Ruby with gKnit"
376
+ author:
377
+ - "Rodrigo Botafogo"
378
+ - "Daniel Mossé - University of Pittsburgh"
379
+ tags: [Tech, Data Science, Ruby, R, GraalVM]
380
+ date: "20/02/2019"
381
+ output:
382
+ html_document:
383
+ self_contained: true
384
+ keep_md: true
385
+ pdf_document:
386
+ includes:
387
+ in_header: ["../../sty/galaaz.sty"]
388
+ number_sections: yes
389
+ ---
390
+ ```
391
+
392
+ For more information on the options in the Yaml header, [check here](https://bookdown.org/yihui/rmarkdown/html-document.html).
393
+
394
+ ## __R Markdown__ formatting
395
+
396
+ Document formatting can be done with simple markups such as:
397
+
398
+ ## Headers
399
+
400
+ ```
401
+ # Header 1
402
+
403
+ ## Header 2
404
+
405
+ ### Header 3
406
+
407
+ ```
408
+
409
+ ## Lists
410
+
411
+ ```
412
+ Unordered lists:
413
+
414
+ * Item 1
415
+ * Item 2
416
+ + Item 2a
417
+ + Item 2b
418
+ ```
419
+
420
+ ```
421
+ Ordered Lists
422
+
423
+ 1. Item 1
424
+ 2. Item 2
425
+ 3. Item 3
426
+ + Item 3a
427
+ + Item 3b
428
+ ```
429
+
430
+ For more R markdown formatting go to https://rmarkdown.rstudio.com/authoring_basics.html.
431
+
432
+ ## R chunks
433
+
434
+ Running and executing Ruby and R code is actually what really interests us in this blog.
435
+ Inserting a code chunk is done by adding code in a block delimited by three back ticks
436
+ followed by an open
437
+ curly brace ('{') followed by the engine name (r, ruby, rb, include, ...), and
438
+ any optional chunk_label and options, as shown below:
439
+
440
+ ````
441
+ ```{engine_name [chunk_label], [chunk_options]}`r ''`
442
+ ```
443
+ ````
444
+
445
+ for instance, let's add an R chunk to the document labeled 'first_r_chunk'. This is
446
+ a very simple code just to create a variable and print it out, as follows:
447
+
448
+ ````
449
+ ```{r first_r_chunk}`r ''`
450
+ vec <- c(1, 2, 3)
451
+ print(vec)
452
+ ```
453
+ ````
454
+
455
+ If this block is added to an __R markdown__ document and gKnitted the result will be:
456
+
457
+ ```{r first_r_chunk}
458
+ vec <- c(1, 2, 3)
459
+ print(vec)
460
+ ```
461
+
462
+ Now let's say that we want to do some analysis in the code, but just print the result and not the
463
+ code itself. For this, we need to add the option 'echo = FALSE'.
464
+
465
+ ````
466
+ ```{r second_r_chunk, echo = FALSE}`r ''`
467
+ vec2 <- c(10, 20, 30)
468
+ vec3 <- vec * vec2
469
+ print(vec3)
470
+ ```
471
+ ````
472
+ Here is how this block will show up in the document. Observe that the code is not shown
473
+ and we only see the execution result in a white box
474
+
475
+ ```{r second_r_chunk, echo = FALSE}
476
+ vec2 <- c(10, 20, 30)
477
+ vec3 <- vec * vec2
478
+ print(vec3)
479
+ ```
480
+
481
+ A description of the available chunk options can be found in https://yihui.name/knitr/.
482
+
483
+ Let's add another R chunk with a function definition. In this example, a vector
484
+ 'r_vec' is created and
485
+ a new function 'reduce_sum' is defined. The chunk specification is
486
+
487
+ ````
488
+ ```{r data_creation}`r ''`
489
+ r_vec <- c(1, 2, 3, 4, 5)
490
+
491
+ reduce_sum <- function(...) {
492
+ Reduce(sum, as.list(...))
493
+ }
494
+ ```
495
+ ````
496
+
497
+ and this is how it will look once executed. From now on, to be concise in the
498
+ presentation we will not show chunk definitions any longer.
499
+
500
+
501
+ ```{r data_creation}
502
+ r_vec <- c(1, 2, 3, 4, 5)
503
+
504
+ reduce_sum <- function(...) {
505
+ Reduce(sum, as.list(...))
506
+ }
507
+ ```
508
+
509
+ We can, possibly in another chunk, access the vector and call the function as follows:
510
+
511
+ ```{r using_previous}
512
+ print(r_vec)
513
+ print(reduce_sum(r_vec))
514
+ ```
515
+ ## R Graphics with ggplot
516
+
517
+ In the following chunk, we create a bubble chart in R using ggplot and include it in
518
+ this document. Note that there is no directive in the code to include the image, this
519
+ occurs automatically. The 'mpg' dataframe is natively available to R and to Galaaz as
520
+ well.
521
+
522
+ For the reader not knowledgeable of ggplot, ggplot is a graphics library based on "the
523
+ grammar of graphics" [@Wilkinson:grammar_of_graphics]. The idea of the grammar of graphics
524
+ is to build a graphics by adding layers to the plot. More information can be found in
525
+ https://towardsdatascience.com/a-comprehensive-guide-to-the-grammar-of-graphics-for-effective-visualization-of-multi-dimensional-1f92b4ed4149.
526
+
527
+ In the plot below, the 'mpg' dataset from base R is used. "The data concerns city-cycle fuel
528
+ consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5
529
+ continuous attributes." (Quinlan, 1993)
530
+
531
+ First, the 'mpg' dataset is filtered to extract only cars from the following manufacturers: Audi, Ford,
532
+ Honda, and Hyundai and stored in the 'mpg_select' variable. Then, the selected dataframe is passed
533
+ to the ggplot function specifying in the aesthetic method (aes) that 'displacement' (displ) should
534
+ be plotted in the 'x' axis and 'city mileage' should be on the 'y' axis. In the 'labs' layer we
535
+ pass the 'title' and 'subtitle' for the plot. To the basic plot 'g', geom\_jitter is added, that
536
+ plots cars from the same manufacturer with the same color (col=manufacturer) and the size of the
537
+ car point equal to its highway consumption (size = hwy). Finally, a last layer is plotted containing
538
+ a linear regression line (method = "lm") for every manufacturer.
539
+
540
+ ```{r bubble, dev='png'}
541
+ # load package and data
542
+ library(ggplot2)
543
+ data(mpg, package="ggplot2")
544
+
545
+ mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]
546
+
547
+ # Scatterplot
548
+ theme_set(theme_bw()) # pre-set the bw theme.
549
+ g <- ggplot(mpg_select, aes(displ, cty)) +
550
+ labs(subtitle="mpg: Displacement vs City Mileage",
551
+ title="Bubble chart")
552
+
553
+ g + geom_jitter(aes(col=manufacturer, size=hwy)) +
554
+ geom_smooth(aes(col=manufacturer), method="lm", se=F)
555
+ ```
556
+
557
+ ## Ruby chunks
558
+
559
+ Including a Ruby chunk is just as easy as including an R chunk in the document: just
560
+ change the name of the engine to 'ruby'. It is also possible to pass chunk options
561
+ to the Ruby engine; however, this version does not accept all the options that are
562
+ available to R chunks. Future versions will add those options.
563
+
564
+ ````
565
+ ```{ruby first_ruby_chunk}`r ''`
566
+ ```
567
+ ````
568
+
569
+ In this example, the ruby chunk is called 'first_ruby_chunk'. One important
570
+ aspect of chunk labels is that they cannot be duplicated. If a chunk label is
571
+ duplicated, gKnit will stop with an error.
572
+
573
+ In the following chunk, variables 'a', 'b' and 'c' are standard Ruby variables
574
+ and 'vec' and 'vec2' are two vectors created by calling the 'c' method on the
575
+ R module.
576
+
577
+ In Galaaz, the R module allows us to access R functions transparently. The 'c'
578
+ function in R, is a function that concatenates its arguments making a vector.
579
+
580
+ It
581
+ should be clear that there is no requirement in gknit to call or use any R
582
+ functions. gKnit will knit standard Ruby code, or even general text without
583
+ any code.
584
+
585
+ ```{ruby split_data}
586
+ a = [1, 2, 3]
587
+ b = "US$ 250.000"
588
+ c = "The 'outputs' function"
589
+
590
+ vec = R.c(1, 2, 3)
591
+ vec2 = R.c(10, 20, 30)
592
+ ```
593
+
594
+ In the next block, variables 'a', 'vec' and 'vec2' are used and printed.
595
+
596
+ ```{ruby split2}
597
+ puts a
598
+ puts vec * vec2
599
+ ```
600
+
601
+ Note that 'a' is a standard Ruby Array and 'vec' and 'vec2' are vectors that behave accordingly,
602
+ where multiplication works as expected.
603
+
604
+ ## Inline Ruby code
605
+
606
+ When using a Ruby chunk, the code and the output are formatted in blocks as seen above.
607
+ This formatting is not always desired. Sometimes, we want to have the results of the
608
+ Ruby evaluation included in the middle of a phrase. gKnit allows adding inline Ruby code
609
+ with the 'rb' engine. The following chunk specification will
610
+ create an inline Ruby text:
611
+
612
+ ````
613
+ This is some text with inline Ruby accessing variable 'b' which has value:
614
+ ```{rb puts "```{rb puts b}\n```"}
615
+ ```
616
+ and is followed by some other text!
617
+ ````
618
+
619
+ <div style="margin-bottom:30px;">
620
+ </div>
621
+
622
+ This is some text with inline Ruby accessing variable 'b' which has value:
623
+ ```{rb puts b}
624
+ ```
625
+ and is followed by some other text!
626
+
627
+ <div style="margin-bottom:30px;">
628
+ </div>
629
+
630
+ Note that it is important not to add any new line before or after the code
631
+ block if we want everything to be in only one line, resulting in the following sentence
632
+ with inline Ruby code.
633
+
634
+
635
+ ```{ruby heading, echo = FALSE}
636
+ outputs "### #{c}"
637
+ ```
638
+
639
+ We have previously used the standard 'puts' method in Ruby chunks in order to produce
640
+ output. The result of a 'puts', as seen in all previous chunks that use it, is formatted
641
+ inside a white box that
642
+ follows the code block. Many times however, we would like to do some processing in the
643
+ Ruby chunk and have the result of this processing generate an output that is
644
+ "included" in the document as if we had typed it in the __R markdown__ document.
645
+
646
+ For example, suppose we want to create a new heading in our document, but the heading
647
+ phrase is the result of some code processing: maybe it's the first line of a file we are
648
+ going to read. Method 'outputs' adds its output as if typed in the __R markdown__ document.
649
+
650
+ Take now a look at variable 'c' (it was defined in a previous block above) as
651
+ 'c = "The 'outputs' function"'. "The 'outputs' function" is actually the name of this
652
+ section and it was created using the 'outputs' function inside a Ruby chunk.
653
+
654
+ The ruby chunk to generate this heading is:
655
+
656
+ ````
657
+ ```{ruby heading}`r ''`
658
+ outputs "### #{c}"
659
+ ```
660
+ ````
661
+
662
+ The three '###' is the way we add a Heading 3 in __R markdown__.
663
+
664
+
665
+ ### HTML Output from Ruby Chunks
666
+
667
+ We've just seen the use of method 'outputs' to add text to the __R markdown__
668
+ document. This technique can also be used to add HTML code to the document. In
669
+ __R markdown__, any html code typed directly in the document will be properly rendered.
670
+ Here, for instance, is a table definition in HTML and its output in the document:
671
+
672
+ ```
673
+ <table style="width:100%">
674
+ <tr>
675
+ <th>Firstname</th>
676
+ <th>Lastname</th>
677
+ <th>Age</th>
678
+ </tr>
679
+ <tr>
680
+ <td>Jill</td>
681
+ <td>Smith</td>
682
+ <td>50</td>
683
+ </tr>
684
+ <tr>
685
+ <td>Eve</td>
686
+ <td>Jackson</td>
687
+ <td>94</td>
688
+ </tr>
689
+ </table>
690
+ ```
691
+ <div style="margin-bottom:30px;">
692
+ </div>
693
+
694
+ <table style="width:100%">
695
+ <tr>
696
+ <th>Firstname</th>
697
+ <th>Lastname</th>
698
+ <th>Age</th>
699
+ </tr>
700
+ <tr>
701
+ <td>Jill</td>
702
+ <td>Smith</td>
703
+ <td>50</td>
704
+ </tr>
705
+ <tr>
706
+ <td>Eve</td>
707
+ <td>Jackson</td>
708
+ <td>94</td>
709
+ </tr>
710
+ </table>
711
+
712
+ <div style="margin-bottom:30px;">
713
+ </div>
714
+
715
+ But manually creating HTML output is not always easy or desirable, especially
716
+ if we intend the document to be rendered in other formats, for example, as Latex.
717
+ Also, the above
718
+ table looks ugly. The 'kableExtra' library is a great library for
719
+ creating beautiful tables. Take a look at https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html
720
+
721
+ In the next chunk, we output the 'mtcars' dataframe from R in a nicely formatted
722
+ table. Note that we retrieve the mtcars dataframe by using '~:mtcars'.
723
+
724
+ ```{ruby nice_table}
725
+ R.install_and_loads('kableExtra')
726
+ outputs (~:mtcars).kable.kable_styling
727
+ ```
728
+
729
+ ## Including Ruby files in a chunk
730
+
731
+ R is a language that was created to be easy and fast for statisticians to use. As far
732
+ as I know, it was not a
733
+ language to be used for developing large systems. Of course, there are large systems and
734
+ libraries in R, but the focus of the language is for developing statistical models and
735
+ distribute that to peers.
736
+
737
+ Ruby on the other hand, is a language for large software development. Systems written in
738
+ Ruby will have dozens, hundreds or even thousands of files. To document a
739
+ large system with literate programming, we cannot expect the developer to add all the
740
+ files in a single '.Rmd' file. gKnit provides the 'include' chunk engine to include
741
+ a Ruby file as if it had been typed in the '.Rmd' file.
742
+
743
+ To include a file, the following chunk should be created, where <filename> is the name of
744
+ the file to be included and where the extension, if it is '.rb', does not need to be added.
745
+ If the 'relative' option is not included, then it is treated as TRUE. When 'relative' is
746
+ true, ruby's 'require\_relative' semantics is used to load the file, when false, Ruby's
747
+ \$LOAD_PATH is searched to find the file and it is 'require'd.
748
+
749
+ ````
750
+ ```{include <filename>, relative = <TRUE/FALSE>}`r ''`
751
+ ```
752
+ ````
753
+
754
+ Below we include file 'model.rb', which is in the same directory as this blog.
755
+ This code uses R 'caret' package to split a dataset in a train and test sets.
756
+ The 'caret' package is a very important and useful package for doing Data Analysis,
757
+ it has hundreds of functions for all steps of the Data Analysis workflow. To
758
+ use 'caret' just to split a dataset is like using the proverbial cannon to
759
+ kill the fly. We use it here only to show that integrating Ruby and R and
760
+ using even a very complex package as 'caret' is trivial with Galaaz.
761
+
762
+ A word of advice: the 'caret' package has lots of dependencies and installing
763
+ it in a Linux system is a time consuming operation. Method 'R.install_and_loads'
764
+ will install the package if it is not already installed and can take a while.
765
+
766
+ ````
767
+ ```{include model}`r ''`
768
+ ```
769
+ ````
770
+
771
+ ```{include model}
772
+ ```
773
+
774
+ ```{ruby model_partition}
775
+ mtcars = ~:mtcars
776
+ model = Model.new(mtcars, percent_train: 0.8)
777
+ model.partition(:mpg)
778
+ puts model.train.head
779
+ puts model.test.head
780
+ ```
781
+
782
+ ## Documenting Gems
783
+
784
+ gKnit also allows developers to document and load files that are not in the same directory
785
+ of the '.Rmd' file.
786
+
787
+ Here is an example of loading the 'find.rb' file from TruffleRuby. In this example, relative
788
+ is set to FALSE, so Ruby will look for the file in its $LOAD\_PATH, and the user does not
789
+ need to know its directory.
790
+
791
+ ````
792
+ ```{include find, relative = FALSE}`r ''`
793
+ ```
794
+ ````
795
+
796
+ ```{include find, relative = FALSE}
797
+ ```
798
+
799
+ ## Converting to PDF
800
+
801
+ One of the beauties of knitr is that the same input can be converted to many different outputs.
802
+ One very useful format is, of course, PDF. In order to convert an __R markdown__ file to PDF
803
+ it is necessary to have LaTeX installed on the system. We will not explain here how to
804
+ install LaTeX as there are plenty of documents on the web showing how to proceed.
805
+
806
+ gKnit comes with a simple LaTeX style file for gknitting this blog as a PDF document. Here is
807
+ the Yaml header to generate this blog in PDF format instead of HTML:
808
+
809
+ ```
810
+ ---
811
+ title: "gKnit - Ruby and R Knitting with Galaaz in GraalVM"
812
+ author: "Rodrigo Botafogo"
813
+ tags: [Galaaz, Ruby, R, TruffleRuby, FastR, GraalVM, knitr, gknit]
814
+ date: "29 October 2018"
815
+ output:
816
+ pdf\_document:
817
+ includes:
818
+ in\_header: ["../../sty/galaaz.sty"]
819
+ number\_sections: yes
820
+ ---
821
+ ```
822
+
823
+ ## Template based documents generation
824
+
825
+ When a document is converted to PDF it follows a certain conversion template. We've seen above
826
+ the use of 'galaaz.sty' as a basic template to generate a PDF document. Using the
827
+ 'gknit-draft' app that comes with Galaaz, the same .Rmd file can be compiled to different
828
+ looking PDF documents. Galaaz automatically loads the 'rticles' R package that comes with
829
+ templates for the following journals with the respective template name:
830
+
831
+ * ACM articles: acm_article
832
+ * ACS articles: acs_article
833
+ * AEA journal submissions: aea_article
834
+ * AGU journal submissions: ????
835
+ * AMS articles: ams_article
836
+ * American Statistical Association: asa_article
837
+ * Biometrics articles: biometrics_article
838
+ * Bulletin de l'AMQ journal submissions: amq_article
839
+ * CTeX documents: ctex
840
+ * Elsevier journal submissions: elsevier_article
841
+ * IEEE Transaction journal submissions: ieee_article
842
+ * JSS articles: jss_article
843
+ * MDPI journal submissions: mdpi_article
844
+ * Monthly Notices of the Royal Astronomical Society articles: mnras_article
845
+ * NNRAS journal submissions: nmras_article
846
+ * PeerJ articles: peerj_article
847
+ * Royal Society Open Science journal submissions: rsos_article
848
+ * Royal Statistical Society: rss_article
849
+ * Sage journal submissions: sage_article
850
+ * Springer journal submissions: springer_article
851
+ * Statistics in Medicine journal submissions: sim_article
852
+ * Copernicus Publications journal submissions: copernicus_article
853
+ * The R Journal articles: rjournal_article
854
+ * Frontiers articles: ???
855
+ * Taylor & Francis articles: ???
856
+ * Bulletin De L'AMQ: amq_article
857
+ * PLOS journal: plos_article
858
+ * Proceedings of the National Academy of Sciences of the USA: pnas_article
859
+
860
+ In order to create a document with one of those templates, use the following command:
861
+
862
+ ```
863
+ gknit-draft --filename <my_document> --template <template> --package <package>
864
+ --create_dir
865
+ ```
866
+ So, in order to create a template for writing an R Journal, use:
867
+
868
+ ```
869
+ gknit-draft --filename my_r_article --template rjournal_article --package rticles
870
+ --create_dir
871
+ ```
872
+
873
+ # Accessing R variables
874
+
875
+ Galaaz allows Ruby to access variables created in R. For example, the 'mtcars' data set is
876
+ available in R and can be accessed from Ruby by using the 'tilde' operator followed by the
877
+ symbol for the variable, in this case ':mtcars'. In the code below, method 'outputs' is
878
+ used to output the 'mtcars' data set nicely formatted in HTML by use of the 'kable' and
879
+ 'kable_styling' functions. Method 'outputs' is only available when used with 'gknit'.
880
+
881
+ ```{ruby view_kable}
882
+ outputs (~:mtcars).kable.kable_styling
883
+ ```
884
+
885
+ # Basic Data Types
886
+
887
+ ## Vector
99
888
 
100
889
  Vectors can be thought of as contiguous cells containing data. Cells are accessed through
101
890
  indexing operations such as x[5]. Galaaz has six basic (‘atomic’) vector types: logical,
@@ -120,20 +909,22 @@ vector is often referred to as a character string.
120
909
  To create a vector the 'c' (concatenate) method from the 'R' module should be used:
121
910
 
122
911
  ```{ruby integer}
123
- @vec = R.c(1, 2, 3)
124
- puts @vec
912
+ vec = R.c(1, 2, 3)
913
+ puts vec
125
914
  ```
126
915
 
127
- Lets take a look at the type, mode and storage.mode of our vector @vec. In order to print
916
+ Let's take a look at the type, mode and storage.mode of our vector vec. In order to print
128
917
  this out, we are creating a data frame 'df' and printing it out. A data frame, for those
129
- not familiar with it, it basically a table. Here we create the data frame and add the
918
+ not familiar with it, is basically a table. Here we create the data frame and add the
130
919
  column name by passing named parameters for each column, such as 'typeof:', 'mode:' and
131
- 'storage__mode'. You should also note here that the double underscore is converted to a '.'.
920
+ 'storage__mode?'. You should also note here that the double underscore is converted to a '.'.
921
+ So, when printed 'storage\_\_mode' will actually print as 'storage.mode'.
132
922
 
133
- In R, the method used to create a data frame is 'data.frame', in Galaaz we use 'data__frame'.
923
+ Data frames will later be more carefully described. In R, the method used to create a
924
+ data frame is 'data.frame', in Galaaz we use 'data\_\_frame'.
134
925
 
135
926
  ```{ruby typeof_integer}
136
- df = R.data__frame(typeof: @vec.typeof, mode: @vec.mode, storage__mode: @vec.storage__mode)
927
+ df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
137
928
  puts df
138
929
  ```
139
930
 
@@ -143,12 +934,12 @@ like '1' is converted to float and to have an integer the R developer will use '
143
934
  follows normal Ruby rules and the number 1 is an integer and 1.0 is a float.
144
935
 
145
936
  ```{ruby float}
146
- @vec = R.c(1.0, 2, 3)
147
- puts @vec
937
+ vec = R.c(1.0, 2, 3)
938
+ puts vec
148
939
  ```
149
940
 
150
941
  ```{ruby typeof_float}
151
- df = R.data__frame(typeof: @vec.typeof, mode: @vec.mode, storage__mode: @vec.storage__mode)
942
+ df = R.data__frame(typeof: vec.typeof, mode: vec.mode, storage__mode: vec.storage__mode)
152
943
  outputs df.kable.kable_styling
153
944
  ```
154
945
 
@@ -161,47 +952,1101 @@ of the error.
161
952
  vec = R.c(1, hello, 5)
162
953
  ```
163
954
 
164
- ```{ruby view_kable}
165
- outputs (~:mtcars).kable.kable_styling
955
+ Here is a vector with logical values
956
+
957
+ ```{ruby logical_vector}
958
+ vec = R.c(true, true, false, false, true)
959
+ puts vec
166
960
  ```
167
961
 
962
+ ### Combining Vectors
168
963
 
169
- ## Graphics with ggplot
964
+ The 'c' function used to create vectors can also be used to combine two vectors:
170
965
 
171
- ```{ruby diverging_bar}
172
- require 'ggplot'
966
+ ```{ruby combining_vectors}
967
+ vec1 = R.c(10.0, 20.0, 30.0)
968
+ vec2 = R.c(4.0, 5.0, 6.0)
969
+ vec = R.c(vec1, vec2)
970
+ puts vec
971
+ ```
972
+ In Galaaz, methods can be chained (somewhat like the pipe operator in R %>%, but more generic).
973
+ In this next example, method 'c' is chained after 'vec1'. This also looks like 'c' is a
974
+ method of the vector, but in reality, this is actually closer to the pipe operator. When
975
+ Galaaz identifies that 'c' is not a method of 'vec1' it actually tries to call 'R.c' with
976
+ 'vec1' as the first argument concatenated with all the other available arguments. The code
977
+ below is automatically converted to the code above.
173
978
 
174
- R.theme_set R.theme_bw
979
+ ```{ruby chainning_methods}
980
+ vec = vec1.c(vec2)
981
+ puts vec
982
+ ```
175
983
 
176
- # Data Prep
177
- mtcars = ~:mtcars
178
- mtcars.car_name = R.rownames(:mtcars)
179
- # compute normalized mpg
180
- mtcars.mpg_z = ((mtcars.mpg - mtcars.mpg.mean)/mtcars.mpg.sd).round 2
181
- mtcars.mpg_type = mtcars.mpg_z < 0 ? "below" : "above"
182
- mtcars = mtcars[mtcars.mpg_z.order, :all]
183
- # convert to factor to retain sorted order in plot
184
- mtcars.car_name = mtcars.car_name.factor levels: mtcars.car_name
984
+ ### Vector Arithmetic
185
985
 
186
- # Diverging Barcharts
187
- gg = mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
188
- R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
189
- R.scale_fill_manual(name: "Mileage",
190
- labels: R.c("Above Average", "Below Average"),
191
- values: R.c("above": "#00ba38", "below": "#f8766d")) +
192
- R.labs(subtitle: "Normalised mileage from 'mtcars'",
193
- title: "Diverging Bars") +
194
- R.coord_flip()
986
+ Arithmetic operations on vectors are performed element by element:
195
987
 
196
- puts gg
988
+ ```{ruby vec_arith1}
989
+ puts vec1 + vec2
197
990
  ```
198
991
 
992
+ ```{ruby mult}
993
+ puts vec1 * 5
994
+ ```
199
995
 
200
- [TO BE CONTINUED...]
996
+ When vectors have different length, a recycling rule is applied to the shorter vector:
201
997
 
998
+ ```{ruby recycle}
999
+ vec3 = R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)
1000
+ puts vec4 = vec1 + vec3
1001
+ ```
202
1002
 
203
- # Contributing
1003
+ ### Vector Indexing
1004
+
1005
+ Vectors can be indexed by using the '[]' operator:
1006
+
1007
+ ```{ruby index}
1008
+ puts vec4[3]
1009
+ ```
1010
+
1011
+ We can also index a vector with another vector. For example, in the code below, we take elements
1012
+ 1, 3, 5, and 7 from vec4:
1013
+
1014
+ ```{ruby index_by_vector}
1015
+ puts vec4[R.c(1, 3, 5, 7)]
1016
+ ```
1017
+
1018
+ Repeating an index and having indices out of order is valid code:
1019
+
1020
+ ```{ruby repeated_index}
1021
+ puts vec4[R.c(1, 3, 3, 1)]
1022
+ ```
1023
+
1024
+ It is also possible to index a vector with a negative number or negative vector. In these cases
1025
+ the indexed values are not returned:
1026
+
1027
+ ```{ruby neg_index}
1028
+ puts vec4[-3]
1029
+ puts vec4[-R.c(1, 3, 5, 7)]
1030
+ ```
1031
+
1032
+ If an index is out of range, a missing value (NA) will be reported.
1033
+
1034
+ ```{ruby out_of_range}
1035
+ puts vec4[30]
1036
+ ```
1037
+
1038
+ It is also possible to index a vector by range:
1039
+
1040
+ ```{ruby range}
1041
+ puts vec4[(2..5)]
1042
+ ```
1043
+
1044
+ Elements in a vector can be named using the 'names' attribute of a vector:
1045
+
1046
+ ```{ruby naming}
1047
+ full_name = R.c("Rodrigo", "A", "Botafogo")
1048
+ full_name.names = R.c("First", "Middle", "Last")
1049
+ puts full_name
1050
+ ```
1051
+
1052
+ Or it can also be named by using the 'c' function with named parameters:
1053
+
1054
+ ```{ruby named_param}
1055
+ full_name = R.c(First: "Rodrigo", Middle: "A", Last: "Botafogo")
1056
+ puts full_name
1057
+ ```
1058
+
1059
+ ### Extracting Native Ruby Types from a Vector
1060
+
1061
+ Vectors created with 'R.c' are of class R::Vector. You might have noticed that when indexing a
1062
+ vector, a new vector is returned, even if this vector has one single element. In order to use
1063
+ R::Vector with other ruby classes it might be necessary to extract the actual Ruby native type
1064
+ from the vector. In order to do this extraction the '>>' operator is used.
1065
+
1066
+ ```{ruby ruby_native}
1067
+ puts vec4
1068
+ puts vec4 >> 0
1069
+ puts vec4 >> 4
1070
+ ```
1071
+
1072
+ Note that indexing with '>>' starts at 0 and not at 1, also, we cannot do negative indexing.
1073
+
1074
+ ## Matrix
1075
+
1076
+ A matrix is a collection of elements organized as a two dimensional table. A matrix can be
1077
+ created by the 'matrix' function:
1078
+
1079
+ ```{ruby matrix}
1080
+ mat = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
1081
+ nrow: 3,
1082
+ ncol: 3)
1083
+
1084
+ puts mat
1085
+ ```
1086
+ Note that matrices data is organized by column first. It is possible to organize the matrix
1087
+ memory by row first passing an extra argument to the 'matrix' function:
1088
+
1089
+ ```{ruby matrix_rowfirst}
1090
+ mat_row = R.matrix(R.c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0),
1091
+ nrow: 3,
1092
+ ncol: 3,
1093
+ byrow: true)
1094
+
1095
+ puts mat_row
1096
+ ```
1097
+
1098
+ ### Indexing a Matrix
1099
+
1100
+ A matrix can be indexed by [row, column]:
1101
+
1102
+ ```{ruby matrix_index}
1103
+ puts mat_row[1, 1]
1104
+ puts mat_row[2, 3]
1105
+ ```
1106
+ It is possible to index an entire row or column with the ':all' keyword
1107
+
1108
+ ```{ruby matrix_index_all}
1109
+ puts mat_row[1, :all]
1110
+ puts mat_row[:all, 2]
1111
+ ```
1112
+
1113
+ Indexing with a vector is also possible for matrices. In the following example we want
1114
+ rows 1 and 3 and columns 2 and 3 building a 2 x 2 matrix.
1115
+
1116
+ ```{ruby matrix_index_vector}
1117
+ puts mat_row[R.c(1, 3), R.c(2, 3)]
1118
+ ```
1119
+
1120
+ Matrices can be combined with functions 'rbind':
1121
+
1122
+ ```{ruby matrix_combine_rbind}
1123
+ puts mat_row.rbind(mat)
1124
+ ```
1125
+
1126
+ and 'cbind':
1127
+
1128
+ ```{ruby matrix_combine_cbind}
1129
+ puts mat_row.cbind(mat)
1130
+ ```
1131
+
1132
+ ## List
1133
+
1134
+ A list is a data structure that can contain sublists of different types, while vector and matrix
1135
+ can only hold one type of element.
1136
+
1137
+ ```{ruby list}
1138
+ nums = R.c(1.0, 2.0, 3.0)
1139
+ strs = R.c("a", "b", "c", "d")
1140
+ bool = R.c(true, true, false)
1141
+ lst = R.list(nums: nums, strs: strs, bool: bool)
1142
+ puts lst
1143
+ ```
1144
+
1145
+ Note that 'lst' elements are named elements.
1146
+
1147
+
1148
+ ### List Indexing
1149
+
1150
+ List indexing, also called slicing, is done using the '[]' operator and the '[[]]' operator. Let's
1151
+ first start with the '[]' operator. The list above has three sublists; indexing with '[]' will
1152
+ return one of the sublists.
1153
+
1154
+ ```{ruby list_indexing}
1155
+ puts lst[1]
1156
+ ```
1157
+
1158
+ Note that when using '[]' a new list is returned. When using the double square bracket operator
1159
+ the value returned is the actual element of the list in the given position and not a slice of
1160
+ the original list
1161
+
1162
+
1163
+ ```{ruby list_indexing_single}
1164
+ puts lst[[1]]
1165
+ ```
1166
+
1167
+ When elements are named, as done with lst, indexing can be done by name:
1168
+
1169
+ ```{ruby list_indexing_by_name}
1170
+ puts lst[['bool']][[1]] >> 0
1171
+ ```
1172
+
1173
+ In this example, first the 'bool' element of the list was extracted, not as a list, but as a vector,
1174
+ then the first element of the vector was extracted (note that vectors also accept the '[[]]'
1175
+ operator) and then the vector was indexed by its first element, extracting the native Ruby type.
1176
+
1177
+
1178
+ ## Data Frame
1179
+
1180
+ A data frame is a table like structure in which each column has the same number of
1181
+ rows. Data frames are the basic structure for storing data for data analysis. We have already
1182
+ seen a data frame previously when we accessed variable '~:mtcars'. In order to create a
1183
+ data frame, function 'data__frame' is used:
1184
+
1185
+ ```{ruby dataframe}
1186
+ df = R.data__frame(
1187
+ year: R.c(2010, 2011, 2012),
1188
+ income: R.c(1000.0, 1500.0, 2000.0))
1189
+
1190
+ puts df
1191
+ ```
1192
+
1193
+ ### Data Frame Indexing
1194
+
1195
+ A data frame can be indexed the same way as a matrix, by using '[row, column]', where row and
1196
+ column can either be a numeric or the name of the row or column
1197
+
1198
+ ```{ruby dataframe_index}
1199
+ puts (~:mtcars).head
1200
+ puts (~:mtcars)[1, 2]
1201
+ puts (~:mtcars)['Datsun 710', 'mpg']
1202
+ ```
1203
+
1204
+ Extracting a column from a data frame as a vector can be done by using the double square bracket
1205
+ operator:
1206
+
1207
+ ```{ruby dataframe_column}
1208
+ puts (~:mtcars)[['mpg']]
1209
+ ```
1210
+
1211
+ A data frame column can also be accessed as if it were an instance variable of the data frame:
1212
+
1213
+ ```{ruby dataframe_instance_variable}
1214
+ puts (~:mtcars).mpg
1215
+ ```
1216
+
1217
+ Slicing a data frame can be done by indexing it with a vector (we use 'head' to reduce the
1218
+ output):
1219
+
1220
+ ```{ruby dataframe_column_slice}
1221
+ puts (~:mtcars)[R.c('mpg', 'hp')].head
1222
+ ```
1223
+
1224
+ A row slice can be obtained by indexing by row and using the ':all' keyword for the column:
1225
+
1226
+ ```{ruby dataframe_row_slice}
1227
+ puts (~:mtcars)[R.c('Datsun 710', 'Camaro Z28'), :all]
1228
+ ```
1229
+
1230
+ Finally, a data frame can also be indexed with a logical vector. In this next example, the
1231
+ 'am' column of :mtcars is compared with 0 (with method 'eq'). When 'am' is equal to 0 the
1232
+ car is automatic. So, by doing '(~:mtcars).am.eq 0' a logical vector is created with
1233
+ 'true' whenever 'am' is 0 and 'false' otherwise.
1234
+
1235
+ ```{ruby logical_vector_filter}
1236
+ # obtain a vector with 'true' for cars with automatic transmission
1237
+ automatic = (~:mtcars).am.eq 0
1238
+ puts automatic
1239
+ ```
1240
+
1241
+ Using this logical vector, the data frame is indexed, returning a new data frame in
1242
+ which all cars have automatic transmission.
1243
+
1244
+ ```{ruby dataframe_logical}
1245
+ # slice the data frame by using this vector
1246
+ puts (~:mtcars)[automatic, :all]
1247
+ ```
1248
+
1249
+ # Writing Expressions in Galaaz
1250
+
1251
+ Galaaz extends Ruby to work with complex expressions, similar to R's expressions built with 'quote'
1252
+ (base R) or 'quo' (tidyverse). Let's take a look at some of those expressions.
1253
+
1254
+ ## Expressions from operators
1255
+
1256
+ The code below
1257
+ creates an expression summing two symbols
1258
+
1259
+ ```{ruby expressions}
1260
+ exp1 = :a + :b
1261
+ puts exp1
1262
+ ```
1263
+ We can build any complex mathematical expression
1264
+
1265
+ ```{ruby expr2}
1266
+ exp2 = (:a + :b) * 2.0 + :c ** 2 / :z
1267
+ puts exp2
1268
+ ```
1269
+
1270
+ It is also possible to use inequality operators in building expressions
1271
+
1272
+ ```{ruby expr3}
1273
+ exp3 = (:a + :b) >= :z
1274
+ puts exp3
1275
+ ```
1276
+
1277
+ Galaaz provides both symbolic representations for operators, such as (>, <, !=), and functional
1278
+ notation for those operators such as (.gt, .ge, etc.). So the same expression written
1279
+ above can also be written as
1280
+
1281
+ ```{ruby expr4}
1282
+ exp4 = (:a + :b).ge :z
1283
+ puts exp4
1284
+ ```
1285
+
1286
+ Two types of expressions can only be created with the functional representation of the operators,
1287
+ those are expressions involving '==', and '='. In order to write an expression involving '==' we
1288
+ need to use the method '.eq' and for '=' we need the function '.assign'
1289
+
1290
+ ```{ruby expr5}
1291
+ exp5 = (:a + :b).eq :z
1292
+ puts exp5
1293
+ ```
1294
+
1295
+ ```{ruby expr6}
1296
+ exp6 = :y.assign :a + :b
1297
+ puts exp6
1298
+ ```
1299
+ In general we think that using the functional notation is preferable to using the
1300
+ symbolic notation as otherwise, we end up writing invalid expressions such as
1301
+
1302
+ ```{ruby exp_wrong, warning=FALSE, eval=FALSE}
1303
+ exp_wrong = (:a + :b) == :z
1304
+ puts exp_wrong
1305
+ ```
1306
+ and it might be difficult to understand what is going on here. The problem lies with the fact that
1307
+ when using '==' we are comparing expression (:a + :b) to expression :z with '=='. When the
1308
+ comparison is executed, the system tries to evaluate :a, :b and :z, and those symbols at
1309
+ this time are not bound to anything and we get a "object 'a' not found" message.
1310
+ If we only use functional notation, this type of error will not occur.
1311
+
1312
+ ## Expressions with R methods
1313
+
1314
+ It is often necessary to create an expression that uses a method or function. For instance, in
1315
+ mathematics, it's quite natural to write an expression such as $y = sin(x)$. In this case, the
1316
+ 'sin' function is part of the expression and should not be immediately executed. Now, let's say
1317
+ that 'x' is an angle of 45$^\circ$ and we actually want our expression to be $y = 0.707...$.
1318
+ When we want the function to be part of the expression, we call the function preceding it
1319
+ by the letter E, such as 'E.sin(x)'
1320
+
1321
+ ```{ruby method_expression}
1322
+ exp7 = :y.assign E.sin(:x)
1323
+ puts exp7
1324
+ ```
1325
+
1326
+ Expressions can also be written using '.' notation:
1327
+
1328
+ ```{ruby expression_with_dot}
1329
+ exp8 = :y.assign :x.sin
1330
+ puts exp8
1331
+ ```
1332
+
1333
+ When a function has multiple arguments, the first one can be used before the '.':
1334
+
1335
+ ```{ruby expression_multiple_args}
1336
+ exp9 = :x.c(:y)
1337
+ puts exp9
1338
+ ```
1339
+
1340
+ ## Evaluating an Expression
1341
+
1342
+ Expressions can be evaluated by calling function 'eval' with a binding. A binding can be provided
1343
+ with a list:
1344
+
1345
+ ```{ruby eval_expression_list}
1346
+ exp = (:a + :b) * 2.0 + :c ** 2 / :z
1347
+ puts exp.eval(R.list(a: 10, b: 20, c: 30, z: 40))
1348
+ ```
1349
+
1350
+ ... with a data frame:
1351
+
1352
+ ```{ruby eval_expression_df}
1353
+ df = R.data__frame(
1354
+ a: R.c(1, 2, 3),
1355
+ b: R.c(10, 20, 30),
1356
+ c: R.c(100, 200, 300),
1357
+ z: R.c(1000, 2000, 3000))
1358
+
1359
+ puts exp.eval(df)
1360
+ ```
1361
+
1362
+ # Manipulating Data
1363
+
1364
+ One of the major benefits of Galaaz is to bring strong data manipulation to Ruby. The following
1365
+ examples were extracted from Hadley's "R for Data Science" (https://r4ds.had.co.nz/). This
1366
+ is a highly recommended book for those not already familiar with the 'tidyverse' style of
1367
+ programming in R. In the sections to follow, we will limit ourselves to converting the R code to
1368
+ Galaaz.
1369
+
1370
+ For these
1371
+ examples, we will investigate the nycflights13 data set available on the package by the
1372
+ same name. We use function 'R.install\_and\_loads' that checks if the library is available
1373
+ locally, and if not, installs it. This data frame contains all 336,776 flights that
1374
+ departed from New York City in 2013. The data comes from the US Bureau of
1375
+ Transportation Statistics.
1376
+
1377
+ Dplyr uses 'tibbles' in place of data frames; unfortunately, tibbles do not yet print properly in
1378
+ Galaaz due to a bug in fastR. In order to print a tibble we need to convert it to a data frame
1379
+ using the 'as\_\_data__frame' method.
1380
+
1381
+ ```{ruby nycflights13}
1382
+ R.install_and_loads('nycflights13')
1383
+ R.library('dplyr')
1384
+ ```
1385
+
1386
+ ```{ruby flights}
1387
+ flights = ~:flights
1388
+ puts flights.head
1389
+ ```
1390
+
1391
+ ## Filtering rows with Filter
1392
+
1393
+ In this example we filter the flights data set by giving to the filter function two expressions:
1394
+ the first is ':month.eq 1' and the second is ':day.eq 1'.
1395
+
1396
+ ```{ruby filter_rows}
1397
+ puts flights.filter((:month.eq 1), (:day.eq 1)).head
1398
+ ```
1399
+
1400
+ ## Logical Operators
1401
+
1402
+ All flights that departed in November or December
1403
+
1404
+ ```{ruby nov_dec}
1405
+ puts flights.filter((:month.eq 11) | (:month.eq 12)).head
1406
+ ```
1407
+
1408
+ The same as above, but using the 'in' operator. In R, it is possible to define many operators
1409
+ by doing %<op>%. The %in% operator checks if a value is in a vector. In order to use those
1410
+ operators from Galaaz the '._' method is used, where the first argument is the operator's
1411
+ symbol, in this case ':in' and the second argument is the vector:
1412
+
1413
+ ```{ruby in_op}
1414
+ puts flights.filter(:month._ :in, R.c(11, 12)).head
1415
+ ```
1416
+
1417
+ ## Filtering with NA (Not Available)
1418
+
1419
+ Let's first create a 'tibble' with a Not Available value (R::NA). Tibbles are a modern
1420
+ version of a data frame and operate very similarly to one. It differs in how it outputs
1421
+ the values and the result of some subsetting operations that are more consistent than
1422
+ what is obtained from data frame.
1423
+
1424
+ ```{ruby na_tibble}
1425
+ df = R.tibble(x: R.c(1, R::NA, 3))
1426
+ puts df
1427
+ ```
1428
+
1429
+ Now filtering by :x > 1 shows all lines that satisfy this condition, where the row with R::NA does
1430
+ not.
1431
+
1432
+ ```{ruby filter_na}
1433
+ puts df.filter(:x > 1)
1434
+ ```
1435
+
1436
+ To match an NA use method 'is__na'
1437
+
1438
+ ```{ruby with_na}
1439
+ puts df.filter((:x.is__na) | (:x > 1))
1440
+ ```
1441
+
1442
+ ## Arrange Rows with arrange
1443
+
1444
+ Arrange reorders the rows of a data frame by the given arguments.
1445
+
1446
+ ```{ruby arrange}
1447
+ puts flights.arrange(:year, :month, :day).head
1448
+ ```
1449
+
1450
+ To arrange in descending order, use function 'desc'
1451
+
1452
+ ```{ruby desc_arrange}
1453
+ puts flights.arrange(:dep_delay.desc).head
1454
+ ```
1455
+
1456
+ ## Selecting columns
1457
+
1458
+ To select specific columns from a dataset we use function 'select':
1459
+
1460
+ ```{ruby select}
1461
+ puts flights.select(:year, :month, :day).head
1462
+ ```
1463
+
1464
+ It is also possible to select columns in a given range
1465
+
1466
+ ```{ruby select_range}
1467
+ puts flights.select(:year.up_to :day).head
1468
+ ```
1469
+
1470
+ Select all columns that start with a given name sequence
204
1471
 
1472
+ ```{ruby select_starts_with}
1473
+ puts flights.select(E.starts_with('arr')).head
1474
+ ```
1475
+
1476
+ Other functions that can be used:
1477
+
1478
+ * ends_with("xyz"): matches names that end with “xyz”.
1479
+
1480
+ * contains("ijk"): matches names that contain “ijk”.
1481
+
1482
+ * matches("(.)\\1"): selects variables that match a regular expression. This one matches
1483
+ any variables that contain repeated characters.
1484
+
1485
+ * num_range("x", (1..3)): matches x1, x2 and x3
1486
+
1487
+ A helper function that comes in handy when we just want to rearrange column order is 'everything':
1488
+
1489
+ ```{ruby everything}
1490
+ puts flights.select(:year, :month, :day, E.everything).head
1491
+ ```
1492
+
1493
+ ## Add variables to a dataframe with 'mutate'
1494
+
1495
+ ```{ruby small_flights}
1496
+ flights_sm = flights.
1497
+ select((:year.up_to :day),
1498
+ E.ends_with('delay'),
1499
+ :distance,
1500
+ :air_time)
1501
+
1502
+ puts flights_sm.head
1503
+ ```
1504
+
1505
+ ```{ruby mutate}
1506
+ flights_sm = flights_sm.
1507
+ mutate(gain: :dep_delay - :arr_delay,
1508
+ speed: :distance / :air_time * 60)
1509
+ puts flights_sm.head
1510
+ ```
1511
+
1512
+ ## Summarising data
1513
+
1514
+ Function 'summarise' calculates summaries for the data frame. When no 'group_by' is used
1515
+ a single value is obtained from the data frame:
1516
+
1517
+ ```{ruby summarise}
1518
+ puts flights.summarise(delay: E.mean(:dep_delay, na__rm: true))
1519
+ ```
1520
+
1521
+ When a data frame is grouped with 'group_by' summaries apply to the given group:
1522
+
1523
+ ```{ruby summarise_group_by}
1524
+ by_day = flights.group_by(:year, :month, :day)
1525
+ puts by_day.summarise(delay: :dep_delay.mean(na__rm: true)).head
1526
+ ```
1527
+
1528
+ Next we put many operations together by piping them one after the other:
1529
+
1530
+ ```{ruby pipping}
1531
+ delays = flights.
1532
+ group_by(:dest).
1533
+ summarise(
1534
+ count: E.n,
1535
+ dist: :distance.mean(na__rm: true),
1536
+ delay: :arr_delay.mean(na__rm: true)).
1537
+ filter(:count > 20, :dest != "NHL")
1538
+
1539
+ puts delays.head
1540
+ ```
1541
+
1542
+ # Using Data Table
1543
+
1544
+ ```{ruby fread}
1545
+ R.library('data.table')
1546
+ R.install_and_loads('curl')
1547
+
1548
+ input = "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
1549
+ flights = R.fread(input)
1550
+ puts flights
1551
+ puts flights.dim
1552
+ ```
1553
+
1554
+ ```{ruby data_table}
1555
+
1556
+ data_table = R.data__table(
1557
+ ID: R.c("b","b","b","a","a","c"),
1558
+ a: (1..6),
1559
+ b: (7..12),
1560
+ c: (13..18)
1561
+ )
1562
+
1563
+ puts data_table
1564
+ puts data_table.ID
1565
+ ```
1566
+
1567
+ ```{ruby subset_i}
1568
+ # subset rows in i
1569
+ ans = flights[(:origin.eq "JFK") & (:month.eq 6)]
1570
+ puts ans.head
1571
+
1572
+ # Get the first two rows from flights.
1573
+
1574
+ ans = flights[(1..2)]
1575
+ puts ans
1576
+
1577
+ # Sort flights first by column origin in ascending order, and then by dest in descending order:
1578
+
1579
+ # ans = flights[E.order(:origin, -(:dest))]
1580
+ # puts ans.head
1581
+
1582
+ ```
1583
+
1584
+ ```{ruby select_j}
1585
+ # Select column(s) in j
1586
+ # select arr_delay column, but return it as a vector.
1587
+
1588
+ ans = flights[:all, :arr_delay]
1589
+ puts ans.head
1590
+
1591
+ # Select arr_delay column, but return as a data.table instead.
1592
+
1593
+ ans = flights[:all, :arr_delay.list]
1594
+ puts ans.head
1595
+
1596
+ ans = flights[:all, E.list(:arr_delay, :dep_delay)]
1597
+ ```
1598
+
1599
+ # Graphics in Galaaz
1600
+
1601
+ Creating graphics in Galaaz is quite easy, as it can use all the power of ggplot2. There are
1602
+ many resources on the web that teach ggplot, so here we give a quick example of ggplot
1603
+ integration with Ruby. We continue to use the :mtcars dataset and we will plot a diverging
1604
+ bar plot, showing cars that have 'above' or 'below' average gas consumption. Let's first prepare
1605
+ the data frame with the necessary data:
1606
+
1607
+ ```{ruby diverging_plot_pre}
1608
+ # copy the R variable :mtcars to the Ruby mtcars variable
1609
+ mtcars = ~:mtcars
1610
+
1611
+ # create a new column 'car_name' to store the car names so that it can be
1612
+ # used for plotting. The 'rownames' of the data frame cannot be used as
1613
+ # data for plotting
1614
+ mtcars.car_name = R.rownames(:mtcars)
1615
+
1616
+ # compute normalized mpg and add it to a new column called mpg_z
1617
+ # Note that the mean value for mpg can be obtained by calling the 'mean'
1618
+ # function on the vector 'mtcars.mpg'. The same with the standard
1619
+ # deviation 'sd'. The vector is then rounded to two digits with 'round 2'
1620
+ mtcars.mpg_z = ((mtcars.mpg - mtcars.mpg.mean)/mtcars.mpg.sd).round 2
1621
+
1622
+ # create a new column 'mpg_type'. Function 'ifelse' is a vectorized function
1623
+ # that looks at every element of the mpg_z vector and if the value is below
1624
+ # 0, returns 'below', otherwise returns 'above'
1625
+ mtcars.mpg_type = (mtcars.mpg_z < 0).ifelse("below", "above")
1626
+
1627
+ # order the mtcar data set by the mpg_z vector from smaler to larger values
1628
+ mtcars = mtcars[mtcars.mpg_z.order, :all]
1629
+
1630
+ # convert the car_name column to a factor to retain sorted order in plot
1631
+ mtcars.car_name = mtcars.car_name.factor levels: mtcars.car_name
1632
+
1633
+ # let's look at the final data frame
1634
+ puts mtcars.head
1635
+ ```
1636
+ Now, let's plot the diverging bar plot. When using gKnit, there is no need to call
1637
+ 'R.awt' to create a plotting device, since gKnit does take care of it. Galaaz
1638
+ provides integration with ggplot. The interested reader should check online for more
1639
+ information on ggplot, since it is outside the scope of this manual describing
1640
+ how ggplot works. We give here but a brief description on how this plot is generated.
1641
+
1642
+ ggplot implements the 'grammar of graphics'. In this approach, plots are built by
1643
+ adding layers to the plot. On the first layer we describe what we want on the 'x'
1644
+ and 'y' axis of the plot. In this case, we have 'car_name' on the 'x' axis and
1645
+ 'mpg\_z' on the 'y' axis. Then the type of graph is specified by adding
1646
+ 'geom\_bar' (for a bar graph). We specify that our bars should be filled using
1647
+ 'mpg\_type', which is either 'above' or 'below' giving then two colours for
1648
+ filling. On the next layer we specify the labels for the graph, then we add the
1649
+ title and subtitle. Finally, in a bar chart usually bars go on the vertical direction,
1650
+ but in this graph we want the bars to be laid out horizontally so we add 'coord\_flip'.
1651
+
1652
+ ```{ruby diverging_bar, fig.width = 9.1, fig.height = 6.5}
1653
+ require 'ggplot'
1654
+
1655
+ puts mtcars.ggplot(E.aes(x: :car_name, y: :mpg_z, label: :mpg_z)) +
1656
+ R.geom_bar(E.aes(fill: :mpg_type), stat: 'identity', width: 0.5) +
1657
+ R.scale_fill_manual(name: 'Mileage',
1658
+ labels: R.c('Above Average', 'Below Average'),
1659
+ values: R.c('above': '#00ba38', 'below': '#f8766d')) +
1660
+ R.labs(subtitle: "Normalised mileage from 'mtcars'",
1661
+ title: "Diverging Bars") +
1662
+ R.coord_flip
1663
+ ```
1664
+
1665
+ # Coding with Tidyverse
1666
+
1667
+ In R, and when coding with 'tidyverse', arguments to a function are usually not
1668
+ *referentially transparent*. That is, you can’t replace a value with a seemingly equivalent
1669
+ object that you’ve defined elsewhere. To see the problem, let's first define a data frame:
1670
+
1671
+ ```{ruby df}
1672
+ df = R.data__frame(x: (1..3), y: (3..1))
1673
+ puts df
1674
+ ```
1675
+
1676
+ and now, let's look at this code:
1677
+
1678
+ ```{r not_transp, eval=FALSE}
1679
+ my_var <- x
1680
+ filter(df, my_var == 1)
1681
+ ```
1682
+ It generates the following error: "object 'x' not found".
1683
+
1684
+ However, in Galaaz, arguments are referentially transparent as can be seen by the
1685
+ code below. Note initially that 'my_var = :x' will not give the error "object 'x' not found"
1686
+ since ':x' is treated as an expression and assigned to my\_var. Then when doing (my\_var.eq 1),
1687
+ my\_var is a variable that resolves to ':x' and it becomes equivalent to (:x.eq 1) which is
1688
+ what we want.
1689
+
1690
+ ```{ruby my_var}
1691
+ my_var = :x
1692
+ puts df.filter(my_var.eq 1)
1693
+ ```
1694
+ As stated by Hadley
1695
+
1696
+ > dplyr code is ambiguous. Depending on what variables are defined where,
1697
+ > filter(df, x == y) could be equivalent to any of:
1698
+
1699
+ ```
1700
+ df[df$x == df$y, ]
1701
+ df[df$x == y, ]
1702
+ df[x == df$y, ]
1703
+ df[x == y, ]
1704
+ ```
1705
+ In galaaz this ambiguity does not exist, filter(df, x.eq y) is not a valid expression as
1706
+ expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements
1707
+ of the 'x' column that are equal to a previously defined y variable. Finally in
1708
+ filter(df, :x.eq :y) we are looking for elements in which the 'x' column value is equal to
1709
+ the 'y' column value. This can be seen in the following two chunks of code:
1710
+
1711
+ ```{ruby disamb1}
1712
+ y = 1
1713
+ x = 2
1714
+
1715
+ # looking for values where the 'x' column is equal to the 'y' column
1716
+ puts df.filter(:x.eq :y)
1717
+ ```
1718
+
1719
+ ```{ruby disamb2}
1720
+ # looking for values where the 'x' column is equal to the 'y' variable
1721
+ # in this case, the number 1
1722
+ puts df.filter(:x.eq y)
1723
+ ```
1724
+ ## Writing a function that applies to different data sets
1725
+
1726
+ Let's suppose that we want to write a function that receives as the first argument a data frame
1727
+ and as second argument an expression that adds a column to the data frame that is equal to the
1728
+ sum of elements in column 'a' plus 'x'.
1729
+
1730
+ Here is the intended behaviour using the 'mutate' function of 'dplyr':
1731
+
1732
+ ```
1733
+ mutate(df1, y = a + x)
1734
+ mutate(df2, y = a + x)
1735
+ mutate(df3, y = a + x)
1736
+ mutate(df4, y = a + x)
1737
+ ```
1738
+ The naive approach to writing an R function to solve this problem is:
1739
+
1740
+ ```
1741
+ mutate_y <- function(df) {
1742
+ mutate(df, y = a + x)
1743
+ }
1744
+ ```
1745
+ Unfortunately, in R, this function can fail silently if one of the variables isn’t present
1746
+ in the data frame, but is present in the global environment. We will not go through here how
1747
+ to solve this problem in R.
1748
+
1749
+ In Galaaz the method mutate_y below will work fine and will never fail silently.
1750
+
1751
+ ```{ruby mutate_y, warning=FALSE}
1752
+ def mutate_y(df)
1753
+ df.mutate(:y.assign :a + :x)
1754
+ end
1755
+ ```
1756
+ Here we create a data frame that has only one column named 'x':
1757
+
1758
+ ```{ruby data_frame_no_a_column, warning=FALSE}
1759
+ df1 = R.data__frame(x: (1..3))
1760
+ puts df1
1761
+ ```
1762
+
1763
+ Note that method mutate_y will fail independently of the fact that variable 'a' is defined and
1764
+ in the scope of the method. Variable 'a' has no relationship with the symbol ':a' used in the
1765
+ definition of 'mutate\_y' above:
1766
+
1767
+ ```{ruby call_mutate_y, warning = FALSE}
1768
+ a = 10
1769
+ mutate_y(df1)
1770
+ ```
1771
+ ## Different expressions
1772
+
1773
+ Let's move to the next problem as presented by Hadley where trying to write a function in R
1774
+ that will receive two arguments, the first a variable and the second an expression is not trivial.
1775
+ Below we create a data frame and we want to write a function that groups data by a variable and
1776
+ summarises it by an expression:
1777
+
1778
+ ```{r diff_expr}
1779
+ set.seed(123)
1780
+
1781
+ df <- data.frame(
1782
+ g1 = c(1, 1, 2, 2, 2),
1783
+ g2 = c(1, 2, 1, 2, 1),
1784
+ a = sample(5),
1785
+ b = sample(5)
1786
+ )
1787
+
1788
+ as.data.frame(df)
1789
+
1790
+ d2 <- df %>%
1791
+ group_by(g1) %>%
1792
+ summarise(a = mean(a))
1793
+
1794
+ as.data.frame(d2)
1795
+
1796
+ d2 <- df %>%
1797
+ group_by(g2) %>%
1798
+ summarise(a = mean(a))
1799
+
1800
+ as.data.frame(d2)
1801
+ ```
1802
+
1803
+ As shown by Hadley, one might expect this function to do the trick:
1804
+
1805
+ ```{r diff_exp_fnc}
1806
+ my_summarise <- function(df, group_var) {
1807
+ df %>%
1808
+ group_by(group_var) %>%
1809
+ summarise(a = mean(a))
1810
+ }
1811
+
1812
+ # my_summarise(df, g1)
1813
+ #> Error: Column `group_var` is unknown
1814
+ ```
1815
+
1816
+ In order to solve this problem, coding with dplyr requires the introduction of many new concepts
1817
+ and functions such as 'quo', 'quos', 'enquo', 'enquos', '!!' (bang bang), '!!!' (triple bang).
1818
+ Again, we'll leave to Hadley the explanation on how to use all those functions.
1819
+
1820
+ Now, let's try to implement the same function in galaaz. The next code block first prints the
1821
+ 'df' data frame defined previously in R (to access an R variable from Galaaz, we use the tilde
1822
+ operator '~' applied to the R variable name as symbol, i.e., ':df').
1823
+
1824
+ ```{ruby r_dataframe}
1825
+ puts ~:df
1826
+ ```
1827
+
1828
+ We then create the 'my_summarize' method and call it passing the R data frame and
1829
+ the group by variable ':g1':
1830
+
1831
+ ```{ruby diff_exp_ruby_func}
1832
+ def my_summarize(df, group_var)
1833
+ df.group_by(group_var).
1834
+ summarize(a: :a.mean)
1835
+ end
1836
+
1837
+ puts my_summarize(:df, :g1)
1838
+ ```
1839
+
1840
+ It works!!! Well, let's make sure this was not just some coincidence
1841
+
1842
+ ```{ruby group_g2}
1843
+ puts my_summarize(:df, :g2)
1844
+ ```
1845
+
1846
+ Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby
1847
+ code. If you've ever done NSE in R, this certainly feels much safer and easy to implement.
1848
+
1849
+ ## Different input variables
1850
+
1851
+ In the previous section we've managed to get rid of all NSE formulation for a simple example, but
1852
+ does this remain true for more complex examples, or will the Galaaz way prove impractical for
1853
+ more complex code?
1854
+
1855
+ In the next example Hadley proposes that we write a function that given an expression such as 'a'
1856
+ or 'a * b', calculates three summaries. What we want is a function that does the same as these R
1857
+ statements:
1858
+
1859
+ ```
1860
+ summarise(df, mean = mean(a), sum = sum(a), n = n())
1861
+ #> # A tibble: 1 x 3
1862
+ #> mean sum n
1863
+ #> <dbl> <int> <int>
1864
+ #> 1 3 15 5
1865
+
1866
+ summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
1867
+ #> # A tibble: 1 x 3
1868
+ #> mean sum n
1869
+ #> <dbl> <int> <int>
1870
+ #> 1 9 45 5
1871
+ ```
1872
+
1873
+ Let's try it in galaaz:
1874
+
1875
+ ```{ruby summarize_method}
1876
+ def my_summarise2(df, expr)
1877
+ df.summarize(
1878
+ mean: E.mean(expr),
1879
+ sum: E.sum(expr),
1880
+ n: E.n
1881
+ )
1882
+ end
1883
+
1884
+ puts my_summarise2((~:df), :a)
1885
+ puts "\n"
1886
+ puts my_summarise2((~:df), :a * :b)
1887
+ ```
1888
+
1889
+ Once again, there is no need to use any special theory or functions. The only point to be
1890
+ careful about is the use of 'E' to build expressions from functions 'mean', 'sum' and 'n'.
1891
+
1892
+ ## Different input and output variable
1893
+
1894
+ Now the next challenge presented by Hadley is to vary the name of the output variables based on
1895
+ the received expression. So, if the input expression is 'a', we want our data frame columns to
1896
+ be named 'mean\_a' and 'sum\_a'. Now, if the input expression is 'b', columns
1897
+ should be named 'mean\_b' and 'sum\_b'.
1898
+
1899
+ ```
1900
+ mutate(df, mean_a = mean(a), sum_a = sum(a))
1901
+ #> # A tibble: 5 x 6
1902
+ #> g1 g2 a b mean_a sum_a
1903
+ #> <dbl> <dbl> <int> <int> <dbl> <int>
1904
+ #> 1 1 1 1 3 3 15
1905
+ #> 2 1 2 4 2 3 15
1906
+ #> 3 2 1 2 1 3 15
1907
+ #> 4 2 2 5 4 3 15
1908
+ #> # … with 1 more row
1909
+
1910
+ mutate(df, mean_b = mean(b), sum_b = sum(b))
1911
+ #> # A tibble: 5 x 6
1912
+ #> g1 g2 a b mean_b sum_b
1913
+ #> <dbl> <dbl> <int> <int> <dbl> <int>
1914
+ #> 1 1 1 1 3 3 15
1915
+ #> 2 1 2 4 2 3 15
1916
+ #> 3 2 1 2 1 3 15
1917
+ #> 4 2 2 5 4 3 15
1918
+ #> # … with 1 more row
1919
+ ```
1920
+ In order to solve this problem in R, Hadley needs to introduce some more new functions and notations:
1921
+ 'quo_name' and the ':=' operator from package 'rlang'
1922
+
1923
+ Here is our Ruby code:
1924
+
1925
+ ```{ruby name_change}
1926
+ def my_mutate(df, expr)
1927
+ mean_name = "mean_#{expr.to_s}"
1928
+ sum_name = "sum_#{expr.to_s}"
1929
+
1930
+ df.mutate(mean_name => E.mean(expr),
1931
+ sum_name => E.sum(expr))
1932
+ end
1933
+
1934
+ puts my_mutate((~:df), :a)
1935
+ puts "\n"
1936
+ puts my_mutate((~:df), :b)
1937
+ ```
1938
+ It really seems that "Non Standard Evaluation" is actually quite standard in Galaaz! But, you
1939
+ might have noticed a small change in the way the arguments to the mutate method were called.
1940
+ In a previous example we used df.summarise(mean: E.mean(:a), ...) where the column name was
1941
+ followed by a ':' colon. In this example, we have df.mutate(mean_name => E.mean(expr), ...)
1942
+ and variable mean\_name is not followed by ':' but by '=>'. This is standard Ruby notation.
1943
+
1944
+ [explain....]
1945
+
1946
+ ## Capturing multiple variables
1947
+
1948
+ Moving on with new complexities, Hadley proposes that we solve the problem in which the
1949
+ summarise function will receive any number of grouping variables.
1950
+
1951
+ This again is quite standard Ruby. In order to receive an undefined number of parameters
1952
+ the parameter is preceded by '*':
1953
+
1954
+ ```{ruby multiple_vars}
1955
+ def my_summarise3(df, *group_vars)
1956
+ df.group_by(*group_vars).
1957
+ summarise(a: E.mean(:a))
1958
+ end
1959
+
1960
+ puts my_summarise3((~:df), :g1, :g2)
1961
+ ```
1962
+
1963
+ ## Why does R require NSE and Galaaz does not?
1964
+
1965
+ NSE introduces a number of new concepts, such as 'quoting', 'quasiquotation', 'unquoting' and
1966
+ 'unquote-splicing', while in Galaaz none of those concepts are needed. What gives?
1967
+
1968
+ R is an extremely flexible language and it has lazy evaluation of parameters. When in R a
1969
+ function is called as 'summarise(df, a = b)', the summarise function receives the literal
1970
+ 'a = b' parameter and can work with this as if it were a string. In R, it is not clear what
1971
+ a and b are, they can be expressions or they can be variables, it is up to the function to
1972
+ decide what 'a = b' means.
1973
+
1974
+ In Ruby, there is no lazy evaluation of parameters and 'a' is always a variable and so is 'b'.
1975
+ Variables assume their value as soon as they are used, so 'x = a' is immediately evaluated and
1976
+ variable 'x' will receive the value of variable 'a' as soon as the Ruby statement is executed.
1977
+ Ruby also provides the notion of a symbol; ':a' is a symbol and does not evaluate to anything.
1978
+ Galaaz uses Ruby symbols to build expressions that are not bound to anything: ':a.eq :b' is
1979
+ clearly an expression and has no relationship whatsoever with the statement 'a = b'. By using
1980
+ symbols, variables and expressions all the possible ambiguities that are found in R are
1981
+ eliminated in Galaaz.
1982
+
1983
+ The main problem that remains, is that in R, functions are not clearly documented as what type
1984
+ of input they are expecting, they might be expecting regular variables or they might be
1985
+ expecting expressions and the R function will know how to deal with an input of the form
1986
+ 'a = b', now for the Ruby developer it might not be immediately clear if it should call the
1987
+ function passing the value 'true' if variable 'a' is equal to variable 'b' or if it should
1988
+ call the function passing the expression ':a.eq :b'.
1989
+
1990
+
1991
+ ## Advanced dplyr features
1992
+
1993
+ In the blog: Programming with dplyr by using dplyr (https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/) Iñaki Úcar shows surprise that some R users are trying to code in dplyr avoiding
1994
+ the use of NSE. For instance he says:
1995
+
1996
+ > Take the example of seplyr. It stands for standard evaluation dplyr, and enables us to
1997
+ > program over dplyr without having “to bring in (or study) any deep-theory or
1998
+ > heavy-weight tools such as rlang/tidyeval”.
1999
+
2000
+ For me, there isn't really any surprise that users are trying to avoid dplyr deep-theory. R
2001
+ users frequently are not programmers and learning to code is already hard business, on top
2002
+ of that, having to learn how to 'quote' or 'enquo' or 'quos' or 'enquos' is not necessarily
2003
+ a 'piece of cake'. So much so, that 'tidyeval' has some more advanced functions that instead
2004
+ of using quoted expressions, uses strings as arguments.
2005
+
2006
+ In the following examples, we show the use of functions 'group\_by\_at', 'summarise\_at' and
2007
+ 'rename\_at' that receive strings as argument. The data frame used is 'starwars', which describes
2008
+ features of characters in the Star Wars movies:
2009
+
2010
+ ```{ruby starwars}
2011
+ puts (~:starwars).head
2012
+ ```
2013
+ The grouped_mean function below will receive a grouping variable and calculate summaries for
2014
+ the value\_variables given:
2015
+
2016
+ ```{r grouped_mean}
2017
+ grouped_mean <- function(data, grouping_variables, value_variables) {
2018
+ data %>%
2019
+ group_by_at(grouping_variables) %>%
2020
+ mutate(count = n()) %>%
2021
+ summarise_at(c(value_variables, "count"), mean, na.rm = TRUE) %>%
2022
+ rename_at(value_variables, funs(paste0("mean_", .)))
2023
+ }
2024
+
2025
+ gm = starwars %>%
2026
+ grouped_mean("eye_color", c("mass", "birth_year"))
2027
+
2028
+ as.data.frame(gm)
2029
+ ```
2030
+
2031
+ The same code with Galaaz, becomes:
2032
+
2033
+ ```{ruby advanced_starwars}
2034
+ def grouped_mean(data, grouping_variables, value_variables)
2035
+ data.
2036
+ group_by_at(grouping_variables).
2037
+ mutate(count: E.n).
2038
+ summarise_at(E.c(value_variables, "count"), ~:mean, na__rm: true).
2039
+ rename_at(value_variables, E.funs(E.paste0("mean_", value_variables)))
2040
+ end
2041
+
2042
+ puts grouped_mean((~:starwars), "eye_color", E.c("mass", "birth_year"))
2043
+ ```
2044
+
2045
+
2046
+ [TO BE CONTINUED...]
2047
+
2048
+
2049
+ # Contributing
205
2050
 
206
2051
  * Fork it
207
2052
  * Create your feature branch (git checkout -b my-new-feature)
@@ -210,3 +2055,4 @@ puts gg
210
2055
  * Push to the branch (git push origin my-new-feature)
211
2056
  * Create new Pull Request
212
2057
 
2058
+ # References