galaaz 0.4.7 → 0.4.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1179 -39
  3. data/Rakefile +4 -2
  4. data/bin/grun +1 -1
  5. data/bin/gstudio +1 -1
  6. data/blogs/dev/dev.Rmd +2 -56
  7. data/blogs/dev/dev.md +32 -61
  8. data/blogs/dev/dev2.Rmd +65 -0
  9. data/blogs/dplyr/dplyr.Rmd +29 -0
  10. data/blogs/{dev/dev.html → dplyr/dplyr.html} +88 -57
  11. data/blogs/dplyr/dplyr.md +58 -0
  12. data/blogs/gknit/gknit.html +1262 -25
  13. data/blogs/gknit/gknit.md +471 -27
  14. data/blogs/gknit/gknit_files/figure-html/bubble-1.png +0 -0
  15. data/blogs/manual/graph.rb +29 -0
  16. data/blogs/manual/manual.Rmd +567 -29
  17. data/blogs/manual/manual.html +743 -46
  18. data/blogs/manual/manual.md +1179 -39
  19. data/blogs/nse_dplyr/nse_dplyr.Rmd +466 -11
  20. data/blogs/nse_dplyr/nse_dplyr.html +472 -37
  21. data/blogs/nse_dplyr/nse_dplyr.md +645 -32
  22. data/blogs/ruby_plot/ruby_plot.Rmd +4 -4
  23. data/blogs/ruby_plot/ruby_plot.html +217 -2
  24. data/blogs/ruby_plot/ruby_plot.md +226 -1
  25. data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.png +0 -0
  26. data/blogs/ruby_plot/ruby_plot_files/figure-html/dose_len.svg +2 -2
  27. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.png +0 -0
  28. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_delivery.svg +70 -70
  29. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.png +0 -0
  30. data/blogs/ruby_plot/ruby_plot_files/figure-html/facet_by_dose.svg +72 -72
  31. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.png +0 -0
  32. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color.svg +116 -116
  33. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.png +0 -0
  34. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_by_delivery_color2.svg +176 -176
  35. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_decorations.png +0 -0
  36. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.png +0 -0
  37. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_jitter.svg +236 -236
  38. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.png +0 -0
  39. data/blogs/ruby_plot/ruby_plot_files/figure-html/facets_with_points.svg +176 -176
  40. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.png +0 -0
  41. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_box_plot.svg +160 -160
  42. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.png +0 -0
  43. data/blogs/ruby_plot/ruby_plot_files/figure-html/final_violin_plot.svg +105 -105
  44. data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.png +0 -0
  45. data/blogs/ruby_plot/ruby_plot_files/figure-html/violin_with_jitter.svg +121 -121
  46. data/examples/islr/ch2.spec.rb +1 -1
  47. data/examples/islr/ch3_boston.rb +4 -4
  48. data/examples/islr/x_y_rnorm.jpg +0 -0
  49. data/lib/R_interface/r.rb +1 -1
  50. data/lib/R_interface/r_methods.rb +2 -2
  51. data/lib/R_interface/rdata_frame.rb +8 -5
  52. data/lib/R_interface/rindexed_object.rb +1 -2
  53. data/lib/R_interface/rlist.rb +1 -0
  54. data/lib/R_interface/robject.rb +0 -1
  55. data/lib/R_interface/rpkg.rb +14 -6
  56. data/lib/R_interface/rsupport.rb +7 -9
  57. data/lib/R_interface/ruby_extensions.rb +17 -5
  58. data/lib/gknit/knitr_engine.rb +9 -2
  59. data/lib/util/exec_ruby.rb +2 -2
  60. data/specs/r_dataframe.spec.rb +173 -0
  61. data/specs/r_list.spec.rb +4 -4
  62. data/specs/ruby_expression.spec.rb +2 -11
  63. data/specs/tmp.rb +76 -34
  64. data/version.rb +1 -1
  65. metadata +17 -6
  66. data/blogs/dev/dev_files/figure-html/bubble-1.png +0 -0
  67. data/blogs/dev/dev_files/figure-html/diverging_bar. +0 -0
  68. data/blogs/dev/dev_files/figure-html/diverging_bar.png +0 -0
@@ -283,6 +283,9 @@ img {
283
283
  button.code-folding-btn:focus {
284
284
  outline: none;
285
285
  }
286
+ summary {
287
+ display: list-item;
288
+ }
286
289
  </style>
287
290
 
288
291
 
@@ -290,10 +293,71 @@ button.code-folding-btn:focus {
290
293
  <div class="container-fluid main-container">
291
294
 
292
295
  <!-- tabsets -->
296
+
297
+ <style type="text/css">
298
+ .tabset-dropdown > .nav-tabs {
299
+ display: inline-table;
300
+ max-height: 500px;
301
+ min-height: 44px;
302
+ overflow-y: auto;
303
+ background: white;
304
+ border: 1px solid #ddd;
305
+ border-radius: 4px;
306
+ }
307
+
308
+ .tabset-dropdown > .nav-tabs > li.active:before {
309
+ content: "";
310
+ font-family: 'Glyphicons Halflings';
311
+ display: inline-block;
312
+ padding: 10px;
313
+ border-right: 1px solid #ddd;
314
+ }
315
+
316
+ .tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
317
+ content: "";
318
+ border: none;
319
+ }
320
+
321
+ .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
322
+ content: "";
323
+ font-family: 'Glyphicons Halflings';
324
+ display: inline-block;
325
+ padding: 10px;
326
+ border-right: 1px solid #ddd;
327
+ }
328
+
329
+ .tabset-dropdown > .nav-tabs > li.active {
330
+ display: block;
331
+ }
332
+
333
+ .tabset-dropdown > .nav-tabs > li > a,
334
+ .tabset-dropdown > .nav-tabs > li > a:focus,
335
+ .tabset-dropdown > .nav-tabs > li > a:hover {
336
+ border: none;
337
+ display: inline-block;
338
+ border-radius: 4px;
339
+ }
340
+
341
+ .tabset-dropdown > .nav-tabs.nav-tabs-open > li {
342
+ display: block;
343
+ float: none;
344
+ }
345
+
346
+ .tabset-dropdown > .nav-tabs > li {
347
+ display: none;
348
+ }
349
+ </style>
350
+
293
351
  <script>
294
352
  $(document).ready(function () {
295
353
  window.buildTabsets("TOC");
296
354
  });
355
+
356
+ $(document).ready(function () {
357
+ $('.tabset-dropdown > .nav-tabs > li').click(function () {
358
+ $(this).parent().toggleClass('nav-tabs-open')
359
+ });
360
+ });
297
361
  </script>
298
362
 
299
363
  <!-- code folding -->
@@ -302,7 +366,6 @@ $(document).ready(function () {
302
366
 
303
367
 
304
368
 
305
-
306
369
  <div class="fluid-row" id="header">
307
370
 
308
371
 
@@ -317,6 +380,63 @@ $(document).ready(function () {
317
380
 
318
381
  <div id="introduction" class="section level1">
319
382
  <h1>Introduction</h1>
383
+ <p>In this post we will see how to program with dplyr in Galaaz.</p>
384
+ <div id="but-first-what-is-galaaz" class="section level3">
385
+ <h3>But first, what is Galaaz??</h3>
386
+ <p>Galaaz is a system for tightly coupling Ruby and R. Ruby is a powerful language, with a large community, a very large set of libraries and great for web development. However, it lacks libraries for data science, statistics, scientific plotting and machine learning. On the other hand, R is considered one of the most powerful languages for solving all of the above problems. Maybe the strongest competitor to R is Python with libraries such as NumPy, Pandas, SciPy, SciKit-Learn and a couple more.</p>
387
+ <p>With Galaaz we do not intend to re-implement any of the scientific libraries in R. However, we allow for very tight coupling between the two languages to the point that the Ruby developer does not need to know that there is an R engine running. For this to happen we use new technologies provided by Oracle: GraalVM, TruffleRuby and FastR:</p>
388
+ <pre><code> GraalVM is a universal virtual machine for running applications
389
+ written in JavaScript, Python 3, Ruby, R, JVM-based languages like Java,
390
+ Scala, Kotlin, and LLVM-based languages such as C and C++.
391
+
392
+ GraalVM removes the isolation between programming languages and enables
393
+ interoperability in a shared runtime. It can run either standalone or in
394
+ the context of OpenJDK, Node.js, Oracle Database, or MySQL.
395
+
396
+ GraalVM allows you to write polyglot applications with a seamless way to
397
+ pass values from one language to another. With GraalVM there is no copying
398
+ or marshaling necessary as it is with other polyglot systems. This lets
399
+ you achieve high performance when language boundaries are crossed. Most
400
+ of the time there is no additional cost for crossing a language boundary
401
+ at all.
402
+
403
+ Often developers have to make uncomfortable compromises that require them
404
+ to rewrite their software in other languages. For example:
405
+
406
+ * “That library is not available in my language. I need to rewrite it.”
407
+ * “That language would be the perfect fit for my problem, but we cannot
408
+ run it in our environment.”
409
+ * “That problem is already solved in my language, but the language is
410
+ too slow.”
411
+
412
+ With GraalVM we aim to allow developers to freely choose the right language
413
+ for the task at hand without making compromises.</code></pre>
414
+ <p>Interested readers should also check out the following sites:</p>
415
+ <ul>
416
+ <li><a href="https://www.graalvm.org/">GraalVM Home</a></li>
417
+ <li><a href="https://github.com/oracle/truffleruby">TruffleRuby</a></li>
418
+ <li><a href="https://github.com/oracle/fastr">FastR</a></li>
419
+ <li><a href="https://medium.com/graalvm/faster-r-with-fastr-4b8db0e0dceb">Faster R with FastR</a></li>
420
+ </ul>
421
+ </div>
422
+ <div id="now-to-programming-with-dplyr" class="section level3">
423
+ <h3>Now to programming with dplyr</h3>
424
+ <p>According to Hadley (<a href="https://dplyr.tidyverse.org/articles/programming.html" class="uri">https://dplyr.tidyverse.org/articles/programming.html</a>)</p>
425
+ <blockquote>
426
+ <p>Most dplyr functions use non-standard evaluation (NSE). This is a catch-all term that means they don’t follow the usual R rules of evaluation. Instead, they capture the expression that you typed and evaluate it in a custom way. This has two main benefits for dplyr code:</p>
427
+ </blockquote>
428
+ <blockquote>
429
+ <p>Operations on data frames can be expressed succinctly because you don’t need to repeat the name of the data frame. For example, you can write filter(df, x == 1, y == 2, z == 3) instead of df[df$x == 1 &amp; df$y == 2 &amp; df$z == 3, ].</p>
430
+ </blockquote>
431
+ <blockquote>
432
+ <p>dplyr can choose to compute results in a different way to base R. This is important for database backends because dplyr itself doesn’t do any work, but instead generates the SQL that tells the database what to do.</p>
433
+ </blockquote>
434
+ <blockquote>
435
+ <p>Unfortunately these benefits do not come for free. There are two main drawbacks:</p>
436
+ </blockquote>
437
+ <blockquote>
438
+ <p>Most dplyr arguments are not referentially transparent. That means you can’t replace a value with a seemingly equivalent object that you’ve defined elsewhere. In other words, this code:</p>
439
+ </blockquote>
320
440
  <pre class="r"><code>df &lt;- data.frame(x = 1:3, y = 3:1)
321
441
  print(df)</code></pre>
322
442
  <pre><code>## x y
@@ -330,63 +450,378 @@ print(df)</code></pre>
330
450
  #&gt; x y
331
451
  #&gt; &lt;int&gt; &lt;int&gt;
332
452
  #&gt; 1 1 3</code></pre>
453
+ <blockquote>
454
+ <p>Is not equivalent to this code:</p>
455
+ </blockquote>
333
456
  <pre class="r"><code>my_var &lt;- x
334
457
  #&gt; Error in eval(expr, envir, enclos): object 'x' not found
335
458
  filter(df, my_var == 1)
336
459
  #&gt; Error: object 'my_var' not found</code></pre>
460
+ <blockquote>
461
+ <p>This makes it hard to create functions with arguments that change how dplyr verbs are computed.</p>
462
+ </blockquote>
463
+ </div>
464
+ </div>
465
+ <div id="writing-expressions-in-galaaz" class="section level1">
466
+ <h1>Writing Expressions in Galaaz</h1>
467
+ <p>Galaaz extends Ruby to work with complex expressions, similar to R’s expressions built with ‘quote’ (base R) or ‘quo’ (tidyverse). Let’s take a look at some of those expressions.</p>
468
+ <div id="expressions-from-operators" class="section level2">
469
+ <h2>Expressions from operators</h2>
470
+ <p>The code below creates an expression summing two symbols</p>
471
+ <pre class="ruby"><code>exp1 = :a + :b
472
+ puts exp1</code></pre>
473
+ <pre><code>## a + b</code></pre>
474
+ <p>We can build any complex mathematical expression</p>
475
+ <pre class="ruby"><code>exp2 = (:a + :b) * 2.0 + :c ** 2 / :z
476
+ puts exp2</code></pre>
477
+ <pre><code>## (a + b) * 2 + c^2L/z</code></pre>
478
+ <p>It is also possible to use inequality operators in building expressions</p>
479
+ <pre class="ruby"><code>exp3 = (:a + :b) &gt;= :z
480
+ puts exp3</code></pre>
481
+ <pre><code>## a + b &gt;= z</code></pre>
482
+ <p>Galaaz provides both symbolic representations for operators, such as (&gt;, &lt;, !=), and functional notation for those operators, such as (.gt, .ge, etc.). So the same expression written above can also be written as</p>
483
+ <pre class="ruby"><code>exp4 = (:a + :b).ge :z
484
+ puts exp4</code></pre>
485
+ <pre><code>## a + b &gt;= z</code></pre>
486
+ <p>Two types of expressions can only be created with the functional representation of the operators: those involving ‘==’ and ‘=’. In order to write an expression involving ‘==’ we need to use the method ‘.eq’ and for ‘=’ we need the function ‘.assign’</p>
487
+ <pre class="ruby"><code>exp5 = (:a + :b).eq :z
488
+ puts exp5</code></pre>
489
+ <pre><code>## a + b == z</code></pre>
490
+ <pre class="ruby"><code>exp6 = :y.assign :a + :b
491
+ puts exp6</code></pre>
492
+ <pre><code>## y &lt;- a + b</code></pre>
493
+ <p>In general we think that using the functional notation is preferable to using the symbolic notation as otherwise, we end up writing invalid expressions such as</p>
494
+ <pre class="ruby"><code>exp_wrong = (:a + :b) == :z
495
+ puts exp_wrong</code></pre>
496
+ <pre><code>## Message:
497
+ ## Error in function (x, y, num.eq = TRUE, single.NA = TRUE, attrib.as.set = TRUE, :
498
+ ## object 'a' not found (RError)
499
+ ## Translated to internal error</code></pre>
500
+ <p>and it might be difficult to understand what is going on here. The problem lies with the fact that when using ‘==’ we are comparing expression (:a + :b) to expression :z with ‘==’. When the comparison is executed, the system tries to evaluate :a, :b and :z, and those symbols, at this time, are not bound to anything and we get an “object ‘a’ not found” message.<br />
501
+ If we only use functional notation, this type of error will never occur.</p>
502
+ </div>
503
+ <div id="expressions-with-r-methods" class="section level2">
504
+ <h2>Expressions with R methods</h2>
505
+ <p>It is often necessary to create an expression that uses a method or function. For instance, in mathematics, it’s quite natural to write an expression such as <span class="math inline">\(y = sin(x)\)</span>. In this case, the ‘sin’ function is part of the expression and should not be immediately executed. Now, let’s say that ‘x’ is an angle of 45<span class="math inline">\(^\circ\)</span> and we actually want our expression to be <span class="math inline">\(y = 0.850...\)</span>. When we want the function to be part of the expression, we call the function preceding it by the letter E, such as ‘E.sin(x)’</p>
506
+ <pre class="ruby"><code>exp7 = :y.assign E.sin(:x)
507
+ puts exp7</code></pre>
508
+ <pre><code>## y &lt;- sin(x)</code></pre>
509
+ <p>However, if we want the function to be evaluated, then we use the normal call to function with R as ‘R.sin(x)’.</p>
510
+ <pre class="ruby"><code>x = 45
511
+ exp8 = :y.assign R.sin(x)
512
+ puts exp8</code></pre>
513
+ <pre><code>## y &lt;- 0.850903524534118</code></pre>
514
+ </div>
515
+ </div>
516
+ <div id="filtering-using-expressions" class="section level1">
517
+ <h1>Filtering using expressions</h1>
518
+ <p>Now that we know how to write expressions, we can use them to filter a data frame by expressions.<br />
519
+ Let’s first start by creating a simple data frame with two columns named ‘x’ and ‘y’</p>
337
520
  <pre class="ruby"><code>@df = R.data__frame(x: (1..3), y: (3..1))
338
- puts @df
339
-
340
- puts @df.filter(:x.eq 1)</code></pre>
521
+ puts @df</code></pre>
341
522
  <pre><code>## x y
342
523
  ## 1 1 3
343
524
  ## 2 2 2
344
- ## 3 3 1
345
- ## x y
525
+ ## 3 3 1</code></pre>
526
+ <p>In the code below we want to filter the data frame by rows in which the value of ‘x’ is equal to 1.</p>
527
+ <pre class="ruby"><code>puts @df.filter(:x.eq 1)</code></pre>
528
+ <pre><code>## x y
346
529
  ## 1 1 3</code></pre>
530
+ <p>In R, and when coding with ‘tidyverse’, arguments to a function are usually not <em>referentially transparent</em>. That is, you can’t replace a value with a seemingly equivalent object that you’ve defined elsewhere. In other words, this code</p>
531
+ <pre class="r"><code>my_var &lt;- x
532
+ filter(df, my_var == 1)</code></pre>
533
+ <p>Generates the following error: “object ‘x’ not found”.</p>
534
+ <p>However, in Ruby and Galaaz, arguments are referentially transparent as can be seen by the code below. Note, initially, that ‘my_var = :x’ will not give the error “object ‘x’ not found” since ‘:x’ is treated as an expression and assigned to my_var. Then when doing (my_var.eq 1), my_var is a variable that resolves to ‘:x’ and it becomes equivalent to (:x.eq 1) which is what we want.</p>
347
535
  <pre class="ruby"><code>my_var = :x
348
536
  puts @df.filter(my_var.eq 1)</code></pre>
349
537
  <pre><code>## x y
350
538
  ## 1 1 3</code></pre>
351
- <blockquote>
539
+ <p>As stated by Hadley</p>
352
540
  <blockquote>
353
541
  <p>dplyr code is ambiguous. Depending on what variables are defined where, filter(df, x == y) could be equivalent to any of:</p>
354
542
  </blockquote>
355
- </blockquote>
356
543
  <pre><code>df[df$x == df$y, ]
357
544
  df[df$x == y, ]
358
545
  df[x == df$y, ]
359
546
  df[x == y, ]</code></pre>
360
- <p>In galaaz this ambiguity does not exist</p>
361
- <pre class="ruby"><code>y = 2
362
- x = 2
363
-
364
- @df[:x.eq :y, :all]
365
- @df[:x.eq y, :all]
366
- # @df[x.eq :y, :all]
367
- # @df[x == y, :all]</code></pre>
368
- <pre><code>## Message:
369
- ## wrong number of arguments (given 2, expected 1)</code></pre>
547
+ <p>In galaaz this ambiguity does not exist: filter(df, x.eq y) is not a valid expression as expressions are built with symbols. In doing filter(df, :x.eq y) we are looking for elements of the ‘x’ column that are equal to a previously defined y variable. Finally, in filter(df, :x.eq :y) we are looking for elements in which the ‘x’ column value is equal to the ‘y’ column value. This can be seen in the following two chunks of code:</p>
548
+ <pre class="ruby"><code>@y = 1
549
+ @x = 2
550
+
551
+ # looking for values where the 'x' column is equal to the 'y' column
552
+ puts @df.filter(:x.eq :y)</code></pre>
553
+ <pre><code>## x y
554
+ ## 1 2 2</code></pre>
555
+ <pre class="ruby"><code># looking for values where the 'x' column is equal to the 'y' variable
556
+ # in this case, the number 1
557
+ puts @df.filter(:x.eq @y)</code></pre>
558
+ <pre><code>## x y
559
+ ## 1 1 3</code></pre>
560
+ </div>
561
+ <div id="writing-a-function-that-applies-to-different-data-sets" class="section level1">
562
+ <h1>Writing a function that applies to different data sets</h1>
563
+ <pre><code>mutate(df1, y = a + x)
564
+ mutate(df2, y = a + x)
565
+ mutate(df3, y = a + x)
566
+ mutate(df4, y = a + x)</code></pre>
567
+ <p>Here we create a mutate_y Ruby method.</p>
568
+ <pre class="ruby"><code>def mutate_y(df)
569
+ df.mutate(:y.assign :a + :x)
570
+ end</code></pre>
571
+ <p>Note that, contrary to what happens in R, method mutate_y will fail regardless of whether variable ‘a’ is defined or not.</p>
572
+ <pre class="ruby"><code>df1 = R.data__frame(x: (1..3))
573
+ puts df1
574
+ a = 10
575
+ mutate_y(df1)</code></pre>
370
576
  <pre><code>## Message:
371
- ## /home/rbotafogo/desenv/galaaz/lib/R_interface/rbinary_operators.rb:134:in `eq'
372
- ## (eval):4:in `exec_ruby'
373
- ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:138:in `instance_eval'
374
- ## /home/rbotafogo/desenv/galaaz/lib/util/exec_ruby.rb:138:in `exec_ruby'
375
- ## /home/rbotafogo/desenv/galaaz/lib/gknit/knitr_engine.rb:650:in `block in initialize'
376
- ## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `call'
377
- ## /home/rbotafogo/desenv/galaaz/lib/R_interface/ruby_callback.rb:77:in `callback'
378
- ## (eval):3:in `function(...) {\n rb_method(...)'
379
- ## unknown.r:1:in `in_dir'
380
- ## unknown.r:1:in `block_exec'
381
- ## /home/rbotafogo/lib/graalvm-ce-1.0.0-rc12/jre/languages/R/library/knitr/R/block.R:91:in `call_block'
382
- ## /home/rbotafogo/lib/graalvm-ce-1.0.0-rc12/jre/languages/R/library/knitr/R/block.R:6:in `process_group.block'
383
- ## /home/rbotafogo/lib/graalvm-ce-1.0.0-rc12/jre/languages/R/library/knitr/R/block.R:3:in `&lt;no source&gt;'
384
- ## unknown.r:1:in `withCallingHandlers'
385
- ## unknown.r:1:in `process_file'
386
- ## unknown.r:1:in `&lt;no source&gt;'
387
- ## unknown.r:1:in `&lt;no source&gt;'
388
- ## &lt;REPL&gt;:5:in `&lt;repl wrapper&gt;'
389
- ## &lt;REPL&gt;:1</code></pre>
577
+ ## Error in mutate_impl(.data, dots) :
578
+ ## Evaluation error: object 'a' not found.
579
+ ## In addition: Warning message:
580
+ ## In mutate_impl(.data, dots) :
581
+ ## mismatched protect/unprotect (unprotect with empty protect stack) (RError)
582
+ ## Translated to internal error</code></pre>
583
+ </div>
584
+ <div id="different-expressions" class="section level1">
585
+ <h1>Different expressions</h1>
586
+ <pre class="r"><code>df &lt;- data.frame(
587
+ g1 = c(1, 1, 2, 2, 2),
588
+ g2 = c(1, 2, 1, 2, 1),
589
+ a = sample(5),
590
+ b = sample(5)
591
+ )
592
+
593
+ d2 &lt;- df %&gt;%
594
+ group_by(g1) %&gt;%
595
+ summarise(a = mean(a))
596
+
597
+ as.data.frame(d2) </code></pre>
598
+ <pre><code>## g1 a
599
+ ## 1 1 3
600
+ ## 2 2 3</code></pre>
601
+ <pre class="r"><code>d2 &lt;- df %&gt;%
602
+ group_by(g2) %&gt;%
603
+ summarise(a = mean(a))
604
+
605
+ as.data.frame(d2) </code></pre>
606
+ <pre><code>## g2 a
607
+ ## 1 1 3.666667
608
+ ## 2 2 2.000000</code></pre>
609
+ <p>Trying to write a function in R that will receive two arguments, the first a variable and the second an expression, is not trivial. As shown by Hadley, one might expect this function to do the trick:</p>
610
+ <pre class="r"><code>my_summarise &lt;- function(df, group_var) {
611
+ df %&gt;%
612
+ group_by(group_var) %&gt;%
613
+ summarise(a = mean(a))
614
+ }
615
+
616
+ # my_summarise(df, g1)
617
+ #&gt; Error: Column `group_var` is unknown</code></pre>
618
+ <p>In order to solve this problem, coding with dplyr requires the introduction of many new concepts and functions such as ‘quo’, ‘quos’, ‘enquo’, ‘enquos’, ‘!!’ (bang bang), ‘!!!’ (triple bang).</p>
619
+ <p>Now, let’s try to implement the same function in galaaz. The next code block first prints the ‘df’ data frame defined previously in R, then creates the my_summarize function and calls it passing the R data frame and the group by variable ‘:g1’</p>
620
+ <pre class="ruby"><code>puts ~:df
621
+ print &quot;\n&quot;
622
+
623
+ def my_summarize(df, group_var)
624
+ df.group_by(group_var).
625
+ summarize(a: E.mean(:a))
626
+ end
627
+
628
+ puts my_summarize((~:df), :g1).as__data__frame</code></pre>
629
+ <pre><code>## g1 g2 a b
630
+ ## 1 1 1 5 2
631
+ ## 2 1 2 1 5
632
+ ## 3 2 1 2 4
633
+ ## 4 2 2 3 1
634
+ ## 5 2 1 4 3
635
+ ##
636
+ ## g1 a
637
+ ## 1 1 3
638
+ ## 2 2 3</code></pre>
639
+ <p>It works!!! Well let’s make sure this was not just some coincidence</p>
640
+ <pre class="ruby"><code>puts my_summarize((~:df), :g2).as__data__frame</code></pre>
641
+ <pre><code>## g2 a
642
+ ## 1 1 3.666667
643
+ ## 2 2 2.000000</code></pre>
644
+ <p>Great, everything is fine! No magic, no new functions, no complexities, just normal, standard Ruby code. If you’ve ever done NSE in R, this certainly feels much safer and easy to implement.</p>
645
+ </div>
646
+ <div id="different-input-variables" class="section level1">
647
+ <h1>Different input variables</h1>
648
+ <p>In the previous section we’ve managed to get rid of all NSE formulation for a simple example, but does this remain true for more complex examples, or will the Ruby way prove impractical for more complex code?</p>
649
+ <p>In the next example Hadley asks us to write a function that, given an expression such as ‘a’ or ‘a * b’, calculates three summaries. What we want is a function that does the same as these R statements:</p>
650
+ <pre><code>summarise(df, mean = mean(a), sum = sum(a), n = n())
651
+ #&gt; # A tibble: 1 x 3
652
+ #&gt; mean sum n
653
+ #&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt;
654
+ #&gt; 1 3 15 5
655
+
656
+ summarise(df, mean = mean(a * b), sum = sum(a * b), n = n())
657
+ #&gt; # A tibble: 1 x 3
658
+ #&gt; mean sum n
659
+ #&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt;
660
+ #&gt; 1 9.6 48 5</code></pre>
661
+ <p>Let’s try it in galaaz:</p>
662
+ <pre class="ruby"><code>def my_summarise2(df, expr)
663
+ df.summarize(
664
+ mean: E.mean(expr),
665
+ sum: E.sum(expr),
666
+ n: E.n
667
+ )
668
+ end
669
+
670
+ puts my_summarise2((~:df), :a)
671
+ puts my_summarise2((~:df), :a * :b)</code></pre>
672
+ <pre><code>## mean sum n
673
+ ## 1 3 15 5
674
+ ## mean sum n
675
+ ## 1 7.6 38 5</code></pre>
676
+ <p>Once again, there is no need to use any special theory or functions. The only point to be careful about is the use of ‘E’ to build an expression that uses the mean, sum and n.</p>
677
+ </div>
678
+ <div id="different-input-and-output-variable" class="section level1">
679
+ <h1>Different input and output variable</h1>
680
+ <p>Now the next challenge presented by Hadley is to vary the name of the output variables based on the received expression. So, if the input expression is ‘a’, we want our data frame columns to be named ‘mean_a’ and ‘sum_a’. Now, if the input expression is ‘b’, columns should be named ‘mean_b’ and ‘sum_b’.</p>
681
+ <pre><code>mutate(df, mean_a = mean(a), sum_a = sum(a))
682
+ #&gt; # A tibble: 5 x 6
683
+ #&gt; g1 g2 a b mean_a sum_a
684
+ #&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt;
685
+ #&gt; 1 1 1 1 3 3 15
686
+ #&gt; 2 1 2 4 2 3 15
687
+ #&gt; 3 2 1 2 1 3 15
688
+ #&gt; 4 2 2 5 4 3 15
689
+ #&gt; # … with 1 more row
690
+
691
+ mutate(df, mean_b = mean(b), sum_b = sum(b))
692
+ #&gt; # A tibble: 5 x 6
693
+ #&gt; g1 g2 a b mean_b sum_b
694
+ #&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt;
695
+ #&gt; 1 1 1 1 3 3 15
696
+ #&gt; 2 1 2 4 2 3 15
697
+ #&gt; 3 2 1 2 1 3 15
698
+ #&gt; 4 2 2 5 4 3 15
699
+ #&gt; # … with 1 more row</code></pre>
700
+ <p>Here is our Ruby code</p>
701
+ <pre class="ruby"><code>def my_mutate(df, expr)
702
+ mean_name = &quot;mean_#{expr.to_s}&quot;
703
+ sum_name = &quot;sum_#{expr.to_s}&quot;
704
+
705
+ df.mutate(mean_name =&gt; E.mean(expr),
706
+ sum_name =&gt; E.sum(expr))
707
+ end
708
+
709
+ puts my_mutate((~:df), :a)
710
+ puts my_mutate((~:df), :b)</code></pre>
711
+ <pre><code>## g1 g2 a b mean_a sum_a
712
+ ## 1 1 1 5 2 3 15
713
+ ## 2 1 2 1 5 3 15
714
+ ## 3 2 1 2 4 3 15
715
+ ## 4 2 2 3 1 3 15
716
+ ## 5 2 1 4 3 3 15
717
+ ## g1 g2 a b mean_b sum_b
718
+ ## 1 1 1 5 2 3 15
719
+ ## 2 1 2 1 5 3 15
720
+ ## 3 2 1 2 4 3 15
721
+ ## 4 2 2 3 1 3 15
722
+ ## 5 2 1 4 3 3 15</code></pre>
723
+ <p>It really seems that “Non Standard Evaluation” is actually quite standard in Galaaz! But, you might have noticed a small change in the way the arguments to the mutate method were called. In a previous example we used df.summarise(mean: E.mean(:a), …) where the column name was followed by a ‘:’ colon. In this example, we have df.mutate(mean_name =&gt; E.mean(expr), …) and variable mean_name is not followed by ‘:’ but by ‘=&gt;’. This is standard Ruby notation.</p>
724
+ <p>[explain….]</p>
725
+ </div>
726
+ <div id="capturing-multiple-variables" class="section level1">
727
+ <h1>Capturing multiple variables</h1>
728
+ <pre class="ruby"><code>def my_summarise3(df, *group_vars)
729
+ df.group_by(*group_vars).
730
+ summarise(a: E.mean(:a))
731
+ end
732
+
733
+ puts my_summarise3((~:df), :g1, :g2).as__data__frame</code></pre>
734
+ <pre><code>## g1 g2 a
735
+ ## 1 1 1 5
736
+ ## 2 1 2 1
737
+ ## 3 2 1 3
738
+ ## 4 2 2 3</code></pre>
739
+ </div>
740
+ <div id="advanced-dplyr-features" class="section level1">
741
+ <h1>Advanced dplyr features</h1>
742
+ <p><a href="https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/" class="uri">https://www.r-bloggers.com/programming-with-dplyr-by-using-dplyr/</a></p>
743
+ <pre class="ruby"><code>puts (~:starwars).head.as__data__frame</code></pre>
744
+ <pre><code>## name height mass hair_color skin_color eye_color birth_year
745
+ ## 1 Luke Skywalker 172 77 blond fair blue 19.0
746
+ ## 2 C-3PO 167 75 &lt;NA&gt; gold yellow 112.0
747
+ ## 3 R2-D2 96 32 &lt;NA&gt; white, blue red 33.0
748
+ ## 4 Darth Vader 202 136 none white yellow 41.9
749
+ ## 5 Leia Organa 150 49 brown light brown 19.0
750
+ ## 6 Owen Lars 178 120 brown, grey light blue 52.0
751
+ ## gender homeworld species
752
+ ## 1 male Tatooine Human
753
+ ## 2 &lt;NA&gt; Tatooine Droid
754
+ ## 3 &lt;NA&gt; Naboo Droid
755
+ ## 4 male Tatooine Human
756
+ ## 5 female Alderaan Human
757
+ ## 6 male Tatooine Human
758
+ ## films
759
+ ## 1 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
760
+ ## 2 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
761
+ ## 3 Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
762
+ ## 4 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope
763
+ ## 5 Revenge of the Sith, Return of the Jedi, The Empire Strikes Back, A New Hope, The Force Awakens
764
+ ## 6 Attack of the Clones, Revenge of the Sith, A New Hope
765
+ ## vehicles starships
766
+ ## 1 Snowspeeder, Imperial Speeder Bike X-wing, Imperial shuttle
767
+ ## 2
768
+ ## 3
769
+ ## 4 TIE Advanced x1
770
+ ## 5 Imperial Speeder Bike
771
+ ## 6</code></pre>
772
+ <pre class="r"><code>grouped_mean &lt;- function(data, grouping_variables, value_variables) {
773
+ data %&gt;%
774
+ group_by_at(grouping_variables) %&gt;%
775
+ mutate(count = n()) %&gt;%
776
+ summarise_at(c(value_variables, &quot;count&quot;), mean, na.rm = TRUE) %&gt;%
777
+ rename_at(value_variables, funs(paste0(&quot;mean_&quot;, .)))
778
+ }
779
+
780
+ gm = starwars %&gt;%
781
+ grouped_mean(&quot;eye_color&quot;, c(&quot;mass&quot;, &quot;birth_year&quot;))
782
+
783
+ as.data.frame(gm) </code></pre>
784
+ <pre><code>## eye_color mean_mass mean_birth_year count
785
+ ## 1 black 76.28571 33.00000 10
786
+ ## 2 blue 86.51667 67.06923 19
787
+ ## 3 blue-gray 77.00000 57.00000 1
788
+ ## 4 brown 66.09231 108.96429 21
789
+ ## 5 dark NaN NaN 1
790
+ ## 6 gold NaN NaN 1
791
+ ## 7 green, yellow 159.00000 NaN 1
792
+ ## 8 hazel 66.00000 34.50000 3
793
+ ## 9 orange 282.33333 231.00000 8
794
+ ## 10 pink NaN NaN 1
795
+ ## 11 red 81.40000 33.66667 5
796
+ ## 12 red, blue NaN NaN 1
797
+ ## 13 unknown 31.50000 NaN 3
798
+ ## 14 white 48.00000 NaN 1
799
+ ## 15 yellow 81.11111 76.38000 11</code></pre>
800
+ <pre class="ruby"><code>def grouped_mean(data, grouping_variables, value_variables)
801
+ data.
802
+ group_by_at(grouping_variables).
803
+ mutate(count: E.n).
804
+ summarise_at(E.c(value_variables, &quot;count&quot;), ~:mean, na__rm: true).
805
+ rename_at(value_variables, R.funs(E.paste0(&quot;mean_&quot;, value_variables)))
806
+ end
807
+
808
+ puts grouped_mean((~:starwars), &quot;eye_color&quot;, R.c(&quot;mass&quot;, &quot;birth_year&quot;)).as__data__frame</code></pre>
809
+ <pre><code>## eye_color mean_mass mean_birth_year count
810
+ ## 1 black 76.28571 33.00000 10
811
+ ## 2 blue 86.51667 67.06923 19
812
+ ## 3 blue-gray 77.00000 57.00000 1
813
+ ## 4 brown 66.09231 108.96429 21
814
+ ## 5 dark NaN NaN 1
815
+ ## 6 gold NaN NaN 1
816
+ ## 7 green, yellow 159.00000 NaN 1
817
+ ## 8 hazel 66.00000 34.50000 3
818
+ ## 9 orange 282.33333 231.00000 8
819
+ ## 10 pink NaN NaN 1
820
+ ## 11 red 81.40000 33.66667 5
821
+ ## 12 red, blue NaN NaN 1
822
+ ## 13 unknown 31.50000 NaN 3
823
+ ## 14 white 48.00000 NaN 1
824
+ ## 15 yellow 81.11111 76.38000 11</code></pre>
390
825
  </div>
391
826
 
392
827