BioDSL 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (186) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/BioDSL.gemspec +1 -1
  4. data/Gemfile +6 -0
  5. data/README.md +289 -155
  6. data/Rakefile +18 -16
  7. data/lib/BioDSL.rb +1 -1
  8. data/lib/BioDSL/cary.rb +78 -53
  9. data/lib/BioDSL/command.rb +2 -2
  10. data/lib/BioDSL/commands.rb +1 -1
  11. data/lib/BioDSL/commands/add_key.rb +1 -1
  12. data/lib/BioDSL/commands/align_seq_mothur.rb +4 -4
  13. data/lib/BioDSL/commands/analyze_residue_distribution.rb +5 -5
  14. data/lib/BioDSL/commands/assemble_pairs.rb +13 -13
  15. data/lib/BioDSL/commands/assemble_seq_idba.rb +7 -9
  16. data/lib/BioDSL/commands/assemble_seq_ray.rb +13 -13
  17. data/lib/BioDSL/commands/assemble_seq_spades.rb +4 -4
  18. data/lib/BioDSL/commands/classify_seq.rb +8 -8
  19. data/lib/BioDSL/commands/classify_seq_mothur.rb +5 -5
  20. data/lib/BioDSL/commands/clip_primer.rb +7 -7
  21. data/lib/BioDSL/commands/cluster_otus.rb +5 -5
  22. data/lib/BioDSL/commands/collapse_otus.rb +2 -2
  23. data/lib/BioDSL/commands/collect_otus.rb +2 -2
  24. data/lib/BioDSL/commands/complement_seq.rb +4 -4
  25. data/lib/BioDSL/commands/count.rb +1 -1
  26. data/lib/BioDSL/commands/count_values.rb +2 -2
  27. data/lib/BioDSL/commands/degap_seq.rb +6 -7
  28. data/lib/BioDSL/commands/dereplicate_seq.rb +1 -1
  29. data/lib/BioDSL/commands/dump.rb +2 -2
  30. data/lib/BioDSL/commands/filter_rrna.rb +4 -4
  31. data/lib/BioDSL/commands/genecall.rb +7 -7
  32. data/lib/BioDSL/commands/grab.rb +1 -1
  33. data/lib/BioDSL/commands/index_taxonomy.rb +3 -3
  34. data/lib/BioDSL/commands/mask_seq.rb +4 -4
  35. data/lib/BioDSL/commands/mean_scores.rb +2 -2
  36. data/lib/BioDSL/commands/merge_pair_seq.rb +3 -3
  37. data/lib/BioDSL/commands/merge_table.rb +1 -1
  38. data/lib/BioDSL/commands/merge_values.rb +1 -1
  39. data/lib/BioDSL/commands/plot_heatmap.rb +4 -5
  40. data/lib/BioDSL/commands/plot_histogram.rb +4 -4
  41. data/lib/BioDSL/commands/plot_matches.rb +5 -5
  42. data/lib/BioDSL/commands/plot_residue_distribution.rb +6 -6
  43. data/lib/BioDSL/commands/plot_scores.rb +7 -7
  44. data/lib/BioDSL/commands/random.rb +1 -1
  45. data/lib/BioDSL/commands/read_fasta.rb +9 -9
  46. data/lib/BioDSL/commands/read_fastq.rb +16 -16
  47. data/lib/BioDSL/commands/read_table.rb +2 -3
  48. data/lib/BioDSL/commands/reverse_seq.rb +4 -4
  49. data/lib/BioDSL/commands/slice_align.rb +4 -4
  50. data/lib/BioDSL/commands/slice_seq.rb +3 -3
  51. data/lib/BioDSL/commands/sort.rb +1 -1
  52. data/lib/BioDSL/commands/split_pair_seq.rb +6 -7
  53. data/lib/BioDSL/commands/split_values.rb +2 -2
  54. data/lib/BioDSL/commands/trim_primer.rb +13 -8
  55. data/lib/BioDSL/commands/trim_seq.rb +5 -5
  56. data/lib/BioDSL/commands/uchime_ref.rb +6 -6
  57. data/lib/BioDSL/commands/uclust.rb +5 -5
  58. data/lib/BioDSL/commands/unique_values.rb +1 -1
  59. data/lib/BioDSL/commands/usearch_global.rb +2 -2
  60. data/lib/BioDSL/commands/usearch_local.rb +2 -2
  61. data/lib/BioDSL/commands/write_fasta.rb +7 -9
  62. data/lib/BioDSL/commands/write_fastq.rb +4 -4
  63. data/lib/BioDSL/commands/write_table.rb +3 -3
  64. data/lib/BioDSL/commands/write_tree.rb +2 -3
  65. data/lib/BioDSL/config.rb +2 -2
  66. data/lib/BioDSL/csv.rb +8 -10
  67. data/lib/BioDSL/debug.rb +1 -1
  68. data/lib/BioDSL/fasta.rb +54 -40
  69. data/lib/BioDSL/fastq.rb +35 -32
  70. data/lib/BioDSL/filesys.rb +56 -47
  71. data/lib/BioDSL/fork.rb +1 -1
  72. data/lib/BioDSL/hamming.rb +1 -1
  73. data/lib/BioDSL/helpers.rb +1 -1
  74. data/lib/BioDSL/helpers/aux_helper.rb +1 -1
  75. data/lib/BioDSL/helpers/email_helper.rb +1 -1
  76. data/lib/BioDSL/helpers/history_helper.rb +1 -1
  77. data/lib/BioDSL/helpers/log_helper.rb +1 -1
  78. data/lib/BioDSL/helpers/options_helper.rb +1 -1
  79. data/lib/BioDSL/helpers/status_helper.rb +1 -1
  80. data/lib/BioDSL/html_report.rb +1 -1
  81. data/lib/BioDSL/math.rb +1 -1
  82. data/lib/BioDSL/mummer.rb +1 -1
  83. data/lib/BioDSL/pipeline.rb +1 -1
  84. data/lib/BioDSL/seq.rb +240 -231
  85. data/lib/BioDSL/seq/ambiguity.rb +1 -1
  86. data/lib/BioDSL/seq/assemble.rb +1 -1
  87. data/lib/BioDSL/seq/backtrack.rb +93 -76
  88. data/lib/BioDSL/seq/digest.rb +1 -1
  89. data/lib/BioDSL/seq/dynamic.rb +43 -55
  90. data/lib/BioDSL/seq/homopolymer.rb +34 -36
  91. data/lib/BioDSL/seq/kmer.rb +67 -50
  92. data/lib/BioDSL/seq/levenshtein.rb +35 -40
  93. data/lib/BioDSL/seq/translate.rb +64 -55
  94. data/lib/BioDSL/seq/trim.rb +60 -50
  95. data/lib/BioDSL/serializer.rb +1 -1
  96. data/lib/BioDSL/stream.rb +1 -1
  97. data/lib/BioDSL/taxonomy.rb +1 -1
  98. data/lib/BioDSL/test.rb +1 -1
  99. data/lib/BioDSL/tmp_dir.rb +1 -1
  100. data/lib/BioDSL/usearch.rb +1 -1
  101. data/lib/BioDSL/verbose.rb +1 -1
  102. data/lib/BioDSL/version.rb +2 -2
  103. data/test/BioDSL/commands/test_add_key.rb +1 -1
  104. data/test/BioDSL/commands/test_align_seq_mothur.rb +1 -1
  105. data/test/BioDSL/commands/test_analyze_residue_distribution.rb +1 -1
  106. data/test/BioDSL/commands/test_assemble_pairs.rb +1 -1
  107. data/test/BioDSL/commands/test_assemble_seq_idba.rb +1 -1
  108. data/test/BioDSL/commands/test_assemble_seq_ray.rb +1 -1
  109. data/test/BioDSL/commands/test_assemble_seq_spades.rb +1 -1
  110. data/test/BioDSL/commands/test_classify_seq.rb +1 -1
  111. data/test/BioDSL/commands/test_classify_seq_mothur.rb +1 -1
  112. data/test/BioDSL/commands/test_clip_primer.rb +1 -1
  113. data/test/BioDSL/commands/test_cluster_otus.rb +1 -1
  114. data/test/BioDSL/commands/test_collapse_otus.rb +1 -1
  115. data/test/BioDSL/commands/test_collect_otus.rb +1 -1
  116. data/test/BioDSL/commands/test_complement_seq.rb +1 -1
  117. data/test/BioDSL/commands/test_count.rb +1 -1
  118. data/test/BioDSL/commands/test_count_values.rb +1 -1
  119. data/test/BioDSL/commands/test_degap_seq.rb +1 -1
  120. data/test/BioDSL/commands/test_dereplicate_seq.rb +1 -1
  121. data/test/BioDSL/commands/test_dump.rb +1 -1
  122. data/test/BioDSL/commands/test_filter_rrna.rb +1 -1
  123. data/test/BioDSL/commands/test_genecall.rb +1 -1
  124. data/test/BioDSL/commands/test_grab.rb +1 -1
  125. data/test/BioDSL/commands/test_index_taxonomy.rb +1 -1
  126. data/test/BioDSL/commands/test_mask_seq.rb +1 -1
  127. data/test/BioDSL/commands/test_mean_scores.rb +1 -1
  128. data/test/BioDSL/commands/test_merge_pair_seq.rb +1 -1
  129. data/test/BioDSL/commands/test_merge_table.rb +1 -1
  130. data/test/BioDSL/commands/test_merge_values.rb +1 -1
  131. data/test/BioDSL/commands/test_plot_heatmap.rb +1 -1
  132. data/test/BioDSL/commands/test_plot_histogram.rb +1 -1
  133. data/test/BioDSL/commands/test_plot_matches.rb +1 -1
  134. data/test/BioDSL/commands/test_plot_residue_distribution.rb +1 -1
  135. data/test/BioDSL/commands/test_plot_scores.rb +1 -1
  136. data/test/BioDSL/commands/test_random.rb +1 -1
  137. data/test/BioDSL/commands/test_read_fasta.rb +1 -1
  138. data/test/BioDSL/commands/test_read_fastq.rb +1 -1
  139. data/test/BioDSL/commands/test_read_table.rb +1 -1
  140. data/test/BioDSL/commands/test_reverse_seq.rb +1 -1
  141. data/test/BioDSL/commands/test_slice_align.rb +1 -1
  142. data/test/BioDSL/commands/test_slice_seq.rb +1 -1
  143. data/test/BioDSL/commands/test_sort.rb +1 -1
  144. data/test/BioDSL/commands/test_split_pair_seq.rb +1 -1
  145. data/test/BioDSL/commands/test_split_values.rb +1 -1
  146. data/test/BioDSL/commands/test_trim_primer.rb +1 -1
  147. data/test/BioDSL/commands/test_trim_seq.rb +1 -1
  148. data/test/BioDSL/commands/test_uchime_ref.rb +1 -1
  149. data/test/BioDSL/commands/test_uclust.rb +1 -1
  150. data/test/BioDSL/commands/test_unique_values.rb +1 -1
  151. data/test/BioDSL/commands/test_usearch_global.rb +1 -1
  152. data/test/BioDSL/commands/test_usearch_local.rb +1 -1
  153. data/test/BioDSL/commands/test_write_fasta.rb +1 -1
  154. data/test/BioDSL/commands/test_write_fastq.rb +1 -1
  155. data/test/BioDSL/commands/test_write_table.rb +1 -1
  156. data/test/BioDSL/commands/test_write_tree.rb +1 -1
  157. data/test/BioDSL/helpers/test_options_helper.rb +3 -3
  158. data/test/BioDSL/seq/test_assemble.rb +58 -56
  159. data/test/BioDSL/seq/test_backtrack.rb +83 -81
  160. data/test/BioDSL/seq/test_digest.rb +47 -45
  161. data/test/BioDSL/seq/test_dynamic.rb +66 -64
  162. data/test/BioDSL/seq/test_homopolymer.rb +35 -33
  163. data/test/BioDSL/seq/test_kmer.rb +29 -28
  164. data/test/BioDSL/seq/test_translate.rb +44 -42
  165. data/test/BioDSL/seq/test_trim.rb +59 -57
  166. data/test/BioDSL/test_cary.rb +1 -1
  167. data/test/BioDSL/test_command.rb +2 -2
  168. data/test/BioDSL/test_csv.rb +34 -31
  169. data/test/BioDSL/test_debug.rb +31 -31
  170. data/test/BioDSL/test_fasta.rb +30 -29
  171. data/test/BioDSL/test_fastq.rb +27 -26
  172. data/test/BioDSL/test_filesys.rb +28 -27
  173. data/test/BioDSL/test_fork.rb +29 -28
  174. data/test/BioDSL/test_math.rb +31 -30
  175. data/test/BioDSL/test_mummer.rb +1 -1
  176. data/test/BioDSL/test_pipeline.rb +1 -1
  177. data/test/BioDSL/test_seq.rb +42 -41
  178. data/test/BioDSL/test_serializer.rb +35 -33
  179. data/test/BioDSL/test_stream.rb +28 -27
  180. data/test/BioDSL/test_taxonomy.rb +38 -37
  181. data/test/BioDSL/test_test.rb +32 -31
  182. data/test/BioDSL/test_tmp_dir.rb +1 -1
  183. data/test/BioDSL/test_usearch.rb +28 -27
  184. data/test/BioDSL/test_verbose.rb +32 -31
  185. data/test/helper.rb +34 -31
  186. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b828e339f7d9337acdaf88a4206cb4cf15a6778c
4
- data.tar.gz: 6c130d98ba2e9ca1c1bdf6a044bbe7d6e2c6f309
3
+ metadata.gz: 806bfca700a56365bd01a11fb981fb16363aad95
4
+ data.tar.gz: 91718f260a6e32fb38af4724cfef035a9224e072
5
5
  SHA512:
6
- metadata.gz: 8f1fcfd7080a7487fd1a75152c4da3ce7328e86e19843181a93c30d1bb94a2f8f78a065cd208002324df2e0bbbbfbde5576a6157ece8c4c6c4878a6311e0074e
7
- data.tar.gz: 31b549d1294e2be25897d824ab019154dfa570305dd1f79b041ae641b8208d2fa00e97ac36e4bdfe8a2dacd46d7cc8da9db4ddb5d75f4c9f3a3e6997aa4e0ae0
6
+ metadata.gz: 875d37e145698145b42b250a0bed8ac81ad3bb9576b48cb6e14a68515906a6b773c154a9caf33f282a9f193aaf0877484f15fffd4a2600ac266322fef7e9f347
7
+ data.tar.gz: 21aeb489434d449fbfab7950015481672b3e2734f7cb4ae3384aa66927416655c53b847c13a03f4d42f4ceb907513cfbe80772d3175d4b1e9f25c96628d625df
data/.gitignore CHANGED
@@ -8,3 +8,4 @@ pkg/
8
8
  .tags*
9
9
  tags
10
10
  8mer
11
+ Gemfile.lock
@@ -20,7 +20,7 @@
20
20
  # #
21
21
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
22
22
  # #
23
- # This software is part of BioDSL (www.github.com/maasha/BioDSL). #
23
+ # This software is part of BioDSL (http://maasha.github.io/BioDSL). #
24
24
  # #
25
25
  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
26
26
 
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # A sample Gemfile
2
+ source "https://rubygems.org"
3
+
4
+ gem 'bundler' #, '1.7.4'
5
+ gem 'mocha' #, '1.0.0'
6
+ gem 'simplecov' #, '0.9.2'
data/README.md CHANGED
@@ -1,169 +1,224 @@
1
- BioDSL
2
- =========
3
-
4
- Installation
5
- ------------
6
-
7
- `gem install BioDSL`
8
-
9
- Getting started
10
- ---------------
11
-
12
- A test script:
13
-
14
- #!/usr/bin/env ruby
15
-
16
- require 'BioDSL'
17
-
18
- p = BD.new.
19
- read_fasta(input: "input.fna").
20
- grab(select: "ATC$", keys: :SEQ).
21
- write_fasta(output: "output.fna").
22
- run(progress: true)
23
-
24
- Or using an interactive shell using the alias ibp which you can create by
25
- adding the following to your `~/.bashrc` file:
26
-
27
- alias ibp="irb -r BioDSL --noinspect"
28
-
29
- And then start the interactive shell:
30
-
31
- $ ibp
32
- irb(main):001:0> p = BD.new
33
- => BD.new
34
- irb(main):002:0> p.read_fasta(input: "input.fna")
35
- => BD.new.read_fasta(input: "input.fna")
36
- irb(main):003:0> p.grab(select: "ATC$", keys: :SEQ)
37
- => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ)
38
- irb(main):004:0> p.write_fasta(output: "output.fna")
39
- => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna")
40
- irb(main):005:0> p.run(progress: true)
41
- => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)
42
- irb(main):006:0>
43
-
44
-
45
- Or chaining commands directly:
46
-
47
- $ ibp
48
- irb(main):001:0> BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)
49
- => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)
50
- irb(main):002:0>
51
-
52
- Or run on the command line with the alias bp which you can create by adding the
53
- following to your ~/.bashrc file:
54
-
55
- alias bp="ruby -r BioDSL"
56
-
57
- Then you can run the below from the command line:
58
-
59
- $ bp -e 'BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)'
60
-
61
- Available BioDSL
62
- -------------------
63
-
64
- * [add_key] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AddKey)
65
- * [align_seq_mothur] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AlignSeqMothur)
66
- * [analyze_residue_distribution] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AnalyzeResidueDistribution)
67
- * [assemble_pairs] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AssemblePairs)
68
- * [assemble_seq_idba] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AssembleSeqIdba)
69
- * [assemble_seq_ray] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AssembleSeqRay)
70
- * [assemble_seq_spades] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/AssembleSeqSpades)
71
- * [classify_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ClassifySeq)
72
- * [classify_seq_mothur] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ClassifySeqMothur)
73
- * [clip_primer] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ClipPrimer)
74
- * [cluster_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ClusterOtus)
75
- * [collapse_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/CollapseOtus)
76
- * [collect_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/CollectOtus)
77
- * [complement_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ComplementSeq)
78
- * [count] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Count)
79
- * [degap_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/DegapSeq)
80
- * [dereplicate_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/DereplicateSeq)
81
- * [dump] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Dump)
82
- * [filter_rrna] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/FilterRrna)
83
- * [genecall] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Genecall)
84
- * [grab] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Grab)
85
- * [index_taxonomy] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/IndexTaxonomy)
86
- * [mean_scores] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/MeanScores)
87
- * [merge_pair_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/MergePairSeq)
88
- * [merge_table] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/MergeTable)
89
- * [merge_values] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/MergeValues)
90
- * [plot_heatmap] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/PlotHeatmap)
91
- * [plot_histogram] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/PlotHistogram)
92
- * [plot_matches] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/PlotMatches)
93
- * [plot_residue_distribution] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/PlotResidueDistribution)
94
- * [plot_scores] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/PlotScores)
95
- * [random] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Random)
96
- * [read_fasta] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ReadFasta)
97
- * [read_fastq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ReadFastq)
98
- * [read_table] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ReadTable)
99
- * [reverse_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/ReverseSeq)
100
- * [slice_align] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/SliceAlign)
101
- * [slice_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/SliceSeq)
102
- * [sort] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/Sort)
103
- * [split_pair_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/SplitPairSeq)
104
- * [split_values] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/SplitValues)
105
- * [trim_primer] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/TrimPrimer)
106
- * [trim_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/TrimSeq)
107
- * [uchime_ref] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/UchimeRef)
108
- * [unique_values] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/UniqueValues)
109
- * [usearch_global] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/UsearchGlobal)
110
- * [write_fasta] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/WriteFasta)
111
- * [write_fastq] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/WriteFastq)
112
- * [write_table] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/WriteTable)
113
- * [write_tree] (http://www.rubydoc.info/gems/BioDSL/1.0.1/BioDSL/WriteTree)
114
-
115
- Log and History
116
- ---------------
1
+ BioDSL (pronounced Biodiesel) is a Domain Specific Language for creating
2
+ bioinformatic analysis workflows. A workflow may consist of several pipelines
3
+ and each pipeline consists of a series of steps such as reading in data from a
4
+ file, processing the data in some way, and writing data to a new file.
5
+
6
+ BioDSL is built on the same principles as [Biopieces](http://www.biopieces.org), where
7
+ data records are passed through multiple commands each with a specific task. The
8
+ idea is that a command will process the data record if this contains the
9
+ relevant attributes that the command can process. E.g. if a data record contains
10
+ a sequence, then the command [reverse_seq](reverse_seq) will reverse that
11
+ sequence.
12
+
13
+ # Installation
14
+
15
+ The recommended way of installing BioDSL is via Ruby’s gem package manager:
16
+
17
+ `$ gem install BioDSL`
18
+
19
+ For those commands which are wrappers around third-party tools, such as Usearch,
20
+ Mothur and SPAdes, you will have to install these and make the executables
21
+ available in your `$PATH`.
22
+
23
+ # Getting started
24
+
25
+ BioDSL is implemented in Ruby making use of Ruby’s powerful metaprogramming
26
+ facilities. Thus, a workflow is basically a Ruby script containing one or more
27
+ pipelines.
28
+
29
+ Here is a test script with a single pipeline that reads all FASTA entries from
30
+ the file `input.fna`, selects all records with a sequence ending in `ATC`, and
31
+ writes those records as FASTA entries to the file `output.fna`:
32
+
33
+ ```
34
+ #!/usr/bin/env ruby
35
+
36
+ require 'BioDSL'
37
+
38
+ BD.new.
39
+ read_fasta(input: "input.fna").
40
+ grab(select: "ATC$", keys: :SEQ).
41
+ write_fasta(output: "output.fna").
42
+ run
43
+ ```
44
+
45
+ Save the test script to a file `test.biodsl` and execute on the command line:
46
+
47
+ ```
48
+ $ ruby test.biodsl
49
+ ```
50
+
51
+ # Combining multiple pipelines
52
+
53
+ This script demonstrates how multiple pipelines can be created and combined. In
54
+ the end two pipelines are run, one consisting of p1 + p2 and one consisting of
55
+ p1 + p3. The first pipeline run will produce a histogram plot of sequence length
56
+ from sequences containing the pattern `ATCG`, and the other pipeline run will
57
+ produce a plot with the sequence length distribution of sequences not matching
58
+ `ATCG`.
59
+
60
+ ```
61
+ #!/usr/bin/env ruby
62
+
63
+ require 'BioDSL'
64
+
65
+ p1 = BD.new.read_fasta(input: "test.fna")
66
+ p2 = BD.new.grab(keys: :SEQ, select: "ATCG").
67
+ plot_histogram(key: :SEQ_LEN, terminal: :png, output: "select.png")
68
+ p3 = BD.new.grab(keys: :SEQ, reject: "ATCG").
69
+ plot_histogram(key: :SEQ_LEN, terminal: :png, output: "reject.png")
70
+ p4 = p1 + p3
71
+
72
+ (p1 + p2).write_fasta(output: "select.fna").run
73
+ p4.write_fasta(output: "reject.fna").run
74
+ ```
75
+
76
+ # Running pipelines in parallel
77
+
78
+ This script demonstrates how to run multiple pipelines in parallel using 20 CPU
79
+ cores. Here we filter paired-end FASTQ entries from a list of samples described in
80
+ the file `samples.txt` which contains three tab separated columns: sample name,
81
+ a forward read file path, and a reverse read file path.
82
+
83
+ ```
84
+ #!/usr/bin/env ruby
85
+
86
+ require 'BioDSL'
87
+ require 'csv'
88
+
89
+ samples = CSV.read("samples.txt", col_sep: "\t")
90
+
91
+ Parallel.each(samples, in_processes: 20) do |sample|
92
+ BD.new.
93
+ read_fastq(input: sample[1], input2: sample[2], encoding: :base_33).
94
+ grab(keys: :SEQ, select: "ATCG").
95
+ write_fastq(output: "#{sample[0]}_filtered.fastq.bz2", bzip2: true).
96
+ run
97
+ end
98
+ ```
99
+
100
+ # Ruby one-liners
101
+
102
+ It is possible to execute BioDSL pipelines on the command line:
103
+
104
+ ```
105
+ ruby -r BioDSL -e 'BD.new.read_fasta(input: "test.fna").plot_histogram(key: :SEQ_LEN).run'
106
+ ```
107
+
108
+ And to save typing we may use the alias `bd` which is set like this on the
109
+ command line:
110
+
111
+ ```
112
+ $ alias bd='ruby -r BioDSL'
113
+ ```
114
+
115
+ It may be a good idea to save that alias in your `.bashrc` file.
116
+
117
+ Now it is possible to run a BioDSL pipeline on the command line like this:
118
+
119
+ ```
120
+ $ bd -e 'BD.new.read_fasta(input: "test.fna").plot_histogram(key: :SEQ_LEN).run'
121
+ ```
122
+
123
+ # Using the Interactive Ruby interpreter
124
+
125
+ Here we demonstrate the use of Ruby's `irb` shell:
126
+
127
+ ```
128
+ $ irb -r BioDSL --noinspect
129
+ irb(main):001:0> p = BD.new
130
+ => BD.new
131
+ irb(main):002:0> p.read_fasta(input: "input.fna")
132
+ => BD.new.read_fasta(input: "input.fna")
133
+ irb(main):003:0> p.grab(select: "ATC$", keys: :SEQ)
134
+ => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ)
135
+ irb(main):004:0> p.write_fasta(output: "output.fna")
136
+ => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna")
137
+ irb(main):005:0> p.run
138
+ => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run
139
+ irb(main):006:0>
140
+ ```
141
+
142
+ Again, it may be a good idea to save an alias `alias biodsl="irb -r BioDSL --noinspect"` to your `.bashrc` file. Thus, we can use the new `biodsl` alias to chain commands directly:
143
+
144
+ ```
145
+ $ biodsl
146
+ irb(main):001:0> BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)
147
+ => BD.new.read_fasta(input: "input.fna").grab(select: "ATC$", keys: :SEQ).write_fasta(output: "output.fna").run(progress: true)
148
+ irb(main):002:0>
149
+ ```
150
+
151
+ # History file
152
+
153
+ A history file is kept in `$HOME/.BioDSL_history` and each time `run` is called a history entry is added to this file:
154
+
155
+ ```
156
+ BD.new.read_fasta(input: "test_big.fna", first: 100).plot_histogram(key: :SEQ_LEN).run
157
+ BD.new.read_fasta(input: "test_big.fna", first: 100).plot_histogram(key: :SEQ_LEN).run
158
+ BD.new.read_fasta(input: "test_big.fna", first: 10).plot_histogram(key: :SEQ_LEN).run
159
+ BD.new.read_fasta(input: "test_big.fna").plot_histogram(key: :SEQ_LEN).run
160
+ BD.new.read_fasta(input: "test_big.fna", first: 1000).plot_histogram(key: :SEQ_LEN).run
161
+ ```
162
+
163
+ Thus it is possible to redo the last pipeline by pasting the line in irb or a Ruby one-liner.
164
+
165
+ # Log and History
117
166
 
118
167
  All BioDSL events are logged to `~/.BioDSL_log`.
119
168
 
120
169
  BioDSL history is saved to `~/.BioDSL_history`.
121
170
 
171
+ # Features
122
172
 
123
- Features
124
- --------
125
-
126
- Progress:
173
+ ## Progress
127
174
 
128
175
  Show nifty progress table with commands, records read and emitted and time.
129
176
 
130
177
  `BD.new.read_fasta(input: "input.fna").dump.run(progress: true)`
131
178
 
132
- Verbose:
179
+ ## Verbose
133
180
 
134
181
  Output verbose messages from commands and the run status.
135
182
 
136
- `BD.new.read_fasta(input: "input.fna").dump.run(verbose: true)`
183
+ ```
184
+ BD.new.read_fasta(input: "input.fna").dump.run(verbose: true)
185
+ ```
137
186
 
138
- Debug:
187
+ ## Debug
139
188
 
140
189
  Output debug messages from commands using these.
141
190
 
142
- `BD.new.read_fasta(input: "input.fna").dump.run(debug: true)`
191
+ ```
192
+ BD.new.read_fasta(input: "input.fna").dump.run(debug: true)
193
+ ```
143
194
 
144
- E-mail notification:
195
+ ## E-mail notification
145
196
 
146
197
  Send an email when run is complete.
147
198
 
148
- `BD.new.read_fasta(input: "input.fna").dump.run(email: mail@maasha.dk, subject: "Script done!")`
149
-
150
- Report:
199
+ ```
200
+ BD.new.read_fasta(input: "input.fna").dump.run(email: "bill@hotmail.com", subject: "Script done!")
201
+ ```
151
202
 
152
- Create an HTML report of the run stats:
203
+ ## Reports
153
204
 
154
- `BD.new.read_fasta(input: "input.fna").dump.run(report: "status.html")`
205
+ Create an HTML report of the run stats for a pipeline:
155
206
 
156
- Output dir:
207
+ ```
208
+ BD.new.read_fasta(input: "input.fna").dump.run(report: "status.html")
209
+ ```
157
210
 
158
- All output files from commands are put in a specified dir:
211
+ ## Output directory
159
212
 
160
- `BD.new.read_fasta(input: "input.fna").dump.run(output_dir: "Results")`
213
+ All output files from commands are put in a specified directory:
161
214
 
215
+ ```
216
+ BD.new.read_fasta(input: "input.fna").dump.run(output_dir: "Results")
217
+ ```
162
218
 
163
- Configuration File
164
- ------------------
219
+ ## Configuration File
165
220
 
166
- It is possible to pre-set options in a configuration file located in your $HOME
221
+ It is possible to pre-set options in a configuration file located in your `$HOME`
167
222
  directory called `.BioDSLrc`. Thus if an option is not already set, its value
168
223
  will fall back to the one set in the configuration file. The configuration file
169
224
  contains three whitespace separated columns:
@@ -172,34 +227,113 @@ contains three whitespace separated columns:
172
227
  * Option
173
228
  * Option value
174
229
 
175
- Lines starting with '#' are considered comments and are ignored.
230
+ Lines starting with `#` are considered comments and are ignored.
176
231
 
177
232
  An example:
178
233
 
179
- maasha@mel:~$ cat ~/.BioDSLrc
180
- uchime_ref database /home/maasha/Install/QIIME1.8/data/rdp_gold.fa
181
- uchime_ref cpus 20
234
+ ```
235
+ maasha@mel:~$ cat ~/.BioDSLrc
236
+ uchime_ref database /home/maasha/Install/QIIME1.8/data/rdp_gold.fa
237
+ uchime_ref cpus 20
238
+ ```
182
239
 
183
240
  On compute clusters it is necessary to specify the max processor count, which
184
241
  is otherwise determined as the number of cores on the current node. To override
185
242
  this add the following line:
186
243
 
187
- pipeline processor_count 1000
244
+ ```
245
+ pipeline processor_count 1000
246
+ ```
188
247
 
189
248
  It is also possible to change the temporary directory from the system's default
190
249
  by adding the following line:
191
250
 
192
- pipeline tmp_dir /home/projects/ku_microbio/scratch/tmp
193
-
194
- Contributing
195
- ------------
196
-
197
- Fork it
198
-
199
- Create your feature branch (git checkout -b my-new-feature)
200
-
201
- Commit your changes (git commit -am 'Add some feature')
202
-
203
- Push to the branch (git push origin my-new-feature)
204
-
205
- Create new Pull Request
251
+ ```
252
+ pipeline tmp_dir /home/projects/ku_microbio/scratch/tmp
253
+ ```
254
+
255
+ # Available BioDSL commands
256
+
257
+ * [add_key] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AddKey)
258
+ * [align_seq_mothur] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AlignSeqMothur)
259
+ * [analyze_residue_distribution] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AnalyzeResidueDistribution)
260
+ * [assemble_pairs] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AssemblePairs)
261
+ * [assemble_seq_idba] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AssembleSeqIdba)
262
+ * [assemble_seq_ray] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AssembleSeqRay)
263
+ * [assemble_seq_spades] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/AssembleSeqSpades)
264
+ * [classify_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ClassifySeq)
265
+ * [classify_seq_mothur] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ClassifySeqMothur)
266
+ * [clip_primer] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ClipPrimer)
267
+ * [cluster_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ClusterOtus)
268
+ * [collapse_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/CollapseOtus)
269
+ * [collect_otus] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/CollectOtus)
270
+ * [complement_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ComplementSeq)
271
+ * [count] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Count)
272
+ * [degap_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/DegapSeq)
273
+ * [dereplicate_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/DereplicateSeq)
274
+ * [dump] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Dump)
275
+ * [filter_rrna] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/FilterRrna)
276
+ * [genecall] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Genecall)
277
+ * [grab] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Grab)
278
+ * [index_taxonomy] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/IndexTaxonomy)
279
+ * [mean_scores] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/MeanScores)
280
+ * [merge_pair_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/MergePairSeq)
281
+ * [merge_table] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/MergeTable)
282
+ * [merge_values] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/MergeValues)
283
+ * [plot_heatmap] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/PlotHeatmap)
284
+ * [plot_histogram] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/PlotHistogram)
285
+ * [plot_matches] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/PlotMatches)
286
+ * [plot_residue_distribution] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/PlotResidueDistribution)
287
+ * [plot_scores] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/PlotScores)
288
+ * [random] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Random)
289
+ * [read_fasta] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ReadFasta)
290
+ * [read_fastq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ReadFastq)
291
+ * [read_table] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ReadTable)
292
+ * [reverse_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/ReverseSeq)
293
+ * [slice_align] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/SliceAlign)
294
+ * [slice_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/SliceSeq)
295
+ * [sort] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/Sort)
296
+ * [split_pair_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/SplitPairSeq)
297
+ * [split_values] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/SplitValues)
298
+ * [trim_primer] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/TrimPrimer)
299
+ * [trim_seq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/TrimSeq)
300
+ * [uchime_ref] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/UchimeRef)
301
+ * [unique_values] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/UniqueValues)
302
+ * [usearch_global] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/UsearchGlobal)
303
+ * [write_fasta] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/WriteFasta)
304
+ * [write_fastq] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/WriteFastq)
305
+ * [write_table] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/WriteTable)
306
+ * [write_tree] (http://www.rubydoc.info/gems/BioDSL/1.0.2/BioDSL/WriteTree)
307
+
308
+ # Running the test suite
309
+
310
+ BioDSL has an extensive set of unit tests that can be run after installing
311
+ development dependencies. First you need to install the bundler gem:
312
+
313
+ ```
314
+ $ gem install bundler
315
+ ```
316
+
317
+ Next you need to change to the source directory of BioDSL and run bundler to
318
+ download the gems it depends on:
319
+
320
+ ```
321
+ $ bundle install
322
+ ```
323
+
324
+ And then you run the test suite by running `rake`:
325
+
326
+ ```
327
+ $ rake
328
+ ```
329
+
330
+ And the unit tests should all run, except those omitted because a third-party
331
+ executable was missing.
332
+
333
+ # Contributing
334
+
335
+ 1. Fork it
336
+ 1. Create your feature branch (git checkout -b my-new-feature)
337
+ 1. Commit your changes (git commit -am 'Add some feature')
338
+ 1. Push to the branch (git push origin my-new-feature)
339
+ 1. Create new Pull Request