bio-ngs 0.3.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,36 @@
1
module Bio
  module Ngs
    # Command-line wrappers for the BLAST+ programs.
    #
    # Blast itself only mixes in Bio::Command::Wrapper; the concrete programs
    # are declared as nested subclasses, each binding a binary (resolved with
    # Bio::Ngs::Utils.binary) and the options it accepts.
    class Blast

      include Bio::Command::Wrapper

      # Wrapper for the +blastn+ binary.
      # Each option is declared exactly once; the defaults below select
      # output format 5 written to "blastout.xml".
      class BlastN < Blast
        set_program Bio::Ngs::Utils.binary("blastn")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "word_size", :type => :string, :desc => "Word size"
        add_option "task", :type => :string, :desc => "Task type", :default => "blastn"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end

      # Wrapper for the +blastx+ binary.
      # Same option set as BlastN minus "word_size" and "task".
      class BlastX < Blast
        set_program Bio::Ngs::Utils.binary("blastx")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ #
2
+ # bowtie-inspect.rb - Wrapper for bowtie-inspect
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+ # Usage: bowtie-inspect [options]* <ebwt_base>
12
+ # <ebwt_base> ebwt filename minus trailing .1.ebwt/.2.ebwt
13
+ #
14
+ # By default, prints FASTA records of the indexed nucleotide sequences to
15
+ # standard out. With -n, just prints names. With -s, just prints a summary of
16
+ # the index parameters and sequences. With -e, preserves colors if applicable.
17
+ #
18
+ # Options:
19
+ # -a/--across <int> Number of characters across in FASTA output (default: 60)
20
+ # -n/--names Print reference sequence names only
21
+ # -s/--summary Print summary incl. ref names, lengths, index properties
22
+ # -e/--ebwt-ref Reconstruct reference from ebwt (slow, preserves colors)
23
+ # -v/--verbose Verbose output (for debugging)
24
+ # -h/--help print detailed description of tool and its options
25
+ # --help print this usage message
26
+
27
+
28
module Bio
  module Ngs
    # Command wrapper for the +bowtie-inspect+ tool.
    #
    # The options declared here mirror the flags listed in the usage text at
    # the top of this file; the tool's output is taken from standard output.
    class BowtieInspect
      include Bio::Command::Wrapper

      # The binary is resolved by name through Bio::Ngs::Utils.binary.
      # Open design questions from the original author:
      # * should the user have to provide a complete path to the tool, or is
      #   it better to identify the program from just a name, looking into
      #   ext/ or the host system path?
      # * why not derive the file name from the class name when it is not
      #   specified?
      set_program Bio::Ngs::Utils.binary("bowtie-inspect")

      set_output :stdout

      add_option "across",   type: :numeric, aliases: '-a'
      add_option "names",    type: :boolean, aliases: '-n'
      add_option "summary",  type: :boolean, aliases: '-s'
      add_option "ebwt-ref", type: :boolean, aliases: '-e'
      add_option "verbose",  type: :boolean, aliases: '-v'
    end # BowtieInspect
  end # Ngs
end # Bio
@@ -0,0 +1,489 @@
1
+ #
2
+ # cufflinks.rb - Wrappers for cufflinks, cuffdiff and cuffcompare
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+
12
+ module Bio
13
+ module Ngs
14
+ module Cufflinks
15
# Cufflinks release family these wrappers are written against.
VERSION = "1.0.X"

# Returns the VERSION string declared above.
def self.version
  VERSION
end
21
+
22
+
23
+ # cufflinks v1.0.2 (2335)
24
+ # linked against Boost version 104000
25
+ # -----------------------------
26
+ # Usage: cufflinks [options] <hits.sam>
27
+ # Options:
28
+ #
29
+ # -p/--num-threads number of threads used during analysis [ default: 1 ]
30
+ # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
31
+ # -G/--GTF quantitate against reference transcript annotations
32
+ # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.15 ]
33
+ # -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
34
+ # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
35
+ # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
36
+ # -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
37
+ # -M/--mask-file ignore all alignment within transcripts in this file
38
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
39
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
40
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
41
+ # -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
42
+ #
43
+ # Advanced Options:
44
+ #
45
+ # -N/--quartile-normalization use quartile normalization instead of total counts [ default: FALSE ]
46
+ # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.01 ]
47
+ # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.12 ]
48
+ # -m/--frag-len-mean the average fragment length [ default: 200 ]
49
+ # -s/--frag-len-std-dev the fragment length standard deviation [ default: 80 ]
50
+ # --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
51
+ # --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
52
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
53
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
54
+ # --library-type Library prep used for input reads [ default: below ]
55
+ # --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
56
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
57
+ # --min-intron-length minimum intron size allowed in genome [ default: 50 ]
58
+ # Supported library types:
59
+ # ff-firststrand
60
+ # ff-secondstrand
61
+ # ff-unstranded
62
+ # fr-firststrand
63
+ # fr-secondstrand
64
+ # fr-unstranded (default)
65
+ # transfrags
66
# Wrapper for the +cufflinks+ binary.
#
# The options follow the usage text quoted above. Defaults that are
# commented out were disabled by the original author (see the TODO below),
# so for those options cufflinks falls back to its own built-in values.
class Quantification

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("cufflinks")

  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
  add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
  add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
  add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
  add_option "mask-file", :type => :string, :aliases => '-M'
  add_option "verbose", :type => :boolean, :aliases => '-v'
  add_option "quiet", :type => :boolean, :aliases => '-q'
  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
  add_option "reference-seq", :type => :string, :aliases => '-r'
  add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
  #TODO Check why with these defaults is not working properly
  # The long name must be "frag-len-mean" (-m/--frag-len-mean in the usage text).
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
  add_option "num-importance-samples", :type => :numeric#, :default => 1000
  add_option "max-mle-iterations", :type => :numeric#, :default => 5000
  add_option "library-type", :type => :string
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
  add_option "min-intron-length", :type => :numeric#, :default => 50
end #Quantification
100
+
101
+ # cuffdiff v1.0.2 (2336)
102
+ # -----------------------------
103
+ # Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
104
+ # Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
105
+ # General Options:
106
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
107
+ # -T/--time-series treat samples as a time-series [ default: FALSE ]
108
+ # -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
109
+ # --FDR False discovery rate used in testing [ default: 0.05 ]
110
+ # -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
111
+ # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
112
+ # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
113
+ # -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
114
+ # -L/--labels comma-separated list of condition labels
115
+ # -p/--num-threads number of threads used during quantification [ default: 1 ]
116
+ #
117
+ # Advanced Options:
118
+ # --library-type Library prep used for input reads [ default: below ]
119
+ # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
120
+ # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
121
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
122
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
123
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
124
+ # --total-hits-norm count all hits for normalization [ default: FALSE ]
125
+ # --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
126
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
127
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
128
+ # --no-update-check do not contact server to check for update availability[ default: FALSE ]
129
+ # --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
130
+ #
131
+ # Supported library types:
132
+ # ff-firststrand
133
+ # ff-secondstrand
134
+ # ff-unstranded
135
+ # fr-firststrand
136
+ # fr-secondstrand
137
+ # fr-unstranded (default)
138
+ # transfrags
139
+ class Diff
140
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffdiff")

# Option declarations for cuffdiff, following the usage text quoted above.
#
# NOTE(review): several short aliases below do not exist in that usage text
# (e.g. '-F' for FDR, '-l', '-i', '-e', '-h', '-t', '-d', '-j') and '-b' is
# listed there as --frag-bias-correct, not --emit-count-tables. This class
# does not call use_aliases, so presumably only the long names reach the
# command line -- confirm before enabling aliases here.
add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
add_option "time-series", :type => :boolean, :aliases => '-T'
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
add_option "FDR", :type => :numeric, :aliases => '-F'
#TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
#TODO:FIX add_option "frag-bias-correct", :type =>
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
# Alias written with its leading dash, like every other alias in this file.
add_option "upper-quartile-norm", :type => :boolean, :aliases => '-N'
add_option "labels", :type => :array, :aliases => '-L'
add_option "num-threads", :type => :numeric, :aliases => '-p'
add_option "library-type", :type => :string, :aliases => '-l'
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
add_option "verbose", :type => :boolean, :aliases => '-v'
add_option "quiet", :type => :boolean, :aliases => '-q'
add_option "no-update-check", :type => :boolean, :aliases => '-j'
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
166
+
167
+ #Examples
168
+ #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
169
+ #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
170
+
171
+ class << self
172
+
173
+ #Return the version of CuffDiff used to produce the output
174
# Guesses which cuffdiff release wrote +diff+ from the number of
# whitespace-separated columns in its header line.
#
# diff:: path to a cuffdiff +*.diff+ output file
#
# Returns "0.9.X" for a 12-column header, Bio::Ngs::Cufflinks.version for a
# 14-column header, and nil for any other column count.
# Raises EOFError (from IO#readline) when the file is empty.
def version(diff)
  # Block form so the handle is closed even when readline raises.
  header = File.open(diff, 'r') { |f| f.readline }
  case header.split.size
  when 12
    "0.9.X"
  when 14
    Bio::Ngs::Cufflinks.version #latest
  end
end#version
186
+
187
+
188
# Maps a cufflinks version family to a column offset:
# "0.9.X" gives 0, "1.0.X" gives 1, anything else gives nil.
def offset_by_version(cufflinks_version)
  { "0.9.X" => 0, "1.0.X" => 1 }[cufflinks_version]
end
196
+
197
+ #write a file with the information
198
+ #See process_de for options available
199
+ # Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
200
+ # "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
201
+ # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
202
# Runs process_de on +diff+ and +gtf+ and formats each reported entry as
# "<first winner>\t<nearest_ref>_<gene_name>\t<tab-separated values>".
# +options+ is forwarded untouched to process_de.
def isoforms(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    sample = info[:winner].first
    label = "#{kb[reference][:nearest_ref]}_#{kb[reference][:gene_name]}"
    [sample, label, values.join("\t")].join("\t")
  end
end #isoform
207
+
208
+ #write a file with the information
209
+ #See process_de for options available
210
+ # Example: Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp.diff",
211
+ # "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
212
+ # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
213
# Runs process_de on +diff+ and +gtf+ and formats each reported entry as
# "<first winner>\t<gene_name>\t<tab-separated values>".
# +options+ is forwarded untouched to process_de.
def genes(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    sample = info[:winner].first
    gene = kb[reference][:gene_name]
    [sample, gene, values.join("\t")].join("\t")
  end
end #genes
218
+
219
+ private
220
+ #Options hash
221
+ # :fold(float), :min_samples(integer), :min_fpkm(float), :only_significative(boolean, false), :z_scores(boolean, false)
222
+ # :regulated(symbol :up or :down default :up)
223
# Filters a cuffdiff +*.diff+ table and writes a tab-separated summary file
# next to it.
#
# diff::    path to the cuffdiff output (isoform_exp.diff / gene_exp.diff)
# gtf::     path to the cuffcompare combined GTF; its knowledge base is
#           loaded when it already exists, otherwise it is built
# options:: :fold (log2 fold threshold, default 0.0),
#           :min_samples (default 0), :min_fpkm (default 0.0),
#           :only_significative (default false), :z_scores (default false),
#           :regulated (:up or :down, default :up; strings are accepted)
#
# The block receives (dict_info, diff_reference, gtf_kb, values) for every
# reference that has a :winner and must return the text line to write.
# The output file is named
# "<diff basename>-f<fold>_s<min_samples>_fpkm<min_fpkm>[_z]<regulated>.txt"
# in the directory of +diff+. Returns the sorted array of written lines.
#
# Raises ArgumentError when :regulated is neither :up nor :down.
def process_de(diff, gtf, options={})
  fold = options[:fold] || 0.0
  min_samples = options[:min_samples] || 0
  min_fpkm = options[:min_fpkm] || 0.0
  only_significative = options[:only_significative] || false
  z_scores = options[:z_scores] || false
  regulated = (options[:regulated] || :up).to_sym
  # Any other value used to fall through the case below and yield rows
  # without a sample name.
  unless [:up, :down].include?(regulated)
    raise ArgumentError, "regulated must be :up or :down, got #{regulated.inspect}"
  end

  compare = Bio::Ngs::Cufflinks::Compare
  gtf_kb = compare.exists_kb?(gtf) ? compare.load_compare_kb(gtf) : compare.build_compare_kb(gtf)

  #convert log2 fold value into natural log value (internally computed by cuffdiff)
  fold_log2 = fold
  fold = fold==0 ? 0.0 : (fold*Math.log(2))

  # dict[reference][sample] => samples it was compared against, plus the
  # special keys :values (sample => fpkm) and :winner (list of samples).
  # NOTE: every missing key is auto-created as [], so :values must be
  # tested with key? and never with ||=.
  dict = Hash.new {|h, k| h[k]=Hash.new{|hh,kk| hh[kk]=[]} }
  # Reading a key is enough to register a sample name (default block).
  dict_samples = Hash.new{|h,k| h[k]=""}

  #which offset may I consider to get data from cuffdiff?
  cufflink_version_offset = offset_by_version(version(diff))

  File.open(diff,'r') do |f|
    f.readline #skip header

    q_first = 3 + cufflink_version_offset
    q_second = 4 + cufflink_version_offset
    fpkm_first = 6 + cufflink_version_offset
    fpkm_second = 7 + cufflink_version_offset
    fold_position = 8 + cufflink_version_offset
    significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)

    f.each_line do |line|
      data = line.split
      next if data.empty? # tolerate blank lines

      # Normalise the row so that the fold is non-negative: either drop
      # the minus sign, or swap the two samples and their fpkm values.
      if data[fold_position].to_f<=0
        data[fold_position] = data[fold_position].sub(/-/,"")
      else
        data[fpkm_first], data[fpkm_second] = data[fpkm_second], data[fpkm_first]
        data[q_first], data[q_second] = data[q_second], data[q_first]
      end

      dict_samples[data[q_first]]
      dict_samples[data[q_second]]

      # A row is kept only when flagged significant; unless
      # :only_significative is set it must also reach the fold threshold.
      next unless data[significant_position]=="yes"
      next unless only_significative==true || data[fold_position].to_f>=fold
      next unless data[fpkm_first].to_f>=min_fpkm && data[fpkm_second].to_f>=min_fpkm
      #TODO add threshold value below min fpkm

      k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
      sample_first = data[q_first].to_sym
      sample_second = data[q_second].to_sym
      entry = dict[k_reference]

      k_sample = case regulated
                 when :up
                   entry[sample_first] << sample_second
                   sample_first
                 when :down
                   entry[sample_second] << sample_first
                   sample_second
                 end

      entry[:values] = {} unless entry.key?(:values)
      # Each fpkm is stored under the sample it belongs to, whichever
      # direction is requested; the first value seen for a sample is kept.
      entry[:values][sample_first] = data[fpkm_first].to_f unless entry[:values].key?(sample_first)
      entry[:values][sample_second] = data[fpkm_second].to_f unless entry[:values].key?(sample_second)

      entry[:winner] << k_sample if entry[k_sample].size >= min_samples
    end #each line

    #example structure
    #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
  end #file.open

  file_lines = []
  dict.each do |diff_reference, dict_info|
    #TODO not winner or number of min samples
    next unless dict_info.key?(:winner)

    fpkm_values = dict_info[:values].sort.map { |_sample, fpkm| fpkm }
    valz = case z_scores
           when true
             # average / standard_deviation are not core Array methods;
             # presumably provided by the project's enumerable extension.
             # NOTE(review): a zero standard deviation yields NaN/Infinity.
             average = fpkm_values.average
             stdev = fpkm_values.standard_deviation
             fpkm_values.map { |fpkm| (fpkm-average)/stdev }
           when false
             fpkm_values
           end #case

    file_lines << yield(dict_info, diff_reference, gtf_kb, valz)
  end # dict_each

  file_name_output = File.join(File.dirname(diff),File.basename(diff,".diff")+"-f#{fold_log2}_s#{min_samples}_fpkm#{min_fpkm}")
  file_name_output += "_z" if z_scores
  file_name_output += regulated.to_s
  file_name_output += ".txt"

  # NOTE(review): the header lists every sample seen in the file, while a
  # row only carries values for the samples stored for that reference --
  # confirm the columns line up for the intended inputs.
  File.open(file_name_output,'w') do |odiff|
    odiff.puts "sample\thumanized_id\t#{dict_samples.keys.sort.join("\t")}"
    file_lines.sort.each do |file_line|
      odiff.puts file_line
    end#each sorted line
  end#open
end #process_de
351
+ end
352
+
353
+ end #Diff
354
+
355
+
356
+ # cuffcompare v1.0.2 (2335)
357
+ # -----------------------------
358
+ # Usage:
359
+ # cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
360
+ # [-o <outprefix>] [-p <cprefix>]
361
+ # {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
362
+ #
363
+ # Cuffcompare provides classification, reference annotation mapping and various
364
+ # statistics for Cufflinks transfrags.
365
+ # Cuffcompare clusters and tracks transfrags across multiple samples, writing
366
+ # matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
367
+ # file <outprefix>.combined.gtf containing a nonredundant set of transcripts
368
+ # across all input files (with a single representative transfrag chosen
369
+ # for each clique of matching transfrags across samples).
370
+ #
371
+ # Options:
372
+ # -i provide a text file with a list of Cufflinks GTF files to process instead
373
+ # of expecting them as command line arguments (useful when a large number
374
+ # of GTF files should be processed)
375
+ #
376
+ # -r a set of known mRNAs to use as a reference for assessing
377
+ # the accuracy of mRNAs or gene models given in <input.gtf>
378
+ #
379
+ # -R for -r option, reduce the set of reference transcripts to
380
+ # only those found to overlap any of the input loci
381
+ # -M discard (ignore) single-exon transfrags and reference transcripts
382
+ # -N discard (ignore) single-exon reference transcripts
383
+ #
384
+ # -s <seq_path> can be a multi-fasta file with all the genomic sequences or
385
+ # a directory containing multiple single-fasta files (one file per contig);
386
+ # lower case bases will be used to classify input transcripts as repeats
387
+ #
388
+ # -d max distance (range) for grouping transcript start sites (100)
389
+ # -p the name prefix to use for consensus transcripts in the
390
+ # <outprefix>.combined.gtf file (default: 'TCONS')
391
+ # -C include the "contained" transcripts in the .combined.gtf file
392
+ # -G generic GFF input file(s) (do not assume Cufflinks GTF)
393
+ # -T do not generate .tmap and .refmap files for each input file
394
+ # -V verbose processing mode (showing all GFF parsing warnings)
395
+ class Compare
396
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffcompare")
use_aliases
#TODO: add descriptions
# Each option name is a readable label; the alias is the cuffcompare flag
# from the usage text above.
add_option "outprefix",                    type: :string,  aliases: '-o', default: "Comparison"
add_option "gtf_combine_file",             type: :string,  aliases: '-i'
add_option "gtf_reference",                type: :string,  aliases: '-r'
add_option "only_overlap",                 type: :boolean, aliases: '-R'
add_option "discard_transfrags",           type: :boolean, aliases: '-M'
add_option "discard_ref_transcripts",      type: :boolean, aliases: '-N'
add_option "multi_fasta",                  type: :string,  aliases: '-s'
add_option "distance_tss",                 type: :numeric, aliases: '-d'
add_option "prefix_transcripts_consensus", type: :string,  aliases: '-p'
add_option "contained",                    type: :boolean, aliases: '-C'
add_option "GFF",                          type: :boolean, aliases: '-G'
add_option "no_map_files",                 type: :boolean, aliases: '-T'
413
+
414
+ class << self
415
+
416
# Returns the knowledge-base file name that belongs to +gtf+: the last
# extension is replaced by ".kb".
#
# When +gtf+ has no extension, ".kb" is appended, so the result never
# equals the GTF path itself (build_compare_kb writes to this name).
def kb_name(gtf)
  extension = /\.[a-zA-Z0-9]*$/
  gtf =~ extension ? gtf.sub(extension, ".kb") : "#{gtf}.kb"
end
419
+
420
# True when the knowledge-base file derived from +gtf+ (see kb_name) is
# present on disk.
def exists_kb?(gtf)
  # File.exist? is the supported spelling; File.exists? is gone in Ruby 3.2+.
  File.exist?(kb_name(gtf))
end
423
+
424
+ # Dump a hash of associations from a GTF file generated by CuffCompare
425
+ # gene_id: transcript_id, gene_name, oid, nearest_ref
426
+ # gene_id example: :XLOC_000001=>{:gene_name=>:RP11-304M2.1, :transcripts=>{:TCONS_00000001=>{:oid=>:ENST00000519787, :nearest_ref=>:ENST00000519787}}}
427
+ # the others are just plain hash
428
+ # transcript_id: gene_id, gene_name, oid, nearest_ref
429
+ # gene_name: gene_id, transcript_id, oid, nearest_ref
430
+ # oid: gene_id, transcript_id, gene_name, nearest_ref
431
+ # nearest_ref: gene_id, transcript_id, gene_name, oid
432
+ #Note:exons and coordinates are not saved.
433
# Builds the lookup table ("knowledge base") for a cuffcompare combined
# GTF, dumps it with Marshal to kb_name(gtf) and returns it.
#
# The table is one flat Hash keyed by symbols: every gene_id,
# transcript_id, gene_name, oId and nearest_ref found in the file maps to a
# Hash with the other identifiers of the same line. gene_id entries also
# carry :transcripts (transcript_id => {:odi, :nearest_ref}).
#
# NOTE(review): the original id is stored under the key :odi, which looks
# like a typo for :oid; it is kept because already-dumped .kb files and
# callers outside this file may rely on it.
#
# Lines without both gene_id and transcript_id are skipped. An attribute
# that is absent from a line is stored as nil and is not used as a key.
#
# Returns nil (after a message on STDERR) when +gtf+ does not exist.
def build_compare_kb(gtf)
  unless File.exist?(gtf)
    STDERR.puts "File #{gtf} doesn't exist."
    return nil
  end

  # Value of the attribute +name+ on +line+ as a symbol with the double
  # quotes removed, or nil when the attribute is not on the line.
  attribute = lambda do |line, name|
    value = line[/#{name} (.*?);/, 1]
    value && value.gsub(/"/,'').to_sym
  end

  dict = {} #build an hash with the combinations of data extracted from GTF file, XLOC, TCONS, ENST, SYMBOL
  File.open(gtf,'r') do |f|
    f.each_line do |line|
      gene_id = attribute.call(line, "gene_id")
      transcript_id = attribute.call(line, "transcript_id")
      next unless gene_id && transcript_id

      gene_name = attribute.call(line, "gene_name")
      oid = attribute.call(line, "oId")
      nearest_ref = attribute.call(line, "nearest_ref")

      unless dict.key?(gene_id)
        dict[gene_id]={:gene_name=>gene_name,:transcripts=>{}}
      end
      unless dict[gene_id][:transcripts].key?(transcript_id)
        dict[gene_id][:transcripts][transcript_id]={:odi=>oid, :nearest_ref=>nearest_ref}
      end
      dict[transcript_id]={:gene_id=>gene_id, :gene_name=>gene_name, :odi=>oid, :nearest_ref=>nearest_ref}
      dict[gene_name]={:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :nearest_ref=>nearest_ref} if gene_name
      dict[oid]={:gene_id=>gene_id, :transcript_id=>transcript_id, :gene_name=>gene_name, :nearest_ref=>nearest_ref} if oid
      dict[nearest_ref]={:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :gene_name=>gene_name} if nearest_ref
    end#lines
  end#file

  # Marshal output is binary, so the dump is written in binary mode.
  File.open(kb_name(gtf),'wb') do |fkb|
    Marshal.dump(dict,fkb)
  end #fkb
  dict
end #build_compare_kb
471
+
472
+ # Return the hash of associations
473
+ # gene_id: transcript_id, gene_name, oid, nearest_ref
474
+ # transcript_id: gene_id, gene_name, oid, nearest_ref
475
+ # gene_name: gene_id, transcript_id, oid, nearest_ref
476
+ # oid: gene_id, transcript_id, gene_name, nearest_ref
477
+ # nearest_ref: gene_id, transcript_id, gene_name, oid
478
# Loads and returns the Hash previously dumped for +gtf+ by
# build_compare_kb, reading it from kb_name(gtf).
#
# Raises Errno::ENOENT when the .kb file does not exist.
#
# SECURITY: Marshal.load must only be used on .kb files produced by this
# library; loading a file from an untrusted source can execute arbitrary
# code.
def load_compare_kb(gtf)
  #TODO rescue Exceptions
  # Marshal data is binary, so the dump is read in binary mode.
  File.open(kb_name(gtf),'rb') do |kb_dump|
    Marshal.load(kb_dump)
  end
end #load_compare_kb
485
+ end
486
+ end #Compare
487
+ end #Cufflinks
488
+ end #Ngs
489
+ end #Bio