bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,36 @@
1
module Bio
  module Ngs
    # Base class for the BLAST+ command wrappers. The command-line DSL
    # (set_program / add_option) comes from Bio::Command::Wrapper.
    class Blast

      include Bio::Command::Wrapper

      # Wrapper for the `blastn` executable.
      class BlastN < Blast
        set_program Bio::Ngs::Utils.binary("blastn")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        # "query" used to be declared twice; it is declared once now.
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        # The description was a copy-paste of the "query" one.
        add_option "word_size", :type => :string, :desc => "Word size for wordfinder algorithm"
        add_option "task", :type => :string, :desc => "Task type", :default => "blastn"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end

      # Wrapper for the `blastx` executable.
      class BlastX < Blast
        set_program Bio::Ngs::Utils.binary("blastx")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        # "query" used to be declared twice; it is declared once now.
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ #
2
+ # bowtie-inspect.rb - Wrapper for bowtie-inspect
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+ # Usage: bowtie-inspect [options]* <ebwt_base>
12
+ # <ebwt_base> ebwt filename minus trailing .1.ebwt/.2.ebwt
13
+ #
14
+ # By default, prints FASTA records of the indexed nucleotide sequences to
15
+ # standard out. With -n, just prints names. With -s, just prints a summary of
16
+ # the index parameters and sequences. With -e, preserves colors if applicable.
17
+ #
18
+ # Options:
19
+ # -a/--across <int> Number of characters across in FASTA output (default: 60)
20
+ # -n/--names Print reference sequence names only
21
+ # -s/--summary Print summary incl. ref names, lengths, index properties
22
+ # -e/--ebwt-ref Reconstruct reference from ebwt (slow, preserves colors)
23
+ # -v/--verbose Verbose output (for debugging)
24
+ # -h/--help print detailed description of tool and its options
25
+ # --help print this usage message
26
+
27
+
28
module Bio
  module Ngs
    # Wrapper for the bowtie-inspect executable (see the usage text above
    # for the meaning of each flag).
    class BowtieInspect

      include Bio::Command::Wrapper

      # The binary is looked up through Bio::Ngs::Utils.binary.
      # Open design notes from the original author:
      #  * the user should be able to provide a complete path to the tool;
      #  * it would be better to identify the program from just its name,
      #    looking into the ext/ directory or the host system path;
      #  * why not derive the file name from the class name when it is not
      #    specified?
      set_program Bio::Ngs::Utils.binary("bowtie-inspect")

      # Declares :stdout as the output of the command.
      set_output :stdout

      # The only option that takes a value.
      add_option "across", :type => :numeric, :aliases => '-a'

      # Boolean switches, declared in the same order as before.
      [["names", '-n'], ["summary", '-s'], ["ebwt-ref", '-e'], ["verbose", '-v']].each do |flag, short|
        add_option flag, :type => :boolean, :aliases => short
      end
    end #BowtieInspect
  end #Ngs
end #Bio
@@ -0,0 +1,489 @@
1
+ #
2
+ # cufflinks.rb - description
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+
12
+ module Bio
13
+ module Ngs
14
+ module Cufflinks
15
# Cufflinks release family whose output layout is treated as "latest".
VERSION = "1.0.X"

# Returns the Cufflinks release family this wrapper targets (VERSION).
def self.version
  VERSION
end
21
+
22
+
23
+ # cufflinks v1.0.2 (2335)
24
+ # linked against Boost version 104000
25
+ # -----------------------------
26
+ # Usage: cufflinks [options] <hits.sam>
27
+ # Options:
28
+ #
29
+ # -p/--num-threads number of threads used during analysis [ default: 1 ]
30
+ # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
31
+ # -G/--GTF quantitate against reference transcript annotations
32
+ # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.15 ]
33
+ # -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
34
+ # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
35
+ # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
36
+ # -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
37
+ # -M/--mask-file ignore all alignment within transcripts in this file
38
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
39
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
40
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
41
+ # -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
42
+ #
43
+ # Advanced Options:
44
+ #
45
+ # -N/--quartile-normalization use quartile normalization instead of total counts [ default: FALSE ]
46
+ # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.01 ]
47
+ # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.12 ]
48
+ # -m/--frag-len-mean the average fragment length [ default: 200 ]
49
+ # -s/--frag-len-std-dev the fragment length standard deviation [ default: 80 ]
50
+ # --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
51
+ # --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
52
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
53
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
54
+ # --library-type Library prep used for input reads [ default: below ]
55
+ # --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
56
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
57
+ # --min-intron-length minimum intron size allowed in genome [ default: 50 ]
58
+ # Supported library types:
59
+ # ff-firststrand
60
+ # ff-secondstrand
61
+ # ff-unstranded
62
+ # fr-firststrand
63
+ # fr-secondstrand
64
+ # fr-unstranded (default)
65
+ # transfrags
66
# Wrapper for the `cufflinks` executable. Option names mirror the long
# options listed in the usage text above; aliases are the short flags.
class Quantification

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("cufflinks")

  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
  add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
  add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
  add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
  add_option "mask-file", :type => :string, :aliases => '-M'
  add_option "verbose", :type => :boolean, :aliases => '-v'
  add_option "quiet", :type => :boolean, :aliases => '-q'
  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
  add_option "reference-seq", :type => :string, :aliases => '-r'
  add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
  #TODO Check why these options do not work properly when the defaults below are enabled
  # The option was misspelled "farg-len-mean"; the cufflinks flag is --frag-len-mean.
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
  add_option "num-importance-samples", :type => :numeric#, :default => 1000
  add_option "max-mle-iterations", :type => :numeric#, :default => 5000
  add_option "library-type", :type => :string
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
  add_option "min-intron-length", :type => :numeric#, :default => 50
end #Quantification
100
+
101
+ # cuffdiff v1.0.2 (2336)
102
+ # -----------------------------
103
+ # Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
104
+ # Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
105
+ # General Options:
106
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
107
+ # -T/--time-series treat samples as a time-series [ default: FALSE ]
108
+ # -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
109
+ # --FDR False discovery rate used in testing [ default: 0.05 ]
110
+ # -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
111
+ # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
112
+ # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
113
+ # -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
114
+ # -L/--labels comma-separated list of condition labels
115
+ # -p/--num-threads number of threads used during quantification [ default: 1 ]
116
+ #
117
+ # Advanced Options:
118
+ # --library-type Library prep used for input reads [ default: below ]
119
+ # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
120
+ # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
121
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
122
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
123
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
124
+ # --total-hits-norm count all hits for normalization [ default: FALSE ]
125
+ # --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
126
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
127
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
128
+ # --no-update-check do not contact server to check for update availability[ default: FALSE ]
129
+ # --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
130
+ #
131
+ # Supported library types:
132
+ # ff-firststrand
133
+ # ff-secondstrand
134
+ # ff-unstranded
135
+ # fr-firststrand
136
+ # fr-secondstrand
137
+ # fr-unstranded (default)
138
+ # transfrags
139
+ class Diff
140
+ include Bio::Command::Wrapper
141
+
142
# Command-line description for the `cuffdiff` executable. Option names
# mirror the long options listed in the usage text above.
set_program Bio::Ngs::Utils.binary("cuffdiff")

add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
add_option "time-series", :type => :boolean, :aliases => '-T'
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
add_option "FDR", :type => :numeric, :aliases => '-F'
#TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
#TODO:FIX add_option "frag-bias-correct", :type =>
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
# The alias was 'N' (no leading dash), unlike every other alias in this file.
add_option "upper-quartile-norm", :type => :boolean, :aliases => '-N'
add_option "labels", :type => :array, :aliases => '-L'
add_option "num-threads", :type => :numeric, :aliases => '-p'
# NOTE(review): the usage text above lists no short flag for the options
# from "library-type" down to "emit-count-tables" (except -m, -s, -v, -q);
# the remaining aliases look wrapper-specific -- confirm they are intended.
add_option "library-type", :type => :string, :aliases => '-l'
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
add_option "verbose", :type => :boolean, :aliases => '-v'
add_option "quiet", :type => :boolean, :aliases => '-q'
add_option "no-update-check", :type => :boolean, :aliases => '-j'
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
166
+
167
+ #Examples
168
+ #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
169
+ #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
170
+
171
+ class << self
172
+
173
# Return the version family of CuffDiff that produced the +diff+ file,
# guessed from the number of whitespace-separated columns in its header.
#
# diff:: path to a cuffdiff *.diff file
#
# Returns "0.9.X" for a 12-column header, Bio::Ngs::Cufflinks.version for a
# 14-column header and nil for any other layout. Raises EOFError when the
# file is empty (no header line to read).
def version(diff)
  # Block form guarantees the handle is closed even if readline raises.
  header = File.open(diff, 'r') { |f| f.readline }
  case header.split.size
  when 12
    "0.9.X"
  when 14
    Bio::Ngs::Cufflinks.version #latest
  end
end#version
186
+
187
+
188
# Column shift to apply when reading a cuffdiff file produced by the given
# version family: 0 for "0.9.X", 1 for "1.0.X", nil for anything else.
def offset_by_version(cufflinks_version)
  return 0 if "0.9.X" == cufflinks_version
  return 1 if "1.0.X" == cufflinks_version
  nil
end
196
+
197
# Write the differential-expression summary for isoforms.
# Each output row is: winning sample, "<nearest_ref>_<gene_name>" taken from
# the GTF knowledge base, then the tab-separated values handed over by
# process_de. See process_de for the available options.
# Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
#  "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
#  fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
def isoforms(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    winner = info[:winner].first
    annotation = kb[reference]
    label = "#{annotation[:nearest_ref]}_#{annotation[:gene_name]}"
    columns = values.join("\t")
    "#{winner}\t#{label}\t#{columns}"
  end
end #isoform
207
+
208
# Write the differential-expression summary for genes.
# Each output row is: winning sample, gene name taken from the GTF
# knowledge base, then the tab-separated values handed over by process_de.
# See process_de for the available options.
# Example: Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp.diff",
#  "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
#  fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
def genes(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    winner = info[:winner].first
    gene_name = kb[reference][:gene_name]
    columns = values.join("\t")
    "#{winner}\t#{gene_name}\t#{columns}"
  end
end #genes
218
+
219
+ private
220
# Filter a cuffdiff *.diff file and write a tab-separated summary next to it.
#
# diff::    path to the cuffdiff output (isoform_exp.diff / gene_exp.diff)
# gtf::     path to the cuffcompare combined GTF; its knowledge base is
#           loaded from the cache when present, otherwise built
# options:: hash with the keys
#   :fold(float, log2, default 0.0), :min_samples(integer, default 0),
#   :min_fpkm(float, default 0.0), :only_significative(boolean, false),
#   :z_scores(boolean, false), :regulated(symbol :up or :down, default :up)
#
# The block receives (dict_info, diff_reference, gtf_kb, values) for every
# reference that has a winner and must return the text line to write.
# The output file is named after +diff+ plus the parameters used, and the
# method returns the sorted array of lines written.
def process_de(diff, gtf, options={})
  fold = options[:fold] || 0.0
  min_samples = options[:min_samples] || 0
  min_fpkm = options[:min_fpkm] || 0.0
  only_significative = options[:only_significative] || false
  z_scores = options[:z_scores] || false
  #TODO improve check on parameters
  regulated = options[:regulated] || :up

  compare = Bio::Ngs::Cufflinks::Compare
  gtf_kb = compare.exists_kb?(gtf) ? compare.load_compare_kb(gtf) : compare.build_compare_kb(gtf)

  #convert log2 fold value into natural log value (internally computed by cuffdiff)
  fold_log2 = fold
  fold = fold==0 ? 0.0 : (fold*Math.log(2))

  # reference => { sample => [samples it beats], :values => {sample => fpkm}, :winner => [samples] }
  dict = Hash.new { |h, k| h[k] = Hash.new { |hh, kk| hh[kk] = [] } }
  # Only the keys matter: the default block registers every sample name seen.
  dict_samples = Hash.new { |h, k| h[k] = "" }

  #which offset may I consider to get data from cuffdiff?
  cufflink_version_offset = offset_by_version(version(diff))

  File.open(diff, 'r') do |f|
    f.readline #skip header

    q_first = 3 + cufflink_version_offset
    q_second = 4 + cufflink_version_offset
    fpkm_first = 6 + cufflink_version_offset
    fpkm_second = 7 + cufflink_version_offset
    fold_position = 8 + cufflink_version_offset
    significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)

    f.each_line do |line|
      data = line.split
      next if data.empty? # blank lines used to crash on nil.sub

      # Normalise the row so that the "first" sample is always the one with
      # the higher expression: either drop the sign of the fold, or swap the
      # two samples together with their FPKM values.
      if data[fold_position].to_f <= 0
        data[fold_position] = data[fold_position].sub(/-/, "")
      else
        data[fpkm_first], data[fpkm_second] = data[fpkm_second], data[fpkm_first]
        data[q_first], data[q_second] = data[q_second], data[q_first]
      end

      # Touching the keys registers both sample names for the header line.
      dict_samples[data[q_first]]
      dict_samples[data[q_second]]

      # A row is kept when cuffdiff flags it as significant, it passes the
      # fold threshold (unless only significance is requested) and both
      # FPKM values reach the minimum.
      next unless data[significant_position] == "yes" &&
                  (only_significative == true || data[fold_position].to_f >= fold) &&
                  data[fpkm_first].to_f >= min_fpkm &&
                  data[fpkm_second].to_f >= min_fpkm
      #TODO add threshold value below min fpkm for the rejected rows

      k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
      first_sample = data[q_first].to_sym
      second_sample = data[q_second].to_sym
      entry = dict[k_reference]

      #TODO refactor: this can be done using lambda
      k_sample = case regulated
                 when :up
                   entry[first_sample] << second_sample
                   first_sample
                 when :down
                   entry[second_sample] << first_sample
                   second_sample
                 end

      entry[:values] = {} unless entry.key?(:values)
      values = entry[:values]
      # Each FPKM is stored under its own sample. Previously, with :down, the
      # first sample's FPKM was filed under the second sample's name and the
      # first sample was never recorded.
      values[first_sample] = data[fpkm_first].to_f unless values.key?(first_sample)
      values[second_sample] = data[fpkm_second].to_f unless values.key?(second_sample)

      entry[:winner] << k_sample if entry[k_sample].size >= min_samples
    end #each line

    #example structure
    #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
  end #file.open

  file_lines = []
  dict.each do |diff_reference, dict_info|
    #TODO handle references without a winner / below the number of min samples
    next unless dict_info.key?(:winner)

    # FPKM values ordered by sample name.
    fpkms = dict_info[:values].sort.map { |sample| sample[1] }
    # Any truthy :z_scores enables z-scores (it already decided the "_z" file
    # suffix); only the literal true used to produce values.
    valz = if z_scores
             # average / standard_deviation are not core Array methods;
             # presumably provided by the gem's Enumerable extension -- confirm.
             average = fpkms.average
             stdev = fpkms.standard_deviation
             fpkms.map { |fpkm| (fpkm - average) / stdev }
           else
             fpkms
           end

    file_lines << yield(dict_info, diff_reference, gtf_kb, valz)
  end # dict_each

  file_name_output = File.join(File.dirname(diff), File.basename(diff, ".diff") + "-f#{fold_log2}_s#{min_samples}_fpkm#{min_fpkm}")
  file_name_output += "_z" if z_scores
  file_name_output += regulated.to_s
  file_name_output += ".txt"
  File.open(file_name_output, 'w') do |odiff|
    odiff.puts "sample\thumanized_id\t#{dict_samples.keys.sort.join("\t")}"
    file_lines.sort.each do |file_line|
      odiff.puts file_line
    end#each sorted line
  end#open
end #process_de
351
+ end
352
+
353
+ end #Diff
354
+
355
+
356
+ # cuffcompare v1.0.2 (2335)
357
+ # -----------------------------
358
+ # Usage:
359
+ # cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
360
+ # [-o <outprefix>] [-p <cprefix>]
361
+ # {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
362
+ #
363
+ # Cuffcompare provides classification, reference annotation mapping and various
364
+ # statistics for Cufflinks transfrags.
365
+ # Cuffcompare clusters and tracks transfrags across multiple samples, writing
366
+ # matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
367
+ # file <outprefix>.combined.gtf containing a nonredundant set of transcripts
368
+ # across all input files (with a single representative transfrag chosen
369
+ # for each clique of matching transfrags across samples).
370
+ #
371
+ # Options:
372
+ # -i provide a text file with a list of Cufflinks GTF files to process instead
373
+ # of expecting them as command line arguments (useful when a large number
374
+ # of GTF files should be processed)
375
+ #
376
+ # -r a set of known mRNAs to use as a reference for assessing
377
+ # the accuracy of mRNAs or gene models given in <input.gtf>
378
+ #
379
+ # -R for -r option, reduce the set of reference transcripts to
380
+ # only those found to overlap any of the input loci
381
+ # -M discard (ignore) single-exon transfrags and reference transcripts
382
+ # -N discard (ignore) single-exon reference transcripts
383
+ #
384
+ # -s <seq_path> can be a multi-fasta file with all the genomic sequences or
385
+ # a directory containing multiple single-fasta files (one file per contig);
386
+ # lower case bases will be used to classify input transcripts as repeats
387
+ #
388
+ # -d max distance (range) for grouping transcript start sites (100)
389
+ # -p the name prefix to use for consensus transcripts in the
390
+ # <outprefix>.combined.gtf file (default: 'TCONS')
391
+ # -C include the "contained" transcripts in the .combined.gtf file
392
+ # -G generic GFF input file(s) (do not assume Cufflinks GTF)
393
+ # -T do not generate .tmap and .refmap files for each input file
394
+ # -V verbose processing mode (showing all GFF parsing warnings)
395
+ class Compare
396
+ include Bio::Command::Wrapper
397
+
398
# Command-line description for the `cuffcompare` executable. Option names are
# wrapper-specific; use_aliases is declared so the aliases below, which are
# the real cuffcompare flags, identify each option. Descriptions are taken
# from the usage text above.
set_program Bio::Ngs::Utils.binary("cuffcompare")
use_aliases
add_option "outprefix", :type => :string, :aliases => '-o', :default => "Comparison", :desc => "Prefix for the output files"
add_option "gtf_combine_file", :type => :string, :aliases => '-i', :desc => "Text file with a list of Cufflinks GTF files to process"
add_option "gtf_reference", :type => :string, :aliases => '-r', :desc => "Set of known mRNAs to use as a reference"
add_option "only_overlap", :type => :boolean, :aliases => '-R', :desc => "With -r, reduce the reference transcripts to those overlapping any input locus"
add_option "discard_transfrags", :type => :boolean, :aliases => '-M', :desc => "Discard single-exon transfrags and reference transcripts"
add_option "discard_ref_transcripts", :type => :boolean, :aliases => '-N', :desc => "Discard single-exon reference transcripts"
add_option "multi_fasta", :type => :string, :aliases => '-s', :desc => "Multi-fasta file or directory of single-fasta files with the genomic sequences"
add_option "distance_tss", :type => :numeric, :aliases => '-d', :desc => "Max distance (range) for grouping transcript start sites (100)"
add_option "prefix_transcripts_consensus", :type => :string, :aliases => '-p', :desc => "Name prefix for consensus transcripts in the combined GTF (default: 'TCONS')"
add_option "contained", :type=>:boolean, :aliases => '-C', :desc => "Include the \"contained\" transcripts in the .combined.gtf file"
add_option "GFF", :type => :boolean, :aliases =>'-G', :desc => "Generic GFF input file(s) (do not assume Cufflinks GTF)"
add_option "no_map_files", :type => :boolean, :aliases =>'-T', :desc => "Do not generate .tmap and .refmap files for each input file"
413
+
414
+ class << self
415
+
416
# Path of the knowledge-base cache that belongs to a GTF file: the last
# extension is replaced by ".kb" ("x.combined.gtf" -> "x.combined.kb").
# A path without an extension gets ".kb" appended; it used to be returned
# unchanged, which made the cache path identical to the GTF path.
def kb_name(gtf)
  extension = /\.[a-zA-Z0-9]*\z/
  gtf =~ extension ? gtf.sub(extension, ".kb") : "#{gtf}.kb"
end
419
+
420
# True when the knowledge-base cache for +gtf+ (see kb_name) is on disk.
def exists_kb?(gtf)
  # File.exists? is no longer available in current Ruby; File.exist? is the
  # supported spelling.
  File.exist?(kb_name(gtf))
end
423
+
424
# Build, cache (Marshal dump at kb_name(gtf)) and return a hash of
# associations from a GTF file generated by CuffCompare.
#
# Entries, all keyed by Symbol:
#  gene_id       => {:gene_name, :transcripts => {transcript_id => {:odi, :nearest_ref}}}
#   example: :XLOC_000001=>{:gene_name=>:"RP11-304M2.1", :transcripts=>{:TCONS_00000001=>{:odi=>:ENST00000519787, :nearest_ref=>:ENST00000519787}}}
#  transcript_id => {:gene_id, :gene_name, :odi, :nearest_ref}
#  gene_name     => {:gene_id, :transcript_id, :odi, :nearest_ref}
#  oid           => {:gene_id, :transcript_id, :gene_name, :nearest_ref}
#  nearest_ref   => {:gene_id, :transcript_id, :odi, :gene_name}
# NOTE(review): the oId attribute is stored under the key :odi (sic). The
# spelling is kept because cached .kb files already use it.
# Later lines overwrite earlier entries that share a key (for instance when
# oid and nearest_ref are the same identifier).
# Note: exons and coordinates are not saved.
#
# Lines without gene_id or transcript_id are skipped; attributes missing from
# a line are stored as nil and not indexed (this used to raise NoMethodError).
# Returns nil, after a message on STDERR, when +gtf+ does not exist.
def build_compare_kb(gtf)
  unless File.exist?(gtf)
    STDERR.puts "File #{gtf} doesn't exist."
    return nil
  end

  # Value of one GTF attribute as a Symbol with the quotes removed, or nil
  # when the line does not carry that attribute.
  attribute = lambda do |line, name|
    found = line.match(/#{name} (.*?);/)
    found && found[1].delete('"').to_sym
  end

  dict = {} #build an hash with the combinations of data extracted from GTF file, XLOC, TCONS, ENST, SYMBOL
  File.open(gtf, 'r') do |f|
    f.each_line do |line|
      gene_id = attribute.call(line, "gene_id")
      transcript_id = attribute.call(line, "transcript_id")
      next unless gene_id && transcript_id

      gene_name = attribute.call(line, "gene_name")
      oid = attribute.call(line, "oId")
      nearest_ref = attribute.call(line, "nearest_ref")

      unless dict.key?(gene_id)
        dict[gene_id] = {:gene_name=>gene_name, :transcripts=>{}}
      end
      unless dict[gene_id][:transcripts].key?(transcript_id)
        dict[gene_id][:transcripts][transcript_id] = {:odi=>oid, :nearest_ref=>nearest_ref}
      end
      dict[transcript_id] = {:gene_id=>gene_id, :gene_name=>gene_name, :odi=>oid, :nearest_ref=>nearest_ref}
      dict[gene_name] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :nearest_ref=>nearest_ref} if gene_name
      dict[oid] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :gene_name=>gene_name, :nearest_ref=>nearest_ref} if oid
      dict[nearest_ref] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :gene_name=>gene_name} if nearest_ref
    end#lines
  end#file

  # Marshal output is binary, so the cache is written in binary mode.
  File.open(kb_name(gtf), 'wb') do |fkb|
    Marshal.dump(dict, fkb)
  end #fkb
  dict
end #build_compare_kb
471
+
472
# Return the hash of associations cached for +gtf+ by build_compare_kb
# (read from kb_name(gtf)). Keys and layout are those written by
# build_compare_kb: gene_id, transcript_id, gene_name, oid and nearest_ref
# entries, each pointing at the related identifiers.
#
# NOTE(review): Marshal.load must only be used on trusted files; the cache is
# expected to be one this library wrote itself.
#TODO rescue Exceptions
def load_compare_kb(gtf)
  # Marshal data is binary, so the cache is read in binary mode.
  File.open(kb_name(gtf), 'rb') do |kb_dump|
    Marshal.load(kb_dump)
  end
end #load_compare_kb
485
+ end
486
+ end #Compare
487
+ end #Cufflinks
488
+ end #Ngs
489
+ end #Bio