bio-ngs 0.3.2.alpha.01 → 0.4.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
@@ -20,41 +20,55 @@ module Bio
20
20
  end
21
21
 
22
22
 
23
- # cufflinks v1.0.2 (2335)
23
+ # cufflinks v1.3.0
24
24
  # linked against Boost version 104000
25
25
  # -----------------------------
26
26
  # Usage: cufflinks [options] <hits.sam>
27
- # Options:
28
- #
27
+ # General Options:
28
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
29
29
  # -p/--num-threads number of threads used during analysis [ default: 1 ]
30
- # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
30
+ # --seed value of random number generator seed [ default: 0 ]
31
31
  # -G/--GTF quantitate against reference transcript annotations
32
- # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.15 ]
33
- # -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
34
- # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
35
- # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
36
- # -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
32
+ # -g/--GTF-guide use reference transcript annotation to guide assembly
37
33
  # -M/--mask-file ignore all alignment within transcripts in this file
38
- # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
39
- # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
40
- # -o/--output-dir write all output files to this directory [ default: ./ ]
41
- # -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
34
+ # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
35
+ # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
36
+ # --library-type library prep used for input reads [ default: below ]
42
37
  #
43
- # Advanced Options:
38
+ # Advanced Abundance Estimation Options:
39
+ # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
40
+ # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
41
+ # --upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
42
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
43
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
44
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: FALSE ]
45
+ # --total-hits-norm count all hits for normalization [ default: TRUE ]
44
46
  #
45
- # -N/--quartile-normalization use quartile normalization instead of total counts [ default: FALSE ]
46
- # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.01 ]
47
- # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.12 ]
48
- # -m/--frag-len-mean the average fragment length [ default: 200 ]
49
- # -s/--frag-len-std-dev the fragment length standard deviation [ default: 80 ]
47
+ # Advanced Assembly Options:
48
+ # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
49
+ # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.10 ]
50
+ # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
51
+ # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
52
+ # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.001 ]
53
+ # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.09 ]
50
54
  # --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
51
55
  # --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
52
- # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
53
- # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
54
- # --library-type Library prep used for input reads [ default: below ]
55
56
  # --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
56
- # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
57
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
57
58
  # --min-intron-length minimum intron size allowed in genome [ default: 50 ]
59
+ # --trim-3-avgcov-thresh minimum avg coverage required to attempt 3' trimming [ default: 10 ]
60
+ # --trim-3-dropoff-frac fraction of avg coverage below which to trim 3' end [ default: 0.1 ]
61
+ #
62
+ # Advanced Reference Annotation Guided Assembly Options:
63
+ # --no-faux-reads disable tiling by faux reads [ default: FALSE ]
64
+ # --3-overhang-tolerance overhang allowed on 3' end when merging with reference[ default: 600 ]
65
+ # --intron-overhang-tolerance overhang allowed inside reference intron when merging [ default: 30 ]
66
+ #
67
+ # Advanced Program Behavior Options:
68
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
69
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
70
+ # --no-update-check do not contact server to check for update availability[ default: FALSE ]
71
+ #
58
72
  # Supported library types:
59
73
  # ff-firststrand
60
74
  # ff-secondstrand
@@ -62,48 +76,69 @@ module Bio
62
76
  # fr-firststrand
63
77
  # fr-secondstrand
64
78
  # fr-unstranded (default)
65
- # transfrags
79
+ # transfrags
66
80
  class Quantification
67
81
 
68
82
  include Bio::Command::Wrapper
83
+ include Bio::Ngs::Cufflinks::Utils
69
84
 
70
85
  set_program Bio::Ngs::Utils.binary("cufflinks")
71
86
 
87
+ add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
72
88
  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
73
- add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
89
+ add_option "seed", :type => :numeric
74
90
  add_option "GTF", :type => :string, :aliases => '-G'
91
+ add_option "GTF-guide", :type => :boolean, :aliases => '-g'
92
+ add_option "mask-file", :type => :string, :aliases => '-M'
93
+ add_option "frag-bias-correct", :type => :string, :aliases => '-b'
94
+ add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
95
+ add_option "library-type", :type => :string
96
+ add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
97
+ add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
98
+ add_option "upper-quartile-norm", :type => :boolean
99
+ add_option "max-mle-iterations", :type => :numeric#, :default => 5000
100
+ add_option "num-importance-samples", :type => :numeric#, :default => 1000
101
+ add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
102
+ add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
103
+ add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
75
104
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
76
- add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
77
105
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
106
+ #deprecated add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
78
107
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
79
- add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
80
- add_option "mask-file", :type => :string, :aliases => '-M'
81
- add_option "verbose", :type => :boolean, :aliases => '-v'
82
- add_option "quiet", :type => :boolean, :aliases => '-q'
83
- add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
84
- add_option "reference-seq", :type => :string, :aliases => '-r'
85
- add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
86
108
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
87
109
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
88
- #TODO Check why with these defaults is not working properly
89
- add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
90
- add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
91
110
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
92
111
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
93
- add_option "num-importance-samples", :type => :numeric#, :default => 1000
94
- add_option "max-mle-iterations", :type => :numeric#, :default => 5000
95
- add_option "library-type", :type => :string
96
112
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
97
113
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
98
114
  add_option "min-intron-length", :type => :numeric#, :default => 50
115
+ add_option "trim-3-avgcov-thresh", :type => :numeric
116
+ add_option "trim-3-dropoff-frac", :type => :numeric
117
+ add_option "no-faux-reads", :type => :boolean
118
+ add_option "3-overhang-tolerance", :type => :numeric
119
+ add_option "intron-overhang-tolerance", :type => :numeric
120
+ add_option "verbose", :type => :boolean, :aliases => '-v'
121
+ add_option "quiet", :type => :boolean, :aliases => '-q'
122
+
123
+ #deprecated add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
124
+ #deprecated add_option "reference-seq", :type => :string, :aliases => '-r'
125
+ #deprecated add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
126
+
127
+ #TODO Check why with these defaults is not working properly
128
+
129
+
130
+
131
+ add_iterator_for :genes
132
+ add_iterator_for :isoforms
99
133
  end #Quantification
100
134
 
101
- # cuffdiff v1.0.2 (2336)
135
+ # cuffdiff v1.3.0 (3022)
102
136
  # -----------------------------
103
137
  # Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
104
138
  # Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
105
139
  # General Options:
106
140
  # -o/--output-dir write all output files to this directory [ default: ./ ]
141
+ # --seed value of random number generator seed [ default: 0 ]
107
142
  # -T/--time-series treat samples as a time-series [ default: FALSE ]
108
143
  # -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
109
144
  # --FDR False discovery rate used in testing [ default: 0.05 ]
@@ -119,14 +154,23 @@ module Bio
119
154
  # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
120
155
  # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
121
156
  # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
157
+ # --num-bootstrap-samples Number of bootstrap replications [ default: 20 ]
158
+ # --bootstrap-fraction Fraction of fragments in each bootstrap sample [ default: 1.0 ]
122
159
  # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
123
- # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
160
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
124
161
  # --total-hits-norm count all hits for normalization [ default: FALSE ]
125
162
  # --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
126
163
  # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
127
164
  # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
128
165
  # --no-update-check do not contact server to check for update availability[ default: FALSE ]
129
166
  # --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
167
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
168
+ #
169
+ # Debugging use only:
170
+ # --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]
171
+ # --no-read-pairs Break all read pairs [ default: FALSE ]
172
+ # --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]
173
+ # --cov-delta Maximum gap between bootstrap and IS [ default: 2.0 ]
130
174
  #
131
175
  # Supported library types:
132
176
  # ff-firststrand
@@ -138,10 +182,12 @@ module Bio
138
182
  # transfrags
139
183
  class Diff
140
184
  include Bio::Command::Wrapper
185
+ include Bio::Ngs::Cufflinks::Utils
141
186
 
142
187
  set_program Bio::Ngs::Utils.binary("cuffdiff")
143
188
 
144
189
  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
190
+ add_option "seed", :type => :numeric
145
191
  add_option "time-series", :type => :boolean, :aliases => '-T'
146
192
  add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
147
193
  add_option "FDR", :type => :numeric, :aliases => '-F'
@@ -155,6 +201,8 @@ module Bio
155
201
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
156
202
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
157
203
  add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
204
+ add_option "num-bootstrap-samples", :type => :numeric
205
+ add_option "bootstrap-fraction", :type => :numeric
158
206
  add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
159
207
  add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
160
208
  add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
@@ -163,13 +211,24 @@ module Bio
163
211
  add_option "quiet", :type => :boolean, :aliases => '-q'
164
212
  add_option "no-update-check", :type => :boolean, :aliases => '-j'
165
213
  add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
214
+ add_option "max-bundle-frags", :type => :numeric
215
+ add_option "read-skip-fraction", :type => :numeric
216
+ add_option "no-read-pairs", :type => :numeric
217
+ add_option "trim-read-length", :type => :numeric
218
+ add_option "cov-delta", :type => :numeric
219
+
220
+ #define iterators
221
+ add_iterator_for :genes
222
+ add_iterator_for :isoforms
223
+ add_iterator_for :cds
224
+ add_iterator_for :tss_groups
166
225
 
167
226
  #Examples
168
227
  #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
169
228
  #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
170
229
 
171
230
  class << self
172
-
231
+
173
232
  #Return the version of CuffDiff used to produce the output
174
233
  def version(diff)
175
234
  #cufflink_version_offset = Bio::Ngs::Cufflinks.version
@@ -193,7 +252,7 @@ module Bio
193
252
  1
194
253
  end
195
254
  end
196
-
255
+
197
256
  #write a file with the information
198
257
  #See process_de for options available
199
258
  # Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
@@ -212,7 +271,11 @@ module Bio
212
271
  # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
213
272
  def genes(diff, gtf, options={})
214
273
  process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
215
- "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
274
+ # puts diff_reference
275
+ # puts fpkm_values
276
+ # "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
277
+ #do not use the gtf kb
278
+ "#{dict_info[:winner].first}\t#{dict_info[:gene_name]}\t#{fpkm_values.join("\t")}"
216
279
  end
217
280
  end #genes
218
281
 
@@ -220,7 +283,9 @@ module Bio
220
283
  #Options hash
221
284
  # :fold(float), :min_samples(integer), :min_fpkm(float), :only_significative(boolean, false) , :z_score(boolean, false)
222
285
  # :regulated(symbol :up or :down default :up)
286
+ # :fpkm_log_two (:true :false, default :true)
223
287
  def process_de(diff, gtf, options={})
288
+ #init default options
224
289
  fold = options[:fold] || 0.0
225
290
  min_samples = options[:min_samples] || 0
226
291
  min_fpkm = options[:min_fpkm] || 0.0
@@ -228,12 +293,15 @@ module Bio
228
293
  z_scores = options[:z_scores] || false
229
294
  #TODO improve check on parameters
230
295
  regulated =options[:regulated] || :up
296
+ fpkm_log_two = options[:fpkm_log_two] || true
297
+ force_not_significative = options[:force_not_significative] || false
231
298
 
232
- gtf_kb = Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
299
+ #set up the kb if not available = pass an option with the path of the kb ?
300
+ gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
233
301
 
234
302
  #convert log2 fold value into natural log value (internally computed by cuffdiff)
235
303
  fold_log2 = fold
236
- fold = fold==0 ? 0.0 : (fold*Math.log(2))
304
+ (fold = fold==0 ? 0.0 : (fold*Math.log(2))) unless fpkm_log_two
237
305
 
238
306
  dict=Hash.new {|h, k| h[k]=Hash.new{|hh,kk| hh[kk]=[]}; }
239
307
  dict_samples = Hash.new{|h,k| h[k]=""}
@@ -243,78 +311,105 @@ module Bio
243
311
 
244
312
  File.open(diff,'r') do |f|
245
313
  header=f.readline #skip header
314
+
315
+ test_id_idx = 0
316
+ gene_name_idx = 2
317
+ q_first_idx = 3 + cufflink_version_offset
318
+ q_second_idx = 4 + cufflink_version_offset
319
+ fpkm_first_idx = 6 + cufflink_version_offset
320
+ fpkm_second_idx = 7 + cufflink_version_offset
321
+ fold_idx = 8 + cufflink_version_offset
322
+ significant_idx = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
323
+
324
+ #Comments:
325
+ # for each line of the diff I have to save the expression value of each test,
326
+ # i.e. the fpkm and whether it is significant or not
246
327
 
247
- q_first = 3 + cufflink_version_offset
248
- q_second = 4 + cufflink_version_offset
249
- fpkm_first = 6 + cufflink_version_offset
250
- fpkm_second = 7 + cufflink_version_offset
251
- fold_position = 8 + cufflink_version_offset
252
- significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
253
328
  f.each_line do |line|
254
329
  data=line.split
255
- if data[fold_position].to_f<=0
256
- data[fold_position]=data[fold_position].sub(/-/,"")
330
+
331
+ #fix comparison t-test, remove negative symbol and invert comparison: if fold change q1 vs q2 <0 abs(foldchange) & swap q1,q2
332
+ # puts data[fold_idx].to_f
333
+ #delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
334
+ if data[fold_idx].to_f<0
335
+ data[fold_idx]=data[fold_idx][1..-1] #.sub(/-/,"") remove the minus symbol from the number, the values q1, q2 and their fpkm will be reorganized into the data structure
257
336
  else
258
- a=data[fpkm_second]
259
- data[fpkm_second]=data[fpkm_first]
260
- data[fpkm_first]=a
261
- a=data[q_second]
262
- data[q_second]=data[q_first]
263
- data[q_first]=a
337
+ # puts "ciao"
338
+ data[fpkm_first_idx],data[fpkm_second_idx]=data[fpkm_second_idx],data[fpkm_first_idx]
339
+ data[q_first_idx],data[q_second_idx]=data[q_second_idx],data[q_first_idx]
340
+ #delete puts "#{q_first_idx},#{q_second_idx}"
264
341
  end
342
+ #delete puts "#{q_first_idx},#{q_second_idx}"
343
+ #delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
344
+ #delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
345
+
346
+
265
347
  #0 TCONS
266
348
  #4 name sample is the max diff for the item
267
349
  #5 name sample is the less diff for the item
268
350
  #9 is the fold
269
- dict_samples[data[q_first]]
270
- dict_samples[data[q_second]]
351
+ dict_samples[data[q_first_idx]]
352
+ dict_samples[data[q_second_idx]]
271
353
 
272
354
  #7 is the fpkm value of max pop/sample
273
355
  #8 is the fpkm value of min pop/sample
274
- if ((only_significative==true && data[significant_position]=="yes") || (data[significant_position]=="yes" && data[fold_position].to_f>=fold)) && data[fpkm_first].to_f>=min_fpkm && data[fpkm_second].to_f>=min_fpkm
275
- k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
356
+ k_reference = data[test_id_idx].to_sym #This can be TCONS if isoforms or XLOC if genes
357
+
358
+ unless dict[k_reference].key?(:values)
359
+ dict[k_reference][:values]={}
360
+ dict[k_reference][:gene_name]=data[gene_name_idx]
361
+ end
362
+ dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
363
+ dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
364
+
365
+ if ((only_significative==true && data[significant_idx]=="yes") || ((data[significant_idx]=="yes"||force_not_significative) && data[fold_idx].to_f>=fold)) && data[fpkm_first_idx].to_f>=min_fpkm && data[fpkm_second_idx].to_f>=min_fpkm
276
366
 
277
367
  ###### puts data.join(" ") if k_reference == :XLOC_017497
278
368
  #TODO refactor: this can be done using lambda
279
- k_sample = case regulated
280
- when :up
281
- k_sample = data[q_first].to_sym
282
- dict[k_reference][k_sample]<<data[q_second].to_sym
369
+ k_sample = ""
370
+ if regulated==:up
371
+
372
+ k_sample = data[q_first_idx].to_sym
373
+ #delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
374
+ dict[k_reference][k_sample]<<data[q_second_idx].to_sym
375
+ #delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
283
376
  k_sample
284
- when :down
285
- k_sample = data[q_second].to_sym
286
- dict[k_reference][k_sample]<<data[q_first].to_sym
377
+ elsif regulated==:down
378
+ k_sample = data[q_second_idx].to_sym
379
+ dict[k_reference][k_sample]<<data[q_first_idx].to_sym
287
380
  k_sample
288
381
  end
289
382
 
290
- # puts dict[k_reference].inspect if k_reference == :XLOC_017497
291
-
292
- unless dict[k_reference].key?(:values)
293
- dict[k_reference][:values]={}
294
- end
383
+ #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
384
+ #delete puts dict.inspect
295
385
  #store fpkm values as well for each pop/sample it should be
296
- dict[k_reference][:values][k_sample]=data[fpkm_first].to_f unless dict[k_reference][:values].key?(k_sample)
297
- dict[k_reference][:values][data[q_second].to_sym]=data[fpkm_second].to_f unless dict[k_reference][:values].key?(data[q_second].to_sym)
298
386
  if dict[k_reference][k_sample].size >= min_samples
299
- dict[k_reference][:winner] << k_sample
387
+ (dict[k_reference][:winner] << k_sample).uniq!
300
388
  end
301
- # puts dict[k_reference].inspect if k_reference == :XLOC_017497
389
+ #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
302
390
  else
303
- #TODO add threshold value below min fpkm
304
- #dict[k_reference][:values][k_sample]=data[6].to_f
305
- #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
391
+ # k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
392
+ #
393
+ # unless dict[k_reference].key?(:values)
394
+ # dict[k_reference][:values]={}
395
+ # end
396
+ # #TODO add threshold value below min fpkm
397
+ # dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
398
+ # dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
399
+ # #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
306
400
  end
401
+ #delete puts dict[k_reference].inspect
402
+
307
403
  end #each line
308
-
309
404
  #example structure
310
405
  #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
311
406
  end #file.open
312
407
 
313
-
314
408
  file_lines =[]
315
409
  dict.each do |diff_reference, dict_info|
316
410
 
317
411
  if dict_info.key?(:winner)
412
+ #puts dict_info.inspect
318
413
 
319
414
  #BAD PERFORMANCES use lambda
320
415
  valz = case z_scores
@@ -55,7 +55,7 @@ module Bio
55
55
  # If [-o] is specified, report will be printed to STDOUT.
56
56
  # If [-o] is not specified (and output goes to STDOUT),
57
57
  # report will be printed to STDERR.
58
- class Trim
58
+ class QualityTrim
59
59
  include Bio::Command::Wrapper
60
60
  set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
61
61
  use_aliases
@@ -72,6 +72,26 @@ module Bio
72
72
  report will be printed to STDERR."
73
73
  add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
74
74
  end #QualityTrim
75
+
76
+ # [-f N] = First base to keep. Default is 1 (=first base).
77
+ # [-l N] = Last base to keep. Default is entire read.
78
+ # [-t N] = Trim N nucleotides from the end of the read.
79
+ # '-t' can not be used with '-l' and '-f'.
80
+ # [-z] = Compress output with GZIP.
81
+ # [-i INFILE] = FASTA/Q input file. default is STDIN.
82
+ # [-o OUTFILE] = FASTA/Q output file. default is STDOUT.
83
+ class Trim
84
+ include Bio::Command::Wrapper
85
+ set_program Bio::Ngs::Utils.binary("fastx_trimmer")
86
+ use_aliases
87
+ add_option :first_base, :type => :numeric, :aliases => "-f", :desc => "First base to keep"
88
+ add_option :last_base, :type => :numeric, :aliases => "-l", :desc => "Last base to keep"
89
+ add_option :compress, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP"
90
+ add_option :input, :type => :string, :aliases => "-i", :desc => "Input FASTA/Q file", :collapse => true
91
+ add_option :output, :type => :string, :aliases => "-o", :desc => "Output FASTA/Q file", :collapse => true
92
+ add_option :trim, :type => :numeric, :aliases => "-t", :desc => "Trim N nucleotides from the end of the read"
93
+ add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
94
+ end
75
95
 
76
96
  # Solexa-Quality BoxPlot plotter
77
97
  # Generates a solexa quality score box-plot graph
@@ -90,6 +110,7 @@ module Bio
90
110
  add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
91
111
  add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
92
112
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
113
+ add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
93
114
  end #ReadsBoxPlot
94
115
 
95
116
  # Solexa-Reads coverage plotter
@@ -109,6 +130,7 @@ module Bio
109
130
  add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
110
131
  add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
111
132
  add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
133
+ add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
112
134
  end #ReadsCoverage
113
135
 
114
136
 
@@ -163,6 +185,7 @@ module Bio
163
185
  add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file.", :collapse=>true
164
186
  add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file.", :collapse=>true
165
187
  add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
188
+ add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
166
189
  end #ReadsCoverage
167
190
 
168
191
  end #Fastx
@@ -27,7 +27,11 @@
27
27
  # -i/--min-intron-length <int> [ default: 50 ]
28
28
  # -I/--max-intron-length <int> [ default: 500000 ]
29
29
  # -g/--max-multihits <int> [ default: 20 ]
30
- # -F/--min-isoform-fraction <float> [ default: 0.15 ]
30
+ # -x/--transcriptome-max-hits <int> [ default: 60 ]
31
+ # -n/--transcriptome-mismatches <int> [ default: 1 ]
32
+ # -M/--prefilter-multihits ( for -G/--GTF option, enable
33
+ # an initial bowtie search
34
+ # against the genome )
31
35
  # --max-insertion-length <int> [ default: 3 ]
32
36
  # --max-deletion-length <int> [ default: 3 ]
33
37
  # --solexa-quals
@@ -40,7 +44,9 @@
40
44
  # --library-type <string> (fr-unstranded, fr-firststrand,
41
45
  # fr-secondstrand)
42
46
  # -p/--num-threads <int> [ default: 1 ]
43
- # -G/--GTF <filename>
47
+ # -G/--GTF <filename> (GTF/GFF with known transcripts)
48
+ # --transcriptome-index <bwtidx> (transcriptome bowtie index)
49
+ # -T/--transcriptome-only (map only to the transcriptome)
44
50
  # -j/--raw-juncs <filename>
45
51
  # --insertions <filename>
46
52
  # --deletions <filename>
@@ -59,10 +65,11 @@
59
65
  # --keep-tmp
60
66
  # --tmp-dir <dirname> [ default: <output_dir>/tmp ]
61
67
  # -z/--zpacker <program> [ default: gzip ]
62
- # -X/--unmapped-fifo [ use mkfifo to compress more temporary files]
68
+ # -X/--unmapped-fifo ( use mkfifo to compress
69
+ # more temporary files )
63
70
  #
64
71
  # Advanced Options:
65
- # --initial-read-mismatches <int> [ default: 2 ]
72
+ # -N/--initial-read-mismatches <int> [ default: 2 ]
66
73
  # --segment-mismatches <int> [ default: 2 ]
67
74
  # --segment-length <int> [ default: 25 ]
68
75
  # --bowtie-n [ default: bowtie -v ]
@@ -73,10 +80,10 @@
73
80
  # --max-coverage-intron <int> [ default: 20000 ]
74
81
  # --min-segment-intron <int> [ default: 50 ]
75
82
  # --max-segment-intron <int> [ default: 500000 ]
76
- # --no-sort-bam [Output BAM is not coordinate-sorted]
77
- # --no-convert-bam [Do not convert to bam format.
83
+ # --no-sort-bam (Output BAM is not coordinate-sorted)
84
+ # --no-convert-bam (Do not convert to bam format.
78
85
  # Output is <output_dir>accepted_hit.sam.
79
- # Implies --no-sort-bam.]
86
+ # Implies --no-sort-bam)
80
87
  #
81
88
  # SAM Header Options (for embedding sequencing run metadata in output):
82
89
  # --rg-id <string> (read group ID)
@@ -105,7 +112,9 @@ module Bio
105
112
  add_option "min-intron-length", :type => :numeric , :aliases => '-i'
106
113
  add_option "max-intron-length", :type => :numeric, :aliases => '-I'
107
114
  add_option "max-multihits", :type => :numeric, :aliases => '-g'
108
- add_option "min-isoform_fraction", :type => :numeric, :aliases => '-F'
115
+ add_option "transcriptome-max-hits", :type => :numeric, :aliases =>'-x'
116
+ add_option "transcriptome-mismatches", :type => :numeric, :aliases =>'-n'
117
+ add_option "prefilter-multihits", :type => :boolean, :aliases =>'-M'
109
118
  add_option "max-insertion-length", :type => :numeric
110
119
  add_option "max-deletion-length", :type => :numeric
111
120
  add_option "solexa-quals", :type => :boolean
@@ -116,6 +125,8 @@ module Bio
116
125
  add_option "library-type", :type => :string
117
126
  add_option "num-threads", :type => :numeric, :aliases => '-p'
118
127
  add_option "GTF", :type => :string, :aliases => '-G'
128
+ add_option "transcriptome-index", :type => :string
129
+ add_option "transcriptome-only", :type => :boolean
119
130
  add_option "raw-juncs", :type => :string, :aliases => '-j'
120
131
  add_option :insertions, :type => :string
121
132
  add_option :deletions, :type => :string
@@ -135,6 +146,9 @@ module Bio
135
146
  add_option "no-butterfly-search", :type => :boolean
136
147
  add_option "keep-tmp", :type => :boolean
137
148
  add_option "tmp-dir", :type => :string
149
+ add_option "zpacker", :type => :string, :aliases => '-z'
150
+ add_option "unmapped-fifo", :type => :boolean, :aliases => '-X'
151
+ add_option "initial-read-mismatches", :type => :int, :aliases => '-N'
138
152
  add_option "segment-mismatches", :type => :numeric
139
153
  add_option "segment-length", :type => :numeric
140
154
  add_option "min-closure-exon", :type => :numeric
@@ -144,6 +158,8 @@ module Bio
144
158
  add_option "max-coverage-intron", :type => :numeric
145
159
  add_option "min-segment-intron", :type => :numeric
146
160
  add_option "max-segment-intron", :type => :numeric
161
+ add_option "no-sort-bam", :type => :boolean
162
+ add_option "no-convert-bam", :type => :boolean
147
163
  add_option "rg-id", :type => :string
148
164
  add_option "rg-sample", :type => :string
149
165
  add_option "rg-library", :type => :string
@@ -15,16 +15,16 @@ common:
15
15
  type: source
16
16
  linux:
17
17
  cufflinks:
18
- version: 1.1.0
19
- url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.Linux_x86_64.tar.gz
20
- basename: cufflinks-1.1.0.Linux_x86_64
18
+ version: 1.3.0
19
+ url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.Linux_x86_64.tar.gz
20
+ basename: cufflinks-1.3.0.Linux_x86_64
21
21
  suffix: tar.gz
22
22
  desc: ""
23
23
  type: binary
24
24
  tophat:
25
- version: 1.3.2
26
- url: http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.Linux_x86_64.tar.gz
27
- basename: tophat-1.3.2.Linux_x86_64
25
+ version: 1.4.1
26
+ url: http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.Linux_x86_64.tar.gz
27
+ basename: tophat-1.4.1.Linux_x86_64
28
28
  suffix: tar.gz
29
29
  desc: ""
30
30
  type: binary
@@ -44,16 +44,16 @@ linux:
44
44
  # type: binary
45
45
  osx:
46
46
  cufflinks:
47
- version: 1.1.0
48
- url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.OSX_x86_64.tar.gz
49
- basename: cufflinks-1.1.0.OSX_x86_64
47
+ version: 1.3.0
48
+ url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.OSX_x86_64.tar.gz
49
+ basename: cufflinks-1.3.0.OSX_x86_64
50
50
  suffix: tar.gz
51
51
  desc: ""
52
52
  type: binary
53
53
  tophat:
54
- version: 1.3.2
55
- url: http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.OSX_x86_64.tar.gz
56
- basename: tophat-1.3.2.OSX_x86_64
54
+ version: 1.4.1
55
+ url: http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.OSX_x86_64.tar.gz
56
+ basename: tophat-1.4.1.OSX_x86_64
57
57
  suffix: tar.gz
58
58
  desc: ""
59
59
  type: binary