bio-ngs 0.3.2.alpha.01 → 0.4.2.alpha.01
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -3
- data/Gemfile.lock +36 -30
- data/README.rdoc +33 -0
- data/VERSION +1 -1
- data/bio-ngs.gemspec +30 -22
- data/ext/mkrf_conf.rb +0 -2
- data/lib/bio/appl/ngs/cufflinks/iterators.rb +35 -0
- data/lib/bio/appl/ngs/cufflinks.rb +180 -85
- data/lib/bio/appl/ngs/fastx.rb +24 -1
- data/lib/bio/appl/ngs/tophat.rb +24 -8
- data/lib/bio/ngs/ext/versions.yaml +12 -12
- data/lib/bio/ngs/utils.rb +11 -1
- data/lib/bio-ngs.rb +1 -0
- data/lib/tasks/convert.thor +16 -0
- data/lib/tasks/pre.thor +130 -0
- data/lib/tasks/quality.thor +3 -4
- data/lib/tasks/rna.thor +2 -1
- metadata +90 -66
@@ -20,41 +20,55 @@ module Bio
|
|
20
20
|
end
|
21
21
|
|
22
22
|
|
23
|
-
# cufflinks v1.0
|
23
|
+
# cufflinks v1.3.0
|
24
24
|
# linked against Boost version 104000
|
25
25
|
# -----------------------------
|
26
26
|
# Usage: cufflinks [options] <hits.sam>
|
27
|
-
# Options:
|
28
|
-
#
|
27
|
+
# General Options:
|
28
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
29
29
|
# -p/--num-threads number of threads used during analysis [ default: 1 ]
|
30
|
-
#
|
30
|
+
# --seed value of random number generator seed [ default: 0 ]
|
31
31
|
# -G/--GTF quantitate against reference transcript annotations
|
32
|
-
# -
|
33
|
-
# -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
|
34
|
-
# -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
|
35
|
-
# -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
|
36
|
-
# -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
|
32
|
+
# -g/--GTF-guide use reference transcript annotation to guide assembly
|
37
33
|
# -M/--mask-file ignore all alignment within transcripts in this file
|
38
|
-
# -
|
39
|
-
# -
|
40
|
-
# -
|
41
|
-
# -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
|
34
|
+
# -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
|
35
|
+
# -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
|
36
|
+
# --library-type library prep used for input reads [ default: below ]
|
42
37
|
#
|
43
|
-
# Advanced Options:
|
38
|
+
# Advanced Abundance Estimation Options:
|
39
|
+
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
40
|
+
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
41
|
+
# --upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
|
42
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
43
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
44
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: FALSE ]
|
45
|
+
# --total-hits-norm count all hits for normalization [ default: TRUE ]
|
44
46
|
#
|
45
|
-
#
|
46
|
-
# -
|
47
|
-
# -
|
48
|
-
# -
|
49
|
-
# -
|
47
|
+
# Advanced Assembly Options:
|
48
|
+
# -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
|
49
|
+
# -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.10 ]
|
50
|
+
# -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
|
51
|
+
# -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
|
52
|
+
# -a/--junc-alpha alpha for junction binomial test filter [ default: 0.001 ]
|
53
|
+
# -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.09 ]
|
50
54
|
# --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
|
51
55
|
# --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
|
52
|
-
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
53
|
-
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
54
|
-
# --library-type Library prep used for input reads [ default: below ]
|
55
56
|
# --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
|
56
|
-
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
57
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
57
58
|
# --min-intron-length minimum intron size allowed in genome [ default: 50 ]
|
59
|
+
# --trim-3-avgcov-thresh minimum avg coverage required to attempt 3' trimming [ default: 10 ]
|
60
|
+
# --trim-3-dropoff-frac fraction of avg coverage below which to trim 3' end [ default: 0.1 ]
|
61
|
+
#
|
62
|
+
# Advanced Reference Annotation Guided Assembly Options:
|
63
|
+
# --no-faux-reads disable tiling by faux reads [ default: FALSE ]
|
64
|
+
# --3-overhang-tolerance overhang allowed on 3' end when merging with reference[ default: 600 ]
|
65
|
+
# --intron-overhang-tolerance overhang allowed inside reference intron when merging [ default: 30 ]
|
66
|
+
#
|
67
|
+
# Advanced Program Behavior Options:
|
68
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
69
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
70
|
+
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
71
|
+
#
|
58
72
|
# Supported library types:
|
59
73
|
# ff-firststrand
|
60
74
|
# ff-secondstrand
|
@@ -62,48 +76,69 @@ module Bio
|
|
62
76
|
# fr-firststrand
|
63
77
|
# fr-secondstrand
|
64
78
|
# fr-unstranded (default)
|
65
|
-
# transfrags
|
79
|
+
# transfrags
|
66
80
|
class Quantification
|
67
81
|
|
68
82
|
include Bio::Command::Wrapper
|
83
|
+
include Bio::Ngs::Cufflinks::Utils
|
69
84
|
|
70
85
|
set_program Bio::Ngs::Utils.binary("cufflinks")
|
71
86
|
|
87
|
+
add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
|
72
88
|
add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
|
73
|
-
add_option "
|
89
|
+
add_option "seed", :type => :numeric
|
74
90
|
add_option "GTF", :type => :string, :aliases => '-G'
|
91
|
+
add_option "GTF-guide", :type => :boolean, :aliases => '-g'
|
92
|
+
add_option "mask-file", :type => :string, :aliases => '-M'
|
93
|
+
add_option "frag-bias-correct", :type => :string, :aliases => '-b'
|
94
|
+
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
|
95
|
+
add_option "library-type", :type => :string
|
96
|
+
add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
|
97
|
+
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
|
98
|
+
add_option "upper-quartile-norm", :type => :boolean
|
99
|
+
add_option "max-mle-iterations", :type => :numeric#, :default => 5000
|
100
|
+
add_option "num-importance-samples", :type => :numeric#, :default => 1000
|
101
|
+
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
|
102
|
+
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
|
103
|
+
add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
|
75
104
|
add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
|
76
|
-
add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
|
77
105
|
add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
|
106
|
+
#deprecated add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
|
78
107
|
add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
|
79
|
-
add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
|
80
|
-
add_option "mask-file", :type => :string, :aliases => '-M'
|
81
|
-
add_option "verbose", :type => :boolean, :aliases => '-v'
|
82
|
-
add_option "quiet", :type => :boolean, :aliases => '-q'
|
83
|
-
add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
|
84
|
-
add_option "reference-seq", :type => :string, :aliases => '-r'
|
85
|
-
add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
|
86
108
|
add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
|
87
109
|
add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
|
88
|
-
#TODO Check why with these defaults is not working properly
|
89
|
-
add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
|
90
|
-
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
|
91
110
|
add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
|
92
111
|
add_option "overhang-tolerance", :type => :numeric#, :default => 8
|
93
|
-
add_option "num-importance-samples", :type => :numeric#, :default => 1000
|
94
|
-
add_option "max-mle-iterations", :type => :numeric#, :default => 5000
|
95
|
-
add_option "library-type", :type => :string
|
96
112
|
add_option "max-bundle-length", :type => :numeric #, :default => 3500000
|
97
113
|
add_option "max-bundle-frags", :type => :numeric #, :default => 500000
|
98
114
|
add_option "min-intron-length", :type => :numeric#, :default => 50
|
115
|
+
add_option "trim-3-avgcov-thresh", :type => :numeric
|
116
|
+
add_option "trim-3-dropoff-frac", :type => :numeric
|
117
|
+
add_option "no-faux-reads", :type => :boolean
|
118
|
+
add_option "3-overhang-tolerance", :type => :numeric
|
119
|
+
add_option "intron-overhang-tolerance", :type => :numeric
|
120
|
+
add_option "verbose", :type => :boolean, :aliases => '-v'
|
121
|
+
add_option "quiet", :type => :boolean, :aliases => '-q'
|
122
|
+
|
123
|
+
#deprecated add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
|
124
|
+
#deprecated add_option "reference-seq", :type => :string, :aliases => '-r'
|
125
|
+
#deprecated add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
|
126
|
+
|
127
|
+
#TODO Check why with these defaults is not working properly
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
add_iterator_for :genes
|
132
|
+
add_iterator_for :isoforms
|
99
133
|
end #Quantification
|
100
134
|
|
101
|
-
# cuffdiff v1.0
|
135
|
+
# cuffdiff v1.3.0 (3022)
|
102
136
|
# -----------------------------
|
103
137
|
# Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
|
104
138
|
# Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
|
105
139
|
# General Options:
|
106
140
|
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
141
|
+
# --seed value of random number generator seed [ default: 0 ]
|
107
142
|
# -T/--time-series treat samples as a time-series [ default: FALSE ]
|
108
143
|
# -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
|
109
144
|
# --FDR False discovery rate used in testing [ default: 0.05 ]
|
@@ -119,14 +154,23 @@ module Bio
|
|
119
154
|
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
120
155
|
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
121
156
|
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
157
|
+
# --num-bootstrap-samples Number of bootstrap replications [ default: 20 ]
|
158
|
+
# --bootstrap-fraction Fraction of fragments in each bootstrap sample [ default: 1.0 ]
|
122
159
|
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
123
|
-
# --compatible-hits-norm count hits compatible with reference RNAs only [ default:
|
160
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
|
124
161
|
# --total-hits-norm count all hits for normalization [ default: FALSE ]
|
125
162
|
# --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
|
126
163
|
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
127
164
|
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
128
165
|
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
129
166
|
# --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
|
167
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
168
|
+
#
|
169
|
+
# Debugging use only:
|
170
|
+
# --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]
|
171
|
+
# --no-read-pairs Break all read pairs [ default: FALSE ]
|
172
|
+
# --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]
|
173
|
+
# --cov-delta Maximum gap between bootstrap and IS [ default: 2.0 ]
|
130
174
|
#
|
131
175
|
# Supported library types:
|
132
176
|
# ff-firststrand
|
@@ -138,10 +182,12 @@ module Bio
|
|
138
182
|
# transfrags
|
139
183
|
class Diff
|
140
184
|
include Bio::Command::Wrapper
|
185
|
+
include Bio::Ngs::Cufflinks::Utils
|
141
186
|
|
142
187
|
set_program Bio::Ngs::Utils.binary("cuffdiff")
|
143
188
|
|
144
189
|
add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
|
190
|
+
add_option "seed", :type => :numeric
|
145
191
|
add_option "time-series", :type => :boolean, :aliases => '-T'
|
146
192
|
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
|
147
193
|
add_option "FDR", :type => :numeric, :aliases => '-F'
|
@@ -155,6 +201,8 @@ module Bio
|
|
155
201
|
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
|
156
202
|
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
|
157
203
|
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
|
204
|
+
add_option "num-bootstrap-samples", :type => :numeric
|
205
|
+
add_option "bootstrap-fraction", :type => :numeric
|
158
206
|
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
|
159
207
|
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
|
160
208
|
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
|
@@ -163,13 +211,24 @@ module Bio
|
|
163
211
|
add_option "quiet", :type => :boolean, :aliases => '-q'
|
164
212
|
add_option "no-update-check", :type => :boolean, :aliases => '-j'
|
165
213
|
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
|
214
|
+
add_option "max-bundle-frags", :type => :numeric
|
215
|
+
add_option "read-skip-fraction", :type => :numeric
|
216
|
+
add_option "no-read-pairs", :type => :numeric
|
217
|
+
add_option "trim-read-length", :type => :numeric
|
218
|
+
add_option "cov-delta", :type => :numeric
|
219
|
+
|
220
|
+
#define iterators
|
221
|
+
add_iterator_for :genes
|
222
|
+
add_iterator_for :isoforms
|
223
|
+
add_iterator_for :cds
|
224
|
+
add_iterator_for :tss_groups
|
166
225
|
|
167
226
|
#Examples
|
168
227
|
#Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
|
169
228
|
#Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
|
170
229
|
|
171
230
|
class << self
|
172
|
-
|
231
|
+
|
173
232
|
#Return the version of CuffDiff used to produce the output
|
174
233
|
def version(diff)
|
175
234
|
#cufflink_version_offset = Bio::Ngs::Cufflinks.version
|
@@ -193,7 +252,7 @@ module Bio
|
|
193
252
|
1
|
194
253
|
end
|
195
254
|
end
|
196
|
-
|
255
|
+
|
197
256
|
#write a file with the information
|
198
257
|
#See process_de for options available
|
199
258
|
# Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
|
@@ -212,7 +271,11 @@ module Bio
|
|
212
271
|
# fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
|
213
272
|
def genes(diff, gtf, options={})
|
214
273
|
process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
|
215
|
-
|
274
|
+
# puts diff_reference
|
275
|
+
# puts fpkm_values
|
276
|
+
# "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
|
277
|
+
#do not use the gtf kb
|
278
|
+
"#{dict_info[:winner].first}\t#{dict_info[:gene_name]}\t#{fpkm_values.join("\t")}"
|
216
279
|
end
|
217
280
|
end #genes
|
218
281
|
|
@@ -220,7 +283,9 @@ module Bio
|
|
220
283
|
#Options hash
|
221
284
|
# :fold(float), :min_samples(integer), :min_fpkm(float), :only_significative(boolean, false) , :z_score(boolean, false)
|
222
285
|
# :regulated(symbol :up or :down default :up)
|
286
|
+
# :fpkm_log_two (:true :false, default :true)
|
223
287
|
def process_de(diff, gtf, options={})
|
288
|
+
#init default options
|
224
289
|
fold = options[:fold] || 0.0
|
225
290
|
min_samples = options[:min_samples] || 0
|
226
291
|
min_fpkm = options[:min_fpkm] || 0.0
|
@@ -228,12 +293,15 @@ module Bio
|
|
228
293
|
z_scores = options[:z_scores] || false
|
229
294
|
#TODO improve check on parameters
|
230
295
|
regulated =options[:regulated] || :up
|
296
|
+
fpkm_log_two = options[:fpkm_log_two] || true
|
297
|
+
force_not_significative = options[:force_not_significative] || false
|
231
298
|
|
232
|
-
|
299
|
+
#set up the kb if not available = pass an option with the path of the kb ?
|
300
|
+
gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
|
233
301
|
|
234
302
|
#convert log2 fold value into natural log value (internally computed by cuffdiff)
|
235
303
|
fold_log2 = fold
|
236
|
-
fold = fold==0 ? 0.0 : (fold*Math.log(2))
|
304
|
+
(fold = fold==0 ? 0.0 : (fold*Math.log(2))) unless fpkm_log_two
|
237
305
|
|
238
306
|
dict=Hash.new {|h, k| h[k]=Hash.new{|hh,kk| hh[kk]=[]}; }
|
239
307
|
dict_samples = Hash.new{|h,k| h[k]=""}
|
@@ -243,78 +311,105 @@ module Bio
|
|
243
311
|
|
244
312
|
File.open(diff,'r') do |f|
|
245
313
|
header=f.readline #skip header
|
314
|
+
|
315
|
+
test_id_idx = 0
|
316
|
+
gene_name_idx = 2
|
317
|
+
q_first_idx = 3 + cufflink_version_offset
|
318
|
+
q_second_idx = 4 + cufflink_version_offset
|
319
|
+
fpkm_first_idx = 6 + cufflink_version_offset
|
320
|
+
fpkm_second_idx = 7 + cufflink_version_offset
|
321
|
+
fold_idx = 8 + cufflink_version_offset
|
322
|
+
significant_idx = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
|
323
|
+
|
324
|
+
#Comments:
|
325
|
+
# for each row of the diff, the expression value of every test must be saved
|
326
|
+
# i.e. the fpkm and whether it is significant or not
|
246
327
|
|
247
|
-
q_first = 3 + cufflink_version_offset
|
248
|
-
q_second = 4 + cufflink_version_offset
|
249
|
-
fpkm_first = 6 + cufflink_version_offset
|
250
|
-
fpkm_second = 7 + cufflink_version_offset
|
251
|
-
fold_position = 8 + cufflink_version_offset
|
252
|
-
significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
|
253
328
|
f.each_line do |line|
|
254
329
|
data=line.split
|
255
|
-
|
256
|
-
|
330
|
+
|
331
|
+
#fix comparison t-test, remove negative symbol and invert comparison: if fold change q1 vs q2 <0 abs(foldchange) & swap q1,q2
|
332
|
+
# puts data[fold_idx].to_f
|
333
|
+
#delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
|
334
|
+
if data[fold_idx].to_f<0
|
335
|
+
data[fold_idx]=data[fold_idx][1..-1] #.sub(/-/,"") remove the minus symbol from the number, the values q1, q2 and their fpkm will be reorganized into the data structure
|
257
336
|
else
|
258
|
-
|
259
|
-
data[
|
260
|
-
data[
|
261
|
-
|
262
|
-
data[q_second]=data[q_first]
|
263
|
-
data[q_first]=a
|
337
|
+
# puts "ciao"
|
338
|
+
data[fpkm_first_idx],data[fpkm_second_idx]=data[fpkm_second_idx],data[fpkm_first_idx]
|
339
|
+
data[q_first_idx],data[q_second_idx]=data[q_second_idx],data[q_first_idx]
|
340
|
+
#delete puts "#{q_first_idx},#{q_second_idx}"
|
264
341
|
end
|
342
|
+
#delete puts "#{q_first_idx},#{q_second_idx}"
|
343
|
+
#delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
|
344
|
+
#delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
|
345
|
+
|
346
|
+
|
265
347
|
#0 TCONS
|
266
348
|
#4 name sample is the max diff for the item
|
267
349
|
#5 name sample is the less diff for the item
|
268
350
|
#9 is the fold
|
269
|
-
dict_samples[data[
|
270
|
-
dict_samples[data[
|
351
|
+
dict_samples[data[q_first_idx]]
|
352
|
+
dict_samples[data[q_second_idx]]
|
271
353
|
|
272
354
|
#7 is the fpkm value of max pop/sample
|
273
355
|
#8 is the fpkm value of min pop/sample
|
274
|
-
|
275
|
-
|
356
|
+
k_reference = data[test_id_idx].to_sym #This can be TCONS if isoforms or XLOC if genes
|
357
|
+
|
358
|
+
unless dict[k_reference].key?(:values)
|
359
|
+
dict[k_reference][:values]={}
|
360
|
+
dict[k_reference][:gene_name]=data[gene_name_idx]
|
361
|
+
end
|
362
|
+
dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
|
363
|
+
dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
|
364
|
+
|
365
|
+
if ((only_significative==true && data[significant_idx]=="yes") || ((data[significant_idx]=="yes"||force_not_significative) && data[fold_idx].to_f>=fold)) && data[fpkm_first_idx].to_f>=min_fpkm && data[fpkm_second_idx].to_f>=min_fpkm
|
276
366
|
|
277
367
|
###### puts data.join(" ") if k_reference == :XLOC_017497
|
278
368
|
#TODO refactor: this can be done using lambda
|
279
|
-
k_sample =
|
280
|
-
|
281
|
-
|
282
|
-
|
369
|
+
k_sample = ""
|
370
|
+
if regulated==:up
|
371
|
+
|
372
|
+
k_sample = data[q_first_idx].to_sym
|
373
|
+
#delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
|
374
|
+
dict[k_reference][k_sample]<<data[q_second_idx].to_sym
|
375
|
+
#delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
|
283
376
|
k_sample
|
284
|
-
|
285
|
-
k_sample = data[
|
286
|
-
dict[k_reference][k_sample]<<data[
|
377
|
+
elsif regulated==:down
|
378
|
+
k_sample = data[q_second_idx].to_sym
|
379
|
+
dict[k_reference][k_sample]<<data[q_first_idx].to_sym
|
287
380
|
k_sample
|
288
381
|
end
|
289
382
|
|
290
|
-
# puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
291
|
-
|
292
|
-
unless dict[k_reference].key?(:values)
|
293
|
-
dict[k_reference][:values]={}
|
294
|
-
end
|
383
|
+
#delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
384
|
+
#delete puts dict.inspect
|
295
385
|
#store fpkm values as well for each pop/sample it should be
|
296
|
-
dict[k_reference][:values][k_sample]=data[fpkm_first].to_f unless dict[k_reference][:values].key?(k_sample)
|
297
|
-
dict[k_reference][:values][data[q_second].to_sym]=data[fpkm_second].to_f unless dict[k_reference][:values].key?(data[q_second].to_sym)
|
298
386
|
if dict[k_reference][k_sample].size >= min_samples
|
299
|
-
dict[k_reference][:winner] << k_sample
|
387
|
+
(dict[k_reference][:winner] << k_sample).uniq!
|
300
388
|
end
|
301
|
-
# puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
389
|
+
#delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
302
390
|
else
|
303
|
-
#
|
304
|
-
#
|
305
|
-
#dict[k_reference]
|
391
|
+
# k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
|
392
|
+
#
|
393
|
+
# unless dict[k_reference].key?(:values)
|
394
|
+
# dict[k_reference][:values]={}
|
395
|
+
# end
|
396
|
+
# #TODO add threshold value below min fpkm
|
397
|
+
# dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
|
398
|
+
# dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
|
399
|
+
# #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
|
306
400
|
end
|
401
|
+
#delete puts dict[k_reference].inspect
|
402
|
+
|
307
403
|
end #each line
|
308
|
-
|
309
404
|
#example structure
|
310
405
|
#{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
|
311
406
|
end #file.open
|
312
407
|
|
313
|
-
|
314
408
|
file_lines =[]
|
315
409
|
dict.each do |diff_reference, dict_info|
|
316
410
|
|
317
411
|
if dict_info.key?(:winner)
|
412
|
+
#puts dict_info.inspect
|
318
413
|
|
319
414
|
#BAD PERFORMANCES use lambda
|
320
415
|
valz = case z_scores
|
data/lib/bio/appl/ngs/fastx.rb
CHANGED
@@ -55,7 +55,7 @@ module Bio
|
|
55
55
|
# If [-o] is specified, report will be printed to STDOUT.
|
56
56
|
# If [-o] is not specified (and output goes to STDOUT),
|
57
57
|
# report will be printed to STDERR.
|
58
|
-
class
|
58
|
+
class QualityTrim
|
59
59
|
include Bio::Command::Wrapper
|
60
60
|
set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
|
61
61
|
use_aliases
|
@@ -72,6 +72,26 @@ module Bio
|
|
72
72
|
report will be printed to STDERR."
|
73
73
|
add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
|
74
74
|
end #Trim
|
75
|
+
|
76
|
+
# [-f N] = First base to keep. Default is 1 (=first base).
|
77
|
+
# [-l N] = Last base to keep. Default is entire read.
|
78
|
+
# [-t N] = Trim N nucleotides from the end of the read.
|
79
|
+
# '-t' can not be used with '-l' and '-f'.
|
80
|
+
# [-z] = Compress output with GZIP.
|
81
|
+
# [-i INFILE] = FASTA/Q input file. default is STDIN.
|
82
|
+
# [-o OUTFILE] = FASTA/Q output file. default is STDOUT.
|
83
|
+
class Trim
|
84
|
+
include Bio::Command::Wrapper
|
85
|
+
set_program Bio::Ngs::Utils.binary("fastx_trimmer")
|
86
|
+
use_aliases
|
87
|
+
add_option :first_base, :type => :numeric, :aliases => "-f", :desc => "First base to keep"
|
88
|
+
add_option :last_base, :type => :numeric, :aliases => "-l", :desc => "Last base to keep"
|
89
|
+
add_option :compress, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP"
|
90
|
+
add_option :input, :type => :string, :aliases => "-i", :desc => "Input FASTA/Q file", :collapse => true
|
91
|
+
add_option :output, :type => :string, :aliases => "-o", :desc => "Output FASTA/Q file", :collapse => true
|
92
|
+
add_option :trim, :type => :numeric, :aliases => "-t", :desc => "Trim N nucleotides from the end of the read"
|
93
|
+
add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
|
94
|
+
end
|
75
95
|
|
76
96
|
# Solexa-Quality BoxPlot plotter
|
77
97
|
# Generates a solexa quality score box-plot graph
|
@@ -90,6 +110,7 @@ module Bio
|
|
90
110
|
add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
|
91
111
|
add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
|
92
112
|
add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
|
113
|
+
add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
|
93
114
|
end #ReadsBoxPlot
|
94
115
|
|
95
116
|
# Solexa-Reads coverage plotter
|
@@ -109,6 +130,7 @@ module Bio
|
|
109
130
|
add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
|
110
131
|
add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
|
111
132
|
add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
|
133
|
+
add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
|
112
134
|
end #ReadsCoverage
|
113
135
|
|
114
136
|
|
@@ -163,6 +185,7 @@ module Bio
|
|
163
185
|
add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file.", :collapse=>true
|
164
186
|
add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file.", :collapse=>true
|
165
187
|
add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
|
188
|
+
add_option :quality_type, :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
|
166
189
|
end #ReadsCoverage
|
167
190
|
|
168
191
|
end #Fastx
|
data/lib/bio/appl/ngs/tophat.rb
CHANGED
@@ -27,7 +27,11 @@
|
|
27
27
|
# -i/--min-intron-length <int> [ default: 50 ]
|
28
28
|
# -I/--max-intron-length <int> [ default: 500000 ]
|
29
29
|
# -g/--max-multihits <int> [ default: 20 ]
|
30
|
-
# -
|
30
|
+
# -x/--transcriptome-max-hits <int> [ default: 60 ]
|
31
|
+
# -n/--transcriptome-mismatches <int> [ default: 1 ]
|
32
|
+
# -M/--prefilter-multihits ( for -G/--GTF option, enable
|
33
|
+
# an initial bowtie search
|
34
|
+
# against the genome )
|
31
35
|
# --max-insertion-length <int> [ default: 3 ]
|
32
36
|
# --max-deletion-length <int> [ default: 3 ]
|
33
37
|
# --solexa-quals
|
@@ -40,7 +44,9 @@
|
|
40
44
|
# --library-type <string> (fr-unstranded, fr-firststrand,
|
41
45
|
# fr-secondstrand)
|
42
46
|
# -p/--num-threads <int> [ default: 1 ]
|
43
|
-
# -G/--GTF <filename>
|
47
|
+
# -G/--GTF <filename> (GTF/GFF with known transcripts)
|
48
|
+
# --transcriptome-index <bwtidx> (transcriptome bowtie index)
|
49
|
+
# -T/--transcriptome-only (map only to the transcriptome)
|
44
50
|
# -j/--raw-juncs <filename>
|
45
51
|
# --insertions <filename>
|
46
52
|
# --deletions <filename>
|
@@ -59,10 +65,11 @@
|
|
59
65
|
# --keep-tmp
|
60
66
|
# --tmp-dir <dirname> [ default: <output_dir>/tmp ]
|
61
67
|
# -z/--zpacker <program> [ default: gzip ]
|
62
|
-
# -X/--unmapped-fifo
|
68
|
+
# -X/--unmapped-fifo ( use mkfifo to compress
|
69
|
+
# more temporary files )
|
63
70
|
#
|
64
71
|
# Advanced Options:
|
65
|
-
#
|
72
|
+
# -N/--initial-read-mismatches <int> [ default: 2 ]
|
66
73
|
# --segment-mismatches <int> [ default: 2 ]
|
67
74
|
# --segment-length <int> [ default: 25 ]
|
68
75
|
# --bowtie-n [ default: bowtie -v ]
|
@@ -73,10 +80,10 @@
|
|
73
80
|
# --max-coverage-intron <int> [ default: 20000 ]
|
74
81
|
# --min-segment-intron <int> [ default: 50 ]
|
75
82
|
# --max-segment-intron <int> [ default: 500000 ]
|
76
|
-
# --no-sort-bam
|
77
|
-
# --no-convert-bam
|
83
|
+
# --no-sort-bam (Output BAM is not coordinate-sorted)
|
84
|
+
# --no-convert-bam (Do not convert to bam format.
|
78
85
|
# Output is <output_dir>accepted_hit.sam.
|
79
|
-
# Implies --no-sort-bam
|
86
|
+
# Implies --no-sort-bam)
|
80
87
|
#
|
81
88
|
# SAM Header Options (for embedding sequencing run metadata in output):
|
82
89
|
# --rg-id <string> (read group ID)
|
@@ -105,7 +112,9 @@ module Bio
|
|
105
112
|
add_option "min-intron-length", :type => :numeric , :aliases => '-i'
|
106
113
|
add_option "max-intron-length", :type => :numeric, :aliases => '-I'
|
107
114
|
add_option "max-multihits", :type => :numeric, :aliases => '-g'
|
108
|
-
add_option "
|
115
|
+
add_option "transcriptome-max-hits", :type => :numeric, :aliases =>'-x'
|
116
|
+
add_option "transcriptome-mismatches", :type => :numeric, :aliases =>'-n'
|
117
|
+
add_option "prefilter-multihits", :type => :boolean, :aliases =>'-M'
|
109
118
|
add_option "max-insertion-length", :type => :numeric
|
110
119
|
add_option "max-deletion-length", :type => :numeric
|
111
120
|
add_option "solexa-quals", :type => :boolean
|
@@ -116,6 +125,8 @@ module Bio
|
|
116
125
|
add_option "library-type", :type => :string
|
117
126
|
add_option "num-threads", :type => :numeric, :aliases => '-p'
|
118
127
|
add_option "GTF", :type => :string, :aliases => '-G'
|
128
|
+
add_option "transcriptome-index", :type => :string
|
129
|
+
add_option "transcriptome-only", :type => :boolean
|
119
130
|
add_option "raw-juncs", :type => :string, :aliases => '-j'
|
120
131
|
add_option :insertions, :type => :string
|
121
132
|
add_option :deletions, :type => :string
|
@@ -135,6 +146,9 @@ module Bio
|
|
135
146
|
add_option "no-butterfly-search", :type => :boolean
|
136
147
|
add_option "keep-tmp", :type => :boolean
|
137
148
|
add_option "tmp-dir", :type => :string
|
149
|
+
add_option "zpacker", :type => :string, :aliases => '-z'
|
150
|
+
add_option "unmapped-fifo", :type => :boolean, :aliases => '-X'
|
151
|
+
add_option "initial-read-mismatches", :type => :int, :aliases => '-N'
|
138
152
|
add_option "segment-mismatches", :type => :numeric
|
139
153
|
add_option "segment-length", :type => :numeric
|
140
154
|
add_option "min-closure-exon", :type => :numeric
|
@@ -144,6 +158,8 @@ module Bio
|
|
144
158
|
add_option "max-coverage-intron", :type => :numeric
|
145
159
|
add_option "min-segment-intron", :type => :numeric
|
146
160
|
add_option "max-segment-intron", :type => :numeric
|
161
|
+
add_option "no-sort-bam", :type => :boolean
|
162
|
+
add_option "no-convert-bam", :type => :boolean
|
147
163
|
add_option "rg-id", :type => :string
|
148
164
|
add_option "rg-sample", :type => :string
|
149
165
|
add_option "rg-library", :type => :string
|
@@ -15,16 +15,16 @@ common:
|
|
15
15
|
type: source
|
16
16
|
linux:
|
17
17
|
cufflinks:
|
18
|
-
version: 1.
|
19
|
-
url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.
|
20
|
-
basename: cufflinks-1.
|
18
|
+
version: 1.3.0
|
19
|
+
url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.Linux_x86_64.tar.gz
|
20
|
+
basename: cufflinks-1.3.0.Linux_x86_64
|
21
21
|
suffix: tar.gz
|
22
22
|
desc: ""
|
23
23
|
type: binary
|
24
24
|
tophat:
|
25
|
-
version: 1.
|
26
|
-
url: http://tophat.cbcb.umd.edu/downloads/tophat-1.
|
27
|
-
basename: tophat-1.
|
25
|
+
version: 1.4.1
|
26
|
+
url: http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.Linux_x86_64.tar.gz
|
27
|
+
basename: tophat-1.4.1.Linux_x86_64
|
28
28
|
suffix: tar.gz
|
29
29
|
desc: ""
|
30
30
|
type: binary
|
@@ -44,16 +44,16 @@ linux:
|
|
44
44
|
# type: binary
|
45
45
|
osx:
|
46
46
|
cufflinks:
|
47
|
-
version: 1.
|
48
|
-
url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.
|
49
|
-
basename: cufflinks-1.
|
47
|
+
version: 1.3.0
|
48
|
+
url: http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.OSX_x86_64.tar.gz
|
49
|
+
basename: cufflinks-1.3.0.OSX_x86_64
|
50
50
|
suffix: tar.gz
|
51
51
|
desc: ""
|
52
52
|
type: binary
|
53
53
|
tophat:
|
54
|
-
version: 1.
|
55
|
-
url: http://tophat.cbcb.umd.edu/downloads/tophat-1.
|
56
|
-
basename: tophat-1.
|
54
|
+
version: 1.4.1
|
55
|
+
url: http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.OSX_x86_64.tar.gz
|
56
|
+
basename: tophat-1.4.1.OSX_x86_64
|
57
57
|
suffix: tar.gz
|
58
58
|
desc: ""
|
59
59
|
type: binary
|