bio-ngs 0.4.6.alpha.01 → 0.4.6.alpha.02
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -2
- data/Gemfile.lock +21 -21
- data/README.rdoc +51 -4
- data/VERSION +1 -1
- data/bin/biongs +1 -0
- data/bio-ngs.gemspec +36 -8
- data/features/cufflinks_gtf_parser.feature +22 -0
- data/features/cufflinks_gtf_parser_indexing.feature +20 -0
- data/features/step_definitions/cufflinks_gtf.rb +30 -0
- data/features/step_definitions/cufflinks_gtf_parser_indexing.rb +53 -0
- data/features/support/env.rb +2 -0
- data/lib/bio-ngs.rb +19 -5
- data/lib/bio/appl/ngs/cufflinks.rb +447 -281
- data/lib/bio/appl/ngs/cufflinks/gtf/gtf.rb +23 -0
- data/lib/bio/appl/ngs/cufflinks/gtf/gtf_parser.rb +248 -0
- data/lib/bio/appl/ngs/cufflinks/gtf/transcript.rb +154 -0
- data/lib/bio/ngs/fs.rb +46 -0
- data/lib/bio/ngs/illumina/fastq.rb +176 -0
- data/lib/bio/ngs/illumina/illumina.rb +64 -0
- data/lib/bio/ngs/illumina/project.rb +81 -0
- data/lib/bio/ngs/illumina/sample.rb +85 -0
- data/lib/bio/ngs/task.rb +1 -1
- data/lib/bio/ngs/utils.rb +124 -112
- data/lib/meta.rb +162 -0
- data/lib/tasks/convert.thor +14 -14
- data/lib/tasks/filter.thor +158 -23
- data/lib/tasks/quality.thor +24 -4
- data/lib/tasks/rna.thor +26 -0
- data/lib/wrapper.rb +28 -0
- data/spec/bio/ngs/fs_spec.rb +70 -0
- data/spec/bio/ngs/illumina/fastq_spec.rb +52 -0
- data/spec/bio/ngs/illumina/illumina_spec.rb +21 -0
- data/spec/bio/ngs/illumina/project_spec.rb +0 -0
- data/spec/bio/ngs/illumina/sample_spec.rb +0 -0
- data/spec/bio/ngs/illumina/samples_spec.rb +0 -0
- data/spec/filter_spec.rb +25 -0
- data/spec/fixture/table_filter_list.txt +3 -0
- data/spec/fixture/table_filter_list_first_column.txt +2 -0
- data/spec/fixture/table_filter_source.tsv +44 -0
- data/spec/fixture/test-filtered-reference.fastq.gz +0 -0
- data/spec/fixture/test-merged-reference.fastq.gz +0 -0
- data/spec/fixture/test.fastq.gz +0 -0
- data/spec/meta_spec.rb +117 -0
- data/spec/spec_helper.rb +1 -1
- metadata +97 -69
data/lib/bio-ngs.rb
CHANGED
@@ -24,9 +24,12 @@ require 'bio'
|
|
24
24
|
require 'active_record'
|
25
25
|
require 'sqlite3'
|
26
26
|
|
27
|
-
#
|
27
|
+
#Generic classes
|
28
28
|
require 'enumerable'
|
29
29
|
require 'wrapper'
|
30
|
+
require 'meta'
|
31
|
+
|
32
|
+
# NGS classes
|
30
33
|
require 'bio/ngs/utils'
|
31
34
|
require 'bio/ngs/record'
|
32
35
|
require 'bio/ngs/quality'
|
@@ -45,18 +48,29 @@ require 'bio/appl/ngs/sff_extract'
|
|
45
48
|
require 'bio/appl/ngs/bcl2qseq' #TODO: FIX THIS BUGGY CODE in THOR TASK
|
46
49
|
|
47
50
|
require 'bio/appl/ngs/cufflinks/iterators'
|
51
|
+
require 'bio/appl/ngs/cufflinks/gtf/gtf_parser'
|
52
|
+
require 'bio/appl/ngs/cufflinks/gtf/gtf'
|
53
|
+
require 'bio/appl/ngs/cufflinks/gtf/transcript'
|
48
54
|
require 'bio/appl/ngs/cufflinks'
|
49
55
|
require 'bio/appl/ngs/samtools'
|
50
56
|
require 'bio/appl/ngs/fastx'
|
51
57
|
require 'bio/appl/ngs/blast'
|
52
58
|
require 'bio/appl/ngs/bwa'
|
53
59
|
|
54
|
-
#
|
60
|
+
#Illumina utility for projects
|
61
|
+
require 'bio/ngs/illumina/illumina'
|
62
|
+
require 'bio/ngs/fs'
|
63
|
+
|
64
|
+
# history
|
55
65
|
Bio::Ngs::HISTORY_FILE = Dir.pwd+"/.task-history.yml"
|
56
66
|
Bio::Ngs::Utils.extend_system_path
|
57
67
|
|
68
|
+
|
58
69
|
# loading Tasks
|
59
|
-
|
60
|
-
|
61
|
-
|
70
|
+
# TODO let the user define which tasks must be loaded, maybe a list of names
|
71
|
+
# Load every bundled Thor task file, but only when the host program has
# opted in by defining Bio::Ngs::LoadBaseTasks and setting it to exactly true.
if Bio::Ngs.const_defined?(:LoadBaseTasks) && Bio::Ngs.const_get(:LoadBaseTasks) == true
  # The task files live in the "tasks" directory next to this file.
  tasks_dir = File.join(File.expand_path(File.dirname(__FILE__)), "tasks")
  Dir.glob(File.join(tasks_dir, "*.thor")).each { |thorfile| Thor::Util.load_thorfile(thorfile) }
end
|
@@ -7,10 +7,8 @@
|
|
7
7
|
#
|
8
8
|
#
|
9
9
|
|
10
|
-
|
11
|
-
|
12
10
|
module Bio
|
13
|
-
module Ngs
|
11
|
+
module Ngs
|
14
12
|
module Cufflinks
|
15
13
|
VERSION = "1.0.X"
|
16
14
|
class << self
|
@@ -19,229 +17,258 @@ module Bio
|
|
19
17
|
end
|
20
18
|
end
|
21
19
|
|
20
|
+
# Mixin offering a tiny tracing helper.
module MarkCall
  # Print the call site (first entry of the caller backtrace) of whoever
  # invoked #mark. Call tracking should be elaborated elsewhere, not here.
  def mark
    call_site = caller[0]
    puts call_site
  end
end
|
25
|
+
|
22
26
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
add_option "
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
#deprecated add_option "
|
124
|
-
#deprecated add_option "
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
#
|
141
|
-
|
142
|
-
#
|
143
|
-
#
|
144
|
-
|
145
|
-
#
|
146
|
-
#
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
add_option "
|
218
|
-
add_option "
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
27
|
+
# cufflinks v1.3.0
|
28
|
+
# linked against Boost version 104000
|
29
|
+
# -----------------------------
|
30
|
+
# Usage: cufflinks [options] <hits.sam>
|
31
|
+
# General Options:
|
32
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
33
|
+
# -p/--num-threads number of threads used during analysis [ default: 1 ]
|
34
|
+
# --seed value of random number generator seed [ default: 0 ]
|
35
|
+
# -G/--GTF quantitate against reference transcript annotations
|
36
|
+
# -g/--GTF-guide use reference transcript annotation to guide assembly
|
37
|
+
# -M/--mask-file ignore all alignment within transcripts in this file
|
38
|
+
# -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
|
39
|
+
# -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
|
40
|
+
# --library-type library prep used for input reads [ default: below ]
|
41
|
+
#
|
42
|
+
# Advanced Abundance Estimation Options:
|
43
|
+
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
44
|
+
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
45
|
+
# --upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
|
46
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
47
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
48
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: FALSE ]
|
49
|
+
# --total-hits-norm count all hits for normalization [ default: TRUE ]
|
50
|
+
#
|
51
|
+
# Advanced Assembly Options:
|
52
|
+
# -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
|
53
|
+
# -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.10 ]
|
54
|
+
# -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
|
55
|
+
# -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
|
56
|
+
# -a/--junc-alpha alpha for junction binomial test filter [ default: 0.001 ]
|
57
|
+
# -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.09 ]
|
58
|
+
# --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
|
59
|
+
# --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
|
60
|
+
# --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
|
61
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
62
|
+
# --min-intron-length minimum intron size allowed in genome [ default: 50 ]
|
63
|
+
# --trim-3-avgcov-thresh minimum avg coverage required to attempt 3' trimming [ default: 10 ]
|
64
|
+
# --trim-3-dropoff-frac fraction of avg coverage below which to trim 3' end [ default: 0.1 ]
|
65
|
+
#
|
66
|
+
# Advanced Reference Annotation Guided Assembly Options:
|
67
|
+
# --no-faux-reads disable tiling by faux reads [ default: FALSE ]
|
68
|
+
# --3-overhang-tolerance overhang allowed on 3' end when merging with reference[ default: 600 ]
|
69
|
+
# --intron-overhang-tolerance overhang allowed inside reference intron when merging [ default: 30 ]
|
70
|
+
#
|
71
|
+
# Advanced Program Behavior Options:
|
72
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
73
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
74
|
+
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
75
|
+
#
|
76
|
+
# Supported library types:
|
77
|
+
# ff-firststrand
|
78
|
+
# ff-secondstrand
|
79
|
+
# ff-unstranded
|
80
|
+
# fr-firststrand
|
81
|
+
# fr-secondstrand
|
82
|
+
# fr-unstranded (default)
|
83
|
+
# transfrags
|
84
|
+
# Wrapper around the "cufflinks" binary. Each add_option call declares one
# command line option of the program (see the cufflinks v1.3.0 help text
# reproduced above for their meaning).
#
# NOTE(review): several :default values below (min-isoform-fraction 0.15,
# junc-alpha 0.01, small-anchor-fraction 0.12, pre-mrna-fraction 0.15) differ
# from the defaults in the v1.3.0 help text above (0.10, 0.001, 0.09, 0.15).
# They look inherited from an older cufflinks release - confirm before changing.
# NOTE(review): the short aliases '-h' and '-t' are not listed in the help
# text above; presumably they are only meaningful to the wrapper/Thor layer.
class Quantification
  include Bio::Command::Wrapper
  include Bio::Ngs::Cufflinks::Utils

  set_program Bio::Ngs::Utils.binary("cufflinks")

  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
  add_option "seed", :type => :numeric
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "GTF-guide", :type => :string, :aliases => '-g'
  add_option "mask-file", :type => :string, :aliases => '-M'
  add_option "frag-bias-correct", :type => :string, :aliases => '-b'
  add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
  add_option "library-type", :type => :string
  # Was misspelled "farg-len-mean"; the cufflinks option is --frag-len-mean.
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
  add_option "upper-quartile-norm", :type => :boolean
  add_option "max-mle-iterations", :type => :numeric#, :default => 5000
  add_option "num-importance-samples", :type => :numeric#, :default => 1000
  add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
  add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
  add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
  #deprecated add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
  add_option "min-intron-length", :type => :numeric#, :default => 50
  add_option "trim-3-avgcov-thresh", :type => :numeric
  add_option "trim-3-dropoff-frac", :type => :numeric
  add_option "no-faux-reads", :type => :boolean
  add_option "3-overhang-tolerance", :type => :numeric
  add_option "intron-overhang-tolerance", :type => :numeric
  add_option "verbose", :type => :boolean, :aliases => '-v'
  add_option "quiet", :type => :boolean, :aliases => '-q'

  #deprecated add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
  #deprecated add_option "reference-seq", :type => :string, :aliases => '-r'
  #deprecated add_option "quartile-normalization", :type => :boolean, :aliases => '-N'

  #TODO Check why with these defaults is not working properly

  # Iterators over the result files (defined through add_iterator_for).
  add_iterator_for :genes
  add_iterator_for :isoforms

end #Quantification
|
141
|
+
|
142
|
+
|
143
|
+
# Variant of Quantification that inherits all of its options and removes the
# "GTF" one (presumably for runs without a reference annotation - confirm).
class QuantificationDenovo < Quantification
  #set_program Bio::Ngs::Utils.binary("cufflinks")
  delete_option "GTF"
  #add_option "GTF-guide", :type => :string, :aliases => '-g'
  # add_alias "GTF", "GTF-guide"

  # Open the transcripts file and return the File handle.
  #
  # file:: path of the transcripts file; defaults to "transcripts.<type>"
  #        in the current directory when nil.
  # type:: extension used to build the default file name ("gtf").
  #
  # Raises Errno::ENOENT when the file does not exist. The caller is
  # responsible for closing the returned handle.
  #
  # Intended behaviour (not implemented yet): return only the brand new
  # transcripts from a gff3 file, creating that file if it doesn't exist.
  def get_new_transcripts(file=nil, type="gtf")
    # TODO implement conversion to gff3
    file ||= "transcripts.#{type}"
    # if type=="gtf"
    #   unless File.exists?(file)
    #     to_gff3(File.dirname(File.absolute_path(file)))
    #   end
    # The original called File.open() without arguments, which always raised
    # ArgumentError and ignored the path computed above.
    File.open(file)
  end
end
|
161
|
+
|
162
|
+
# cuffdiff v1.3.0 (3022)
|
163
|
+
# -----------------------------
|
164
|
+
# Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
|
165
|
+
# Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
|
166
|
+
# General Options:
|
167
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
168
|
+
# --seed value of random number generator seed [ default: 0 ]
|
169
|
+
# -T/--time-series treat samples as a time-series [ default: FALSE ]
|
170
|
+
# -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
|
171
|
+
# --FDR False discovery rate used in testing [ default: 0.05 ]
|
172
|
+
# -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
|
173
|
+
# -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
|
174
|
+
# -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
|
175
|
+
# -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
|
176
|
+
# -L/--labels comma-separated list of condition labels
|
177
|
+
# -p/--num-threads number of threads used during quantification [ default: 1 ]
|
178
|
+
#
|
179
|
+
# Advanced Options:
|
180
|
+
# --library-type Library prep used for input reads [ default: below ]
|
181
|
+
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
182
|
+
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
183
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
184
|
+
# --num-bootstrap-samples Number of bootstrap replications [ default: 20 ]
|
185
|
+
# --bootstrap-fraction Fraction of fragments in each bootstrap sample [ default: 1.0 ]
|
186
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
187
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
|
188
|
+
# --total-hits-norm count all hits for normalization [ default: FALSE ]
|
189
|
+
# --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
|
190
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
191
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
192
|
+
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
193
|
+
# --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
|
194
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
195
|
+
#
|
196
|
+
# Debugging use only:
|
197
|
+
# --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]
|
198
|
+
# --no-read-pairs Break all read pairs [ default: FALSE ]
|
199
|
+
# --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]
|
200
|
+
# --cov-delta Maximum gap between bootstrap and IS [ default: 2.0 ]
|
201
|
+
#
|
202
|
+
# Supported library types:
|
203
|
+
# ff-firststrand
|
204
|
+
# ff-secondstrand
|
205
|
+
# ff-unstranded
|
206
|
+
# fr-firststrand
|
207
|
+
# fr-secondstrand
|
208
|
+
# fr-unstranded (default)
|
209
|
+
# transfrags
|
210
|
+
class Diff
|
211
|
+
include Bio::Command::Wrapper
|
212
|
+
include Bio::Ngs::Cufflinks::Utils
|
213
|
+
|
214
|
+
set_program Bio::Ngs::Utils.binary("cuffdiff")
|
215
|
+
|
216
|
+
# Command line options of cuffdiff (see the cuffdiff v1.3.0 help text above).
# NOTE(review): several short aliases below ('-F', '-l', '-i', '-e', '-h',
# '-t', '-d', '-j', '-b') are not cuffdiff short options according to that help
# text ('-b' is --frag-bias-correct there); presumably they are only used by
# the wrapper/Thor layer - confirm they are never forwarded to the binary.
add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
add_option "seed", :type => :numeric
add_option "time-series", :type => :boolean, :aliases => '-T'
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
add_option "FDR", :type => :numeric, :aliases => '-F'
#TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
#TODO:FIX add_option "frag-bias-correct", :type =>
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
# Alias was 'N' (missing dash), unlike every other alias in this file.
add_option "upper-quartile-norm", :type => :boolean, :aliases => '-N'
add_option "labels", :type => :array, :aliases => '-L'
add_option "num-threads", :type => :numeric, :aliases => '-p'
add_option "library-type", :type => :string, :aliases => '-l'
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
add_option "num-bootstrap-samples", :type => :numeric
add_option "bootstrap-fraction", :type => :numeric
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
add_option "verbose", :type => :boolean, :aliases => '-v'
add_option "quiet", :type => :boolean, :aliases => '-q'
add_option "no-update-check", :type => :boolean, :aliases => '-j'
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
add_option "max-bundle-frags", :type => :numeric
add_option "read-skip-fraction", :type => :numeric
# --no-read-pairs is a flag (default FALSE in the help text), not a number.
add_option "no-read-pairs", :type => :boolean
add_option "trim-read-length", :type => :numeric
add_option "cov-delta", :type => :numeric
|
246
|
+
|
247
|
+
#define iterators
|
248
|
+
add_iterator_for :genes
|
249
|
+
add_iterator_for :isoforms
|
250
|
+
add_iterator_for :cds
|
251
|
+
add_iterator_for :tss_groups
|
252
|
+
|
253
|
+
#Examples
|
254
|
+
#Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
|
255
|
+
#Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
|
256
|
+
|
257
|
+
class << self
|
258
|
+
|
259
|
+
#Return the version of CuffDiff used to produce the output
|
260
|
+
# Guess the cufflinks version from the header line of a cuffdiff output file.
#
# diff:: path of the .diff file to inspect.
#
# Returns "0.9.X" when the header has 12 whitespace-separated columns,
# Bio::Ngs::Cufflinks.version when it has 14, and nil for any other count.
# Raises EOFError when the file is empty.
def version(diff)
  #cufflink_version_offset = Bio::Ngs::Cufflinks.version
  # Block form guarantees the handle is closed even if readline raises.
  header = File.open(diff, 'r') { |f| f.readline }
  case header.split.size
  when 12
    "0.9.X"
  when 14
    Bio::Ngs::Cufflinks.version #latest
  end
end#version
|
245
272
|
|
246
273
|
|
247
274
|
def offset_by_version(cufflinks_version)
|
@@ -252,7 +279,7 @@ module Bio
|
|
252
279
|
1
|
253
280
|
end
|
254
281
|
end
|
255
|
-
|
282
|
+
|
256
283
|
#write a file with the information
|
257
284
|
#See process_de for options available
|
258
285
|
# Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
|
@@ -268,16 +295,16 @@ module Bio
|
|
268
295
|
#See process_de for options available
|
269
296
|
# Example: Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp.diff",
|
270
297
|
# "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
|
271
|
-
# fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
|
298
|
+
# fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
|
272
299
|
# Run process_de on a gene level diff file and format every selected entry as
# one tab separated line: winning sample, gene name, then the fpkm values.
# Returns whatever process_de returns. See process_de for the options.
def genes(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, _reference, _kb, fpkms|
    # puts _reference
    # puts fpkms
    # "#{info[:winner].first}\t#{_kb[_reference][:gene_name]}\t#{fpkms.join("\t")}"
    # Do not use the gtf kb: the gene name comes from the diff file itself.
    columns = [info[:winner].first, info[:gene_name], fpkms.join("\t")]
    columns.join("\t")
  end
end #genes
|
281
308
|
|
282
309
|
private
|
283
310
|
#Options hash
|
@@ -297,7 +324,7 @@ module Bio
|
|
297
324
|
force_not_significative = options[:force_not_significative] || false
|
298
325
|
|
299
326
|
#set up the kb if not available = pass an option with the path of the kb ?
|
300
|
-
gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
|
327
|
+
gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
|
301
328
|
|
302
329
|
#convert log2 fold value into natural log value (internally computed by cuffdiff)
|
303
330
|
fold_log2 = fold
|
@@ -311,7 +338,7 @@ module Bio
|
|
311
338
|
|
312
339
|
File.open(diff,'r') do |f|
|
313
340
|
header=f.readline #skip header
|
314
|
-
|
341
|
+
|
315
342
|
test_id_idx = 0
|
316
343
|
gene_name_idx = 2
|
317
344
|
q_first_idx = 3 + cufflink_version_offset
|
@@ -320,76 +347,76 @@ module Bio
|
|
320
347
|
fpkm_second_idx = 7 + cufflink_version_offset
|
321
348
|
fold_idx = 8 + cufflink_version_offset
|
322
349
|
significant_idx = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
|
323
|
-
|
350
|
+
|
324
351
|
#Comments:
|
325
352
|
# for each row of the diff the expression value of each test must be saved
|
326
353
|
# i.e. the fpkm and whether it is significant or not
|
327
354
|
|
328
355
|
f.each_line do |line|
|
329
356
|
data=line.split
|
330
|
-
|
357
|
+
|
331
358
|
#fix comparison t-test, remove negative symbol and invert comparison: if fold change q1 vs q2 <0 abs(foldchange) & swap q1,q2
|
332
|
-
# puts data[fold_idx].to_f
|
333
|
-
#delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
|
359
|
+
# puts data[fold_idx].to_f
|
360
|
+
#delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
|
334
361
|
if data[fold_idx].to_f<0
|
335
362
|
data[fold_idx]=data[fold_idx][1..-1] #.sub(/-/,"") remove the minus symbol from the number, the values q1, q2 and their fpkm will be reorganized into the data structure
|
336
|
-
else
|
337
|
-
# puts "ciao"
|
363
|
+
else
|
364
|
+
# puts "ciao"
|
338
365
|
data[fpkm_first_idx],data[fpkm_second_idx]=data[fpkm_second_idx],data[fpkm_first_idx]
|
339
366
|
data[q_first_idx],data[q_second_idx]=data[q_second_idx],data[q_first_idx]
|
340
|
-
#delete puts "#{q_first_idx},#{q_second_idx}"
|
367
|
+
#delete puts "#{q_first_idx},#{q_second_idx}"
|
341
368
|
end
|
342
|
-
#delete puts "#{q_first_idx},#{q_second_idx}"
|
343
|
-
#delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
|
344
|
-
#delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
|
369
|
+
#delete puts "#{q_first_idx},#{q_second_idx}"
|
370
|
+
#delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
|
371
|
+
#delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
|
345
372
|
|
346
373
|
|
347
374
|
#0 TCONS
|
348
375
|
#4 name sample is the max diff for the item
|
349
376
|
#5 name sample is the less diff for the item
|
350
|
-
#9 is the fold
|
377
|
+
#9 is the fold
|
351
378
|
dict_samples[data[q_first_idx]]
|
352
379
|
dict_samples[data[q_second_idx]]
|
353
380
|
|
354
381
|
#7 is the fpkm value of max pop/sample
|
355
382
|
#8 is the fpkm value of min pop/sample
|
356
383
|
k_reference = data[test_id_idx].to_sym #This can be TCONS if isoforms or XLOC if genes
|
357
|
-
|
384
|
+
|
358
385
|
unless dict[k_reference].key?(:values)
|
359
386
|
dict[k_reference][:values]={}
|
360
387
|
dict[k_reference][:gene_name]=data[gene_name_idx]
|
361
|
-
end
|
388
|
+
end
|
362
389
|
dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
|
363
|
-
dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
|
364
|
-
|
390
|
+
dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
|
391
|
+
|
365
392
|
if ((only_significative==true && data[significant_idx]=="yes") || ((data[significant_idx]=="yes"||force_not_significative) && data[fold_idx].to_f>=fold)) && data[fpkm_first_idx].to_f>=min_fpkm && data[fpkm_second_idx].to_f>=min_fpkm
|
366
|
-
|
367
|
-
|
393
|
+
|
394
|
+
###### puts data.join(" ") if k_reference == :XLOC_017497
|
368
395
|
#TODO refactor: this can be done using lambda
|
369
396
|
k_sample = ""
|
370
397
|
if regulated==:up
|
371
398
|
|
372
399
|
k_sample = data[q_first_idx].to_sym
|
373
|
-
#delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
|
400
|
+
#delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
|
374
401
|
dict[k_reference][k_sample]<<data[q_second_idx].to_sym
|
375
|
-
#delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
|
402
|
+
#delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
|
376
403
|
k_sample
|
377
404
|
elsif regulated==:down
|
378
405
|
k_sample = data[q_second_idx].to_sym
|
379
406
|
dict[k_reference][k_sample]<<data[q_first_idx].to_sym
|
380
|
-
k_sample
|
407
|
+
k_sample
|
381
408
|
end
|
382
|
-
|
383
|
-
|
409
|
+
|
410
|
+
#delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
384
411
|
#delete puts dict.inspect
|
385
|
-
#store fpkm values as well for each pop/sample it should be
|
412
|
+
#store fpkm values as well for each pop/sample it should be
|
386
413
|
if dict[k_reference][k_sample].size >= min_samples
|
387
414
|
(dict[k_reference][:winner] << k_sample).uniq!
|
388
415
|
end
|
389
|
-
|
416
|
+
#delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
|
390
417
|
else
|
391
418
|
# k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
|
392
|
-
#
|
419
|
+
#
|
393
420
|
# unless dict[k_reference].key?(:values)
|
394
421
|
# dict[k_reference][:values]={}
|
395
422
|
# end
|
@@ -398,19 +425,19 @@ module Bio
|
|
398
425
|
# dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
|
399
426
|
# #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
|
400
427
|
end
|
401
|
-
#delete puts dict[k_reference].inspect
|
402
|
-
|
428
|
+
#delete puts dict[k_reference].inspect
|
429
|
+
|
403
430
|
end #each line
|
404
|
-
#example structure
|
405
|
-
#{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
|
431
|
+
#example structure
|
432
|
+
#{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
|
406
433
|
end #file.open
|
407
434
|
|
408
435
|
file_lines =[]
|
409
436
|
dict.each do |diff_reference, dict_info|
|
410
|
-
|
437
|
+
|
411
438
|
if dict_info.key?(:winner)
|
412
439
|
#puts dict_info.inspect
|
413
|
-
|
440
|
+
|
414
441
|
#BAD PERFORMANCES use lambda
|
415
442
|
valz = case z_scores
|
416
443
|
when true
|
@@ -419,13 +446,13 @@ module Bio
|
|
419
446
|
stdev = items.standard_deviation
|
420
447
|
items.map do |fpkm|
|
421
448
|
(fpkm-average)/stdev
|
422
|
-
end
|
449
|
+
end
|
423
450
|
when false
|
424
451
|
dict_info[:values].sort.map{|sample| sample[1]}
|
425
452
|
end #case
|
426
453
|
|
427
454
|
#TODO generalize to isoforms and genes now only isoforms
|
428
|
-
|
455
|
+
# puts yield(dict_info, diff_reference, gtf_kb, valz) if diff_reference == :XLOC_017497
|
429
456
|
file_lines<< yield(dict_info, diff_reference, gtf_kb, valz) #fpkm_values
|
430
457
|
#file_lines<<"#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:nearest_ref]}_#{gtf_kb[diff_reference][:gene_name]}\t#{valz.join("\t")}"
|
431
458
|
else
|
@@ -451,42 +478,42 @@ module Bio
|
|
451
478
|
# cuffcompare v1.0.2 (2335)
|
452
479
|
# -----------------------------
|
453
480
|
# Usage:
|
454
|
-
# cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
|
455
|
-
# [-o <outprefix>] [-p <cprefix>]
|
481
|
+
# cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
|
482
|
+
# [-o <outprefix>] [-p <cprefix>]
|
456
483
|
# {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
|
457
|
-
#
|
484
|
+
#
|
458
485
|
# Cuffcompare provides classification, reference annotation mapping and various
|
459
486
|
# statistics for Cufflinks transfrags.
|
460
487
|
# Cuffcompare clusters and tracks transfrags across multiple samples, writing
|
461
488
|
# matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
|
462
|
-
# file <outprefix>.combined.gtf containing a nonredundant set of transcripts
|
489
|
+
# file <outprefix>.combined.gtf containing a nonredundant set of transcripts
|
463
490
|
# across all input files (with a single representative transfrag chosen
|
464
491
|
# for each clique of matching transfrags across samples).
|
465
|
-
#
|
492
|
+
#
|
466
493
|
# Options:
|
467
494
|
# -i provide a text file with a list of Cufflinks GTF files to process instead
|
468
495
|
# of expecting them as command line arguments (useful when a large number
|
469
496
|
# of GTF files should be processed)
|
470
|
-
#
|
471
|
-
# -r a set of known mRNAs to use as a reference for assessing
|
497
|
+
#
|
498
|
+
# -r a set of known mRNAs to use as a reference for assessing
|
472
499
|
# the accuracy of mRNAs or gene models given in <input.gtf>
|
473
|
-
#
|
474
|
-
# -R for -r option, reduce the set of reference transcripts to
|
500
|
+
#
|
501
|
+
# -R for -r option, reduce the set of reference transcripts to
|
475
502
|
# only those found to overlap any of the input loci
|
476
503
|
# -M discard (ignore) single-exon transfrags and reference transcripts
|
477
504
|
# -N discard (ignore) single-exon reference transcripts
|
478
|
-
#
|
479
|
-
# -s <seq_path> can be a multi-fasta file with all the genomic sequences or
|
505
|
+
#
|
506
|
+
# -s <seq_path> can be a multi-fasta file with all the genomic sequences or
|
480
507
|
# a directory containing multiple single-fasta files (one file per contig);
|
481
508
|
# lower case bases will be used to classify input transcripts as repeats
|
482
|
-
#
|
509
|
+
#
|
483
510
|
# -d max distance (range) for grouping transcript start sites (100)
|
484
|
-
# -p the name prefix to use for consensus transcripts in the
|
511
|
+
# -p the name prefix to use for consensus transcripts in the
|
485
512
|
# <outprefix>.combined.gtf file (default: 'TCONS')
|
486
513
|
# -C include the "contained" transcripts in the .combined.gtf file
|
487
514
|
# -G generic GFF input file(s) (do not assume Cufflinks GTF)
|
488
515
|
# -T do not generate .tmap and .refmap files for each input file
|
489
|
-
# -V verbose processing mode (showing all GFF parsing warnings)
|
516
|
+
# -V verbose processing mode (showing all GFF parsing warnings)
|
490
517
|
class Compare
|
491
518
|
include Bio::Command::Wrapper
|
492
519
|
|
@@ -579,6 +606,145 @@ module Bio
|
|
579
606
|
end #load_compare_kb
|
580
607
|
end
|
581
608
|
end #Compare
|
609
|
+
|
610
|
+
# cuffmerge takes two or more Cufflinks GTF files and merges them into a
|
611
|
+
# single unified transcript catalog. Optionally, you can provide the script
|
612
|
+
# with a reference GTF, and the script will use it to attach gene names and other
|
613
|
+
# metadata to the merged catalog.
|
614
|
+
|
615
|
+
# Usage:
|
616
|
+
# cuffmerge [Options] <assembly_GTF_list.txt>
|
617
|
+
|
618
|
+
# Options:
|
619
|
+
# -h/--help Prints the help message and exits
|
620
|
+
# -o <output_dir> Directory where merged assembly will be written [ default: ./merged_asm ]
|
621
|
+
# -g/--ref-gtf An optional "reference" annotation GTF.
|
622
|
+
# -s/--ref-sequence <seq_dir>/<seq_fasta> Genomic DNA sequences for the reference.
|
623
|
+
# --min-isoform-fraction <0-1.0> Discard isoforms with abundance below this [ default: 0.05 ]
|
624
|
+
# -p/--num-threads <int> Use this many threads to merge assemblies. [ default: 1 ]
|
625
|
+
# --keep-tmp Keep all intermediate files during merge
|
626
|
+
class Merge
|
627
|
+
include Bio::Command::Wrapper
|
628
|
+
|
629
|
+
set_program Bio::Ngs::Utils.binary("cuffmerge")
|
630
|
+
|
631
|
+
add_option "output-dir", :type => :string, :aliases => '-o', :default => "merged_asm"
|
632
|
+
add_option "ref-gtf", :type => :string, :aliases => '-g'
|
633
|
+
add_option "ref-sequence", :type => :string, :aliases => '-s'
|
634
|
+
add_option "min-isoform-fraction", :type => :numeric, :aliases => '-m'
|
635
|
+
add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 6
|
636
|
+
add_option "keep-tmp", :type => :boolean, :aliases => 't'
|
637
|
+
end #Merge
|
638
|
+
|
639
|
+
# gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>]
|
640
|
+
# [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]
|
641
|
+
# [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]
|
642
|
+
# [-i <maxintron>]
|
643
|
+
# Filters and/or converts GFF3/GTF2 records.
|
644
|
+
# <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin
|
645
|
+
|
646
|
+
# Options:
|
647
|
+
# -g full path to a multi-fasta file with the genomic sequences
|
648
|
+
# for all input mappings, OR a directory with single-fasta files
|
649
|
+
# (one per genomic sequence, with file names matching sequence names)
|
650
|
+
# -s <seq_info.fsize> is a tab-delimited file providing this info
|
651
|
+
# for each of the mapped sequences:
|
652
|
+
# <seq-name> <seq-length> <seq-description>
|
653
|
+
# (useful for -A option with mRNA/EST/protein mappings)
|
654
|
+
# -i discard transcripts having an intron larger than <maxintron>
|
655
|
+
# -r only show transcripts overlapping coordinate range <start>..<end>
|
656
|
+
# (on chromosome/contig <chr>, strand <strand> if provided)
|
657
|
+
# -R for -r option, discard all transcripts that are not fully
|
658
|
+
# contained within the given range
|
659
|
+
# -U discard single-exon transcripts
|
660
|
+
# -C coding only: discard mRNAs that have no CDS feature
|
661
|
+
# -F full GFF attribute preservation (all attributes are shown)
|
662
|
+
# -G only parse additional exon attributes from the first exon
|
663
|
+
# and move them to the mRNA level (useful for GTF input)
|
664
|
+
# -A use the description field from <seq_info.fsize> and add it
|
665
|
+
# as the value for a 'descr' attribute to the GFF record
|
666
|
+
|
667
|
+
# -O process also non-transcript GFF records (by default non-transcript
|
668
|
+
# records are ignored)
|
669
|
+
# -V discard any mRNAs with CDS having in-frame stop codons
|
670
|
+
# -H for -V option, check and adjust the starting CDS phase
|
671
|
+
# if the original phase leads to a translation with an
|
672
|
+
# in-frame stop codon
|
673
|
+
# -B for -V option, single-exon transcripts are also checked on the
|
674
|
+
# opposite strand
|
675
|
+
# -N discard multi-exon mRNAs that have any intron with a non-canonical
|
676
|
+
# splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)
|
677
|
+
# -J discard any mRNAs that either lack initial START codon
|
678
|
+
# or the terminal STOP codon, or have an in-frame stop codon
|
679
|
+
# (only print mRNAs with a full, valid CDS)
|
680
|
+
|
681
|
+
# -M/--merge : cluster the input transcripts into loci, collapsing matching
|
682
|
+
# transcripts (those with the same exact introns and fully contained)
|
683
|
+
# -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>
|
684
|
+
# --cluster-only: same as --merge but without collapsing matching transcripts
|
685
|
+
# -K for -M option: also collapse shorter, fully contained transcripts
|
686
|
+
# with fewer introns than the container
|
687
|
+
# -Q for -M option, remove the containment restriction:
|
688
|
+
# (multi-exon transcripts will be collapsed if just their introns match,
|
689
|
+
# while single-exon transcripts can partially overlap (80%))
|
690
|
+
|
691
|
+
# -E expose (warn about) duplicate transcript IDs and other potential
|
692
|
+
# problems with the given GFF/GTF records
|
693
|
+
# -Z merge close exons into a single exon (for intron size<4)
|
694
|
+
# -w write a fasta file with spliced exons for each GFF transcript
|
695
|
+
# -x write a fasta file with spliced CDS for each GFF transcript
|
696
|
+
# -W for -w and -x options, also write for each fasta record the exon
|
697
|
+
# coordinates projected onto the spliced sequence
|
698
|
+
# -y write a protein fasta file with the translation of CDS for each record
|
699
|
+
# -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)
|
700
|
+
# -m <chr_replace> is a reference (genomic) sequence replacement table with
|
701
|
+
# this format:
|
702
|
+
# <original_ref_ID> <new_ref_ID>
|
703
|
+
# GFF records on reference sequences that are not found among the
|
704
|
+
# <original_ref_ID> entries in this file will be filtered out
|
705
|
+
# -o the "filtered" GFF records will be written to <outfile.gff>
|
706
|
+
# (use -o- for printing to stdout)
|
707
|
+
# -t use <trackname> in the second column of each GFF output line
|
708
|
+
# -T -o option will output GTF format instead of GFF3
|
709
|
+
class GffRead
|
710
|
+
include Bio::Command::Wrapper
|
711
|
+
|
712
|
+
set_program Bio::Ngs::Utils.binary("gffread")
|
713
|
+
use_aliases
|
714
|
+
|
715
|
+
add_option "genomic-sequence", :type => :string, :aliases => '-g'
|
716
|
+
add_option "seq-info", :type => :string, :aliases => '-s'
|
717
|
+
add_option "discard-transcripts", :type => :numeric, :aliases => '-i'
|
718
|
+
add_option "orverlap-coords", :type => :string, :aliases => '-r'
|
719
|
+
add_option "discard-not-overlap", :type => :string, :aliases => '-R'
|
720
|
+
add_option "discard-single-exon", :type => :boolean, :aliases => '-U'
|
721
|
+
add_option "coding-only", :type => :boolean, :aliases => '-C'
|
722
|
+
add_option "full-attributes", :type => :boolean, :aliases => '-F'
|
723
|
+
add_option "partial-attributes", :type => :boolean, :aliases => '-G'
|
724
|
+
add_option "description-field", :type => :string, :aliases => '-A'
|
725
|
+
add_option "also-non-transcripts", :type => :boolean, :aliases => '-O'
|
726
|
+
add_option "discard-in-frame-stop", :type => :boolean, :aliases => '-V'
|
727
|
+
add_option "adjust-codon-phase", :type => :boolean, :aliases => '-H'
|
728
|
+
add_option "single-exon-check-opposite", :type => :boolean, :aliases => '-B'
|
729
|
+
add_option "discard-multi-exon", :type => :boolean, :aliases => '-N'
|
730
|
+
add_option "discard-wrong-codon", :type => :boolean, :aliases => '-J'
|
731
|
+
add_option "merge", :type => :boolean, :aliases => '-M'
|
732
|
+
add_option "output-collapsing", :type => :string, :aliases => '-d'
|
733
|
+
add_option "cluster-only", :type => :boolean, :aliases => '-c'
|
734
|
+
add_option "collaps-contained", :type => :boolean, :aliases => '-K'
|
735
|
+
add_option "remove-containment-restriction", :type => :boolean, :aliases => '-Q'
|
736
|
+
add_option "warnings", :type => :boolean, :aliases => '-E'
|
737
|
+
add_option "merge-close-exons", :type => :boolean, :aliases => '-Z'
|
738
|
+
add_option "write-exon-fasta", :type => :boolean, :aliases => '-w'
|
739
|
+
add_option "write-cds-fasta", :type => :boolean, :aliases => '-x'
|
740
|
+
add_option "write-coords", :type => :boolean, :aliases => '-W'
|
741
|
+
add_option "write-protein-fasta", :type => :boolean, :aliases => '-y'
|
742
|
+
add_option "ensembl-to-gff3", :type => :boolean, :aliases => '-L'
|
743
|
+
add_option "chr-replace", :type => :string, :aliases => '-m'
|
744
|
+
add_option "output", :type => :string, :aliases => '-o', :default => "outfile.gtf", :collapse=>true
|
745
|
+
add_option "track-name", :type => :string, :aliases => '-t'
|
746
|
+
add_option "output-gtf", :type => :boolean, :aliases => '-T'
|
747
|
+
end # GffRead
|
582
748
|
end #Cufflinks
|
583
749
|
end #Ngs
|
584
|
-
end #Bio
|
750
|
+
end #Bio
|