bio-ngs 0.4.6.alpha.01 → 0.4.6.alpha.02

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/Gemfile +4 -2
  2. data/Gemfile.lock +21 -21
  3. data/README.rdoc +51 -4
  4. data/VERSION +1 -1
  5. data/bin/biongs +1 -0
  6. data/bio-ngs.gemspec +36 -8
  7. data/features/cufflinks_gtf_parser.feature +22 -0
  8. data/features/cufflinks_gtf_parser_indexing.feature +20 -0
  9. data/features/step_definitions/cufflinks_gtf.rb +30 -0
  10. data/features/step_definitions/cufflinks_gtf_parser_indexing.rb +53 -0
  11. data/features/support/env.rb +2 -0
  12. data/lib/bio-ngs.rb +19 -5
  13. data/lib/bio/appl/ngs/cufflinks.rb +447 -281
  14. data/lib/bio/appl/ngs/cufflinks/gtf/gtf.rb +23 -0
  15. data/lib/bio/appl/ngs/cufflinks/gtf/gtf_parser.rb +248 -0
  16. data/lib/bio/appl/ngs/cufflinks/gtf/transcript.rb +154 -0
  17. data/lib/bio/ngs/fs.rb +46 -0
  18. data/lib/bio/ngs/illumina/fastq.rb +176 -0
  19. data/lib/bio/ngs/illumina/illumina.rb +64 -0
  20. data/lib/bio/ngs/illumina/project.rb +81 -0
  21. data/lib/bio/ngs/illumina/sample.rb +85 -0
  22. data/lib/bio/ngs/task.rb +1 -1
  23. data/lib/bio/ngs/utils.rb +124 -112
  24. data/lib/meta.rb +162 -0
  25. data/lib/tasks/convert.thor +14 -14
  26. data/lib/tasks/filter.thor +158 -23
  27. data/lib/tasks/quality.thor +24 -4
  28. data/lib/tasks/rna.thor +26 -0
  29. data/lib/wrapper.rb +28 -0
  30. data/spec/bio/ngs/fs_spec.rb +70 -0
  31. data/spec/bio/ngs/illumina/fastq_spec.rb +52 -0
  32. data/spec/bio/ngs/illumina/illumina_spec.rb +21 -0
  33. data/spec/bio/ngs/illumina/project_spec.rb +0 -0
  34. data/spec/bio/ngs/illumina/sample_spec.rb +0 -0
  35. data/spec/bio/ngs/illumina/samples_spec.rb +0 -0
  36. data/spec/filter_spec.rb +25 -0
  37. data/spec/fixture/table_filter_list.txt +3 -0
  38. data/spec/fixture/table_filter_list_first_column.txt +2 -0
  39. data/spec/fixture/table_filter_source.tsv +44 -0
  40. data/spec/fixture/test-filtered-reference.fastq.gz +0 -0
  41. data/spec/fixture/test-merged-reference.fastq.gz +0 -0
  42. data/spec/fixture/test.fastq.gz +0 -0
  43. data/spec/meta_spec.rb +117 -0
  44. data/spec/spec_helper.rb +1 -1
  45. metadata +97 -69
@@ -0,0 +1,2 @@
1
+ $LOAD_PATH << File.expand_path('../../../lib', __FILE__)
2
+ require 'bio-ngs'
@@ -24,9 +24,12 @@ require 'bio'
24
24
  require 'active_record'
25
25
  require 'sqlite3'
26
26
 
27
- # NGS classes
27
+ #Generic classes
28
28
  require 'enumerable'
29
29
  require 'wrapper'
30
+ require 'meta'
31
+
32
+ # NGS classes
30
33
  require 'bio/ngs/utils'
31
34
  require 'bio/ngs/record'
32
35
  require 'bio/ngs/quality'
@@ -45,18 +48,29 @@ require 'bio/appl/ngs/sff_extract'
45
48
  require 'bio/appl/ngs/bcl2qseq' #TODO: FIX THIS BUGGY CODE in THOR TASK
46
49
 
47
50
  require 'bio/appl/ngs/cufflinks/iterators'
51
+ require 'bio/appl/ngs/cufflinks/gtf/gtf_parser'
52
+ require 'bio/appl/ngs/cufflinks/gtf/gtf'
53
+ require 'bio/appl/ngs/cufflinks/gtf/transcript'
48
54
  require 'bio/appl/ngs/cufflinks'
49
55
  require 'bio/appl/ngs/samtools'
50
56
  require 'bio/appl/ngs/fastx'
51
57
  require 'bio/appl/ngs/blast'
52
58
  require 'bio/appl/ngs/bwa'
53
59
 
54
- # history
60
+ #Illumina utility for projects
61
+ require 'bio/ngs/illumina/illumina'
62
+ require 'bio/ngs/fs'
63
+
64
+ # history
55
65
  Bio::Ngs::HISTORY_FILE = Dir.pwd+"/.task-history.yml"
56
66
  Bio::Ngs::Utils.extend_system_path
57
67
 
68
+
58
69
  # loading Tasks
59
- path = File.expand_path(File.dirname(__FILE__))
60
- Dir.glob(File.join(path,"tasks","*.thor")) do |thorfile|
61
- Thor::Util.load_thorfile(thorfile)
70
+ # TODO let the user define which tasks must be loaded, maybe a list of names
71
+ if Bio::Ngs.const_defined?(:LoadBaseTasks) && Bio::Ngs.const_get(:LoadBaseTasks)==true
72
+ path = File.expand_path(File.dirname(__FILE__))
73
+ Dir.glob(File.join(path,"tasks","*.thor")) do |thorfile|
74
+ Thor::Util.load_thorfile(thorfile)
75
+ end
62
76
  end
@@ -7,10 +7,8 @@
7
7
  #
8
8
  #
9
9
 
10
-
11
-
12
10
  module Bio
13
- module Ngs
11
+ module Ngs
14
12
  module Cufflinks
15
13
  VERSION = "1.0.X"
16
14
  class << self
@@ -19,229 +17,258 @@ module Bio
19
17
  end
20
18
  end
21
19
 
20
+ module MarkCall
21
+ def mark
22
+ puts caller.first #elaborate the concept of tracking but not here please.
23
+ end
24
+ end
25
+
22
26
 
23
- # cufflinks v1.3.0
24
- # linked against Boost version 104000
25
- # -----------------------------
26
- # Usage: cufflinks [options] <hits.sam>
27
- # General Options:
28
- # -o/--output-dir write all output files to this directory [ default: ./ ]
29
- # -p/--num-threads number of threads used during analysis [ default: 1 ]
30
- # --seed value of random number generator seed [ default: 0 ]
31
- # -G/--GTF quantitate against reference transcript annotations
32
- # -g/--GTF-guide use reference transcript annotation to guide assembly
33
- # -M/--mask-file ignore all alignment within transcripts in this file
34
- # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
35
- # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
36
- # --library-type library prep used for input reads [ default: below ]
37
- #
38
- # Advanced Abundance Estimation Options:
39
- # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
40
- # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
41
- # --upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
42
- # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
43
- # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
44
- # --compatible-hits-norm count hits compatible with reference RNAs only [ default: FALSE ]
45
- # --total-hits-norm count all hits for normalization [ default: TRUE ]
46
- #
47
- # Advanced Assembly Options:
48
- # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
49
- # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.10 ]
50
- # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
51
- # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
52
- # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.001 ]
53
- # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.09 ]
54
- # --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
55
- # --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
56
- # --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
57
- # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
58
- # --min-intron-length minimum intron size allowed in genome [ default: 50 ]
59
- # --trim-3-avgcov-thresh minimum avg coverage required to attempt 3' trimming [ default: 10 ]
60
- # --trim-3-dropoff-frac fraction of avg coverage below which to trim 3' end [ default: 0.1 ]
61
- #
62
- # Advanced Reference Annotation Guided Assembly Options:
63
- # --no-faux-reads disable tiling by faux reads [ default: FALSE ]
64
- # --3-overhang-tolerance overhang allowed on 3' end when merging with reference[ default: 600 ]
65
- # --intron-overhang-tolerance overhang allowed inside reference intron when merging [ default: 30 ]
66
- #
67
- # Advanced Program Behavior Options:
68
- # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
69
- # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
70
- # --no-update-check do not contact server to check for update availability[ default: FALSE ]
71
- #
72
- # Supported library types:
73
- # ff-firststrand
74
- # ff-secondstrand
75
- # ff-unstranded
76
- # fr-firststrand
77
- # fr-secondstrand
78
- # fr-unstranded (default)
79
- # transfrags
80
- class Quantification
81
-
82
- include Bio::Command::Wrapper
83
- include Bio::Ngs::Cufflinks::Utils
84
-
85
- set_program Bio::Ngs::Utils.binary("cufflinks")
86
-
87
- add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
88
- add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
89
- add_option "seed", :type => :numeric
90
- add_option "GTF", :type => :string, :aliases => '-G'
91
- add_option "GTF-guide", :type => :boolean, :aliases => '-g'
92
- add_option "mask-file", :type => :string, :aliases => '-M'
93
- add_option "frag-bias-correct", :type => :string, :aliases => '-b'
94
- add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
95
- add_option "library-type", :type => :string
96
- add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
97
- add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
98
- add_option "upper-quartile-norm", :type => :boolean
99
- add_option "max-mle-iterations", :type => :numeric#, :default => 5000
100
- add_option "num-importance-samples", :type => :numeric#, :default => 1000
101
- add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
102
- add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
103
- add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
104
- add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
105
- add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
106
- #deprecated add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
107
- add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
108
- add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
109
- add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
110
- add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
111
- add_option "overhang-tolerance", :type => :numeric#, :default => 8
112
- add_option "max-bundle-length", :type => :numeric #, :default => 3500000
113
- add_option "max-bundle-frags", :type => :numeric #, :default => 500000
114
- add_option "min-intron-length", :type => :numeric#, :default => 50
115
- add_option "trim-3-avgcov-thresh", :type => :numeric
116
- add_option "trim-3-dropoff-frac", :type => :numeric
117
- add_option "no-faux-reads", :type => :boolean
118
- add_option "3-overhang-tolerance", :type => :numeric
119
- add_option "intron-overhang-tolerance", :type => :numeric
120
- add_option "verbose", :type => :boolean, :aliases => '-v'
121
- add_option "quiet", :type => :boolean, :aliases => '-q'
122
-
123
- #deprecated add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
124
- #deprecated add_option "reference-seq", :type => :string, :aliases => '-r'
125
- #deprecated add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
126
-
127
- #TODO Check why with these defaults is not working properly
128
-
129
-
130
-
131
- add_iterator_for :genes
132
- add_iterator_for :isoforms
133
- end #Quantification
134
-
135
- # cuffdiff v1.3.0 (3022)
136
- # -----------------------------
137
- # Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
138
- # Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
139
- # General Options:
140
- # -o/--output-dir write all output files to this directory [ default: ./ ]
141
- # --seed value of random number generator seed [ default: 0 ]
142
- # -T/--time-series treat samples as a time-series [ default: FALSE ]
143
- # -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
144
- # --FDR False discovery rate used in testing [ default: 0.05 ]
145
- # -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
146
- # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
147
- # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
148
- # -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
149
- # -L/--labels comma-separated list of condition labels
150
- # -p/--num-threads number of threads used during quantification [ default: 1 ]
151
- #
152
- # Advanced Options:
153
- # --library-type Library prep used for input reads [ default: below ]
154
- # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
155
- # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
156
- # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
157
- # --num-bootstrap-samples Number of bootstrap replications [ default: 20 ]
158
- # --bootstrap-fraction Fraction of fragments in each bootstrap sample [ default: 1.0 ]
159
- # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
160
- # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
161
- # --total-hits-norm count all hits for normalization [ default: FALSE ]
162
- # --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
163
- # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
164
- # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
165
- # --no-update-check do not contact server to check for update availability[ default: FALSE ]
166
- # --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
167
- # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
168
- #
169
- # Debugging use only:
170
- # --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]
171
- # --no-read-pairs Break all read pairs [ default: FALSE ]
172
- # --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]
173
- # --cov-delta Maximum gap between bootstrap and IS [ default: 2.0 ]
174
- #
175
- # Supported library types:
176
- # ff-firststrand
177
- # ff-secondstrand
178
- # ff-unstranded
179
- # fr-firststrand
180
- # fr-secondstrand
181
- # fr-unstranded (default)
182
- # transfrags
183
- class Diff
184
- include Bio::Command::Wrapper
185
- include Bio::Ngs::Cufflinks::Utils
186
-
187
- set_program Bio::Ngs::Utils.binary("cuffdiff")
188
-
189
- add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
190
- add_option "seed", :type => :numeric
191
- add_option "time-series", :type => :boolean, :aliases => '-T'
192
- add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
193
- add_option "FDR", :type => :numeric, :aliases => '-F'
194
- #TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
195
- #TODO:FIX add_option "frag-bias-correct", :type =>
196
- add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
197
- add_option "upper-quartile-norm", :type => :boolean, :aliases => 'N'
198
- add_option "labels", :type => :array, :aliases => '-L'
199
- add_option "num-threads", :type => :numeric, :aliases => '-p'
200
- add_option "library-type", :type => :string, :aliases => '-l'
201
- add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
202
- add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
203
- add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
204
- add_option "num-bootstrap-samples", :type => :numeric
205
- add_option "bootstrap-fraction", :type => :numeric
206
- add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
207
- add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
208
- add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
209
- add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
210
- add_option "verbose", :type => :boolean, :aliases => '-v'
211
- add_option "quiet", :type => :boolean, :aliases => '-q'
212
- add_option "no-update-check", :type => :boolean, :aliases => '-j'
213
- add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
214
- add_option "max-bundle-frags", :type => :numeric
215
- add_option "read-skip-fraction", :type => :numeric
216
- add_option "no-read-pairs", :type => :numeric
217
- add_option "trim-read-length", :type => :numeric
218
- add_option "cov-delta", :type => :numeric
219
-
220
- #define iterators
221
- add_iterator_for :genes
222
- add_iterator_for :isoforms
223
- add_iterator_for :cds
224
- add_iterator_for :tss_groups
225
-
226
- #Examples
227
- #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
228
- #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
229
-
230
- class << self
231
-
232
- #Return the version of CuffDiff used to produce the output
233
- def version(diff)
234
- #cufflink_version_offset = Bio::Ngs::Cufflinks.version
235
- f=File.open(diff,'r')
236
- header=f.readline #skip header
237
- f.close
238
- cufflink_version_offset = case header.split.size
239
- when 12
240
- "0.9.X"
241
- when 14
242
- Bio::Ngs::Cufflinks.version #latest
243
- end
244
- end#version
27
+ # cufflinks v1.3.0
28
+ # linked against Boost version 104000
29
+ # -----------------------------
30
+ # Usage: cufflinks [options] <hits.sam>
31
+ # General Options:
32
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
33
+ # -p/--num-threads number of threads used during analysis [ default: 1 ]
34
+ # --seed value of random number generator seed [ default: 0 ]
35
+ # -G/--GTF quantitate against reference transcript annotations
36
+ # -g/--GTF-guide use reference transcript annotation to guide assembly
37
+ # -M/--mask-file ignore all alignment within transcripts in this file
38
+ # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
39
+ # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
40
+ # --library-type library prep used for input reads [ default: below ]
41
+ #
42
+ # Advanced Abundance Estimation Options:
43
+ # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
44
+ # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
45
+ # --upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
46
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
47
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
48
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: FALSE ]
49
+ # --total-hits-norm count all hits for normalization [ default: TRUE ]
50
+ #
51
+ # Advanced Assembly Options:
52
+ # -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
53
+ # -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.10 ]
54
+ # -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
55
+ # -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
56
+ # -a/--junc-alpha alpha for junction binomial test filter [ default: 0.001 ]
57
+ # -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.09 ]
58
+ # --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
59
+ # --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
60
+ # --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
61
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
62
+ # --min-intron-length minimum intron size allowed in genome [ default: 50 ]
63
+ # --trim-3-avgcov-thresh minimum avg coverage required to attempt 3' trimming [ default: 10 ]
64
+ # --trim-3-dropoff-frac fraction of avg coverage below which to trim 3' end [ default: 0.1 ]
65
+ #
66
+ # Advanced Reference Annotation Guided Assembly Options:
67
+ # --no-faux-reads disable tiling by faux reads [ default: FALSE ]
68
+ # --3-overhang-tolerance overhang allowed on 3' end when merging with reference[ default: 600 ]
69
+ # --intron-overhang-tolerance overhang allowed inside reference intron when merging [ default: 30 ]
70
+ #
71
+ # Advanced Program Behavior Options:
72
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
73
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
74
+ # --no-update-check do not contact server to check for update availability[ default: FALSE ]
75
+ #
76
+ # Supported library types:
77
+ # ff-firststrand
78
+ # ff-secondstrand
79
+ # ff-unstranded
80
+ # fr-firststrand
81
+ # fr-secondstrand
82
+ # fr-unstranded (default)
83
+ # transfrags
84
+ class Quantification
85
+ include Bio::Command::Wrapper
86
+ include Bio::Ngs::Cufflinks::Utils
87
+
88
+ set_program Bio::Ngs::Utils.binary("cufflinks")
89
+
90
+ add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
91
+ add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
92
+ add_option "seed", :type => :numeric
93
+ add_option "GTF", :type => :string, :aliases => '-G'
94
+ add_option "GTF-guide", :type => :string, :aliases => '-g'
95
+ add_option "mask-file", :type => :string, :aliases => '-M'
96
+ add_option "frag-bias-correct", :type => :string, :aliases => '-b'
97
+ add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
98
+ add_option "library-type", :type => :string
99
+ add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
100
+ add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
101
+ add_option "upper-quartile-norm", :type => :boolean
102
+ add_option "max-mle-iterations", :type => :numeric#, :default => 5000
103
+ add_option "num-importance-samples", :type => :numeric#, :default => 1000
104
+ add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
105
+ add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
106
+ add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
107
+ add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
108
+ add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
109
+ #deprecated add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
110
+ add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
111
+ add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
112
+ add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
113
+ add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
114
+ add_option "overhang-tolerance", :type => :numeric#, :default => 8
115
+ add_option "max-bundle-length", :type => :numeric #, :default => 3500000
116
+ add_option "max-bundle-frags", :type => :numeric #, :default => 500000
117
+ add_option "min-intron-length", :type => :numeric#, :default => 50
118
+ add_option "trim-3-avgcov-thresh", :type => :numeric
119
+ add_option "trim-3-dropoff-frac", :type => :numeric
120
+ add_option "no-faux-reads", :type => :boolean
121
+ add_option "3-overhang-tolerance", :type => :numeric
122
+ add_option "intron-overhang-tolerance", :type => :numeric
123
+ add_option "verbose", :type => :boolean, :aliases => '-v'
124
+ add_option "quiet", :type => :boolean, :aliases => '-q'
125
+
126
+ #deprecated add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
127
+ #deprecated add_option "reference-seq", :type => :string, :aliases => '-r'
128
+ #deprecated add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
129
+
130
+ #TODO Check why with these defaults is not working properly
131
+
132
+
133
+
134
+ add_iterator_for :genes
135
+ add_iterator_for :isoforms
136
+
137
+
138
+
139
+
140
+ end #Quantification
141
+
142
+
143
+ class QuantificationDenovo < Quantification
144
+ #set_program Bio::Ngs::Utils.binary("cufflinks")
145
+ delete_option "GTF"
146
+ #add_option "GTF-guide", :type => :string, :aliases => '-g'
147
+ # add_alias "GTF", "GTF-guide"
148
+
149
+ # returns new transcripts from a gff3 file; it creates the file if it doesn't exist
150
+ # gets only the brand-new ones.
151
+ def get_new_transcripts(file=nil, type="gtf")
152
+ # TODO implement conversion to gff3
153
+ file||= "transcripts.#{type}"
154
+ # if type=="gtf"
155
+ # unless File.exists?(file)
156
+ # to_gff3(File.dirname(File.absolute_path(file)))
157
+ # end
158
+ File.open()
159
+ end
160
+ end
161
+
162
+ # cuffdiff v1.3.0 (3022)
163
+ # -----------------------------
164
+ # Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
165
+ # Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
166
+ # General Options:
167
+ # -o/--output-dir write all output files to this directory [ default: ./ ]
168
+ # --seed value of random number generator seed [ default: 0 ]
169
+ # -T/--time-series treat samples as a time-series [ default: FALSE ]
170
+ # -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
171
+ # --FDR False discovery rate used in testing [ default: 0.05 ]
172
+ # -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
173
+ # -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
174
+ # -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
175
+ # -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
176
+ # -L/--labels comma-separated list of condition labels
177
+ # -p/--num-threads number of threads used during quantification [ default: 1 ]
178
+ #
179
+ # Advanced Options:
180
+ # --library-type Library prep used for input reads [ default: below ]
181
+ # -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
182
+ # -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
183
+ # --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
184
+ # --num-bootstrap-samples Number of bootstrap replications [ default: 20 ]
185
+ # --bootstrap-fraction Fraction of fragments in each bootstrap sample [ default: 1.0 ]
186
+ # --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
187
+ # --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
188
+ # --total-hits-norm count all hits for normalization [ default: FALSE ]
189
+ # --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
190
+ # -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
191
+ # -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
192
+ # --no-update-check do not contact server to check for update availability[ default: FALSE ]
193
+ # --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
194
+ # --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
195
+ #
196
+ # Debugging use only:
197
+ # --read-skip-fraction Skip a random subset of reads this size [ default: 0.0 ]
198
+ # --no-read-pairs Break all read pairs [ default: FALSE ]
199
+ # --trim-read-length Trim reads to be this long (keep 5' end) [ default: none ]
200
+ # --cov-delta Maximum gap between bootstrap and IS [ default: 2.0 ]
201
+ #
202
+ # Supported library types:
203
+ # ff-firststrand
204
+ # ff-secondstrand
205
+ # ff-unstranded
206
+ # fr-firststrand
207
+ # fr-secondstrand
208
+ # fr-unstranded (default)
209
+ # transfrags
210
+ class Diff
211
+ include Bio::Command::Wrapper
212
+ include Bio::Ngs::Cufflinks::Utils
213
+
214
+ set_program Bio::Ngs::Utils.binary("cuffdiff")
215
+
216
+ add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
217
+ add_option "seed", :type => :numeric
218
+ add_option "time-series", :type => :boolean, :aliases => '-T'
219
+ add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
220
+ add_option "FDR", :type => :numeric, :aliases => '-F'
221
+ #TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
222
+ #TODO:FIX add_option "frag-bias-correct", :type =>
223
+ add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
224
+ add_option "upper-quartile-norm", :type => :boolean, :aliases => 'N'
225
+ add_option "labels", :type => :array, :aliases => '-L'
226
+ add_option "num-threads", :type => :numeric, :aliases => '-p'
227
+ add_option "library-type", :type => :string, :aliases => '-l'
228
+ add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
229
+ add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
230
+ add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
231
+ add_option "num-bootstrap-samples", :type => :numeric
232
+ add_option "bootstrap-fraction", :type => :numeric
233
+ add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
234
+ add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
235
+ add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
236
+ add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
237
+ add_option "verbose", :type => :boolean, :aliases => '-v'
238
+ add_option "quiet", :type => :boolean, :aliases => '-q'
239
+ add_option "no-update-check", :type => :boolean, :aliases => '-j'
240
+ add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
241
+ add_option "max-bundle-frags", :type => :numeric
242
+ add_option "read-skip-fraction", :type => :numeric
243
+ add_option "no-read-pairs", :type => :numeric
244
+ add_option "trim-read-length", :type => :numeric
245
+ add_option "cov-delta", :type => :numeric
246
+
247
+ #define iterators
248
+ add_iterator_for :genes
249
+ add_iterator_for :isoforms
250
+ add_iterator_for :cds
251
+ add_iterator_for :tss_groups
252
+
253
+ #Examples
254
+ #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
255
+ #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
256
+
257
+ class << self
258
+
259
+ #Return the version of CuffDiff used to produce the output
260
+ def version(diff)
261
+ #cufflink_version_offset = Bio::Ngs::Cufflinks.version
262
+ f=File.open(diff,'r')
263
+ header=f.readline #skip header
264
+ f.close
265
+ cufflink_version_offset = case header.split.size
266
+ when 12
267
+ "0.9.X"
268
+ when 14
269
+ Bio::Ngs::Cufflinks.version #latest
270
+ end
271
+ end#version
245
272
 
246
273
 
247
274
  def offset_by_version(cufflinks_version)
@@ -252,7 +279,7 @@ module Bio
252
279
  1
253
280
  end
254
281
  end
255
-
282
+
256
283
  #write a file with the information
257
284
  #See process_de for options available
258
285
  # Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
@@ -268,16 +295,16 @@ module Bio
268
295
  #See process_de for options available
269
296
  # Example: Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp.diff",
270
297
  # "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
271
- # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
298
+ # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
272
299
  def genes(diff, gtf, options={})
273
300
  process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
274
- # puts diff_reference
275
- # puts fpkm_values
301
+ # puts diff_reference
302
+ # puts fpkm_values
276
303
  # "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
277
- #do not use th gtf kb
304
+ #do not use the gtf kb
278
305
  "#{dict_info[:winner].first}\t#{dict_info[:gene_name]}\t#{fpkm_values.join("\t")}"
279
306
  end
280
- end #genes
307
+ end #genes
281
308
 
282
309
  private
283
310
  #Options hash
@@ -297,7 +324,7 @@ module Bio
297
324
  force_not_significative = options[:force_not_significative] || false
298
325
 
299
326
  #set up the kb if not available = pass an option with the path of the kb ?
300
- gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
327
+ gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf) ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
301
328
 
302
329
  #convert log2 fold value into natural log value (internally computed by cuffdiff)
303
330
  fold_log2 = fold
@@ -311,7 +338,7 @@ module Bio
311
338
 
312
339
  File.open(diff,'r') do |f|
313
340
  header=f.readline #skip header
314
-
341
+
315
342
  test_id_idx = 0
316
343
  gene_name_idx = 2
317
344
  q_first_idx = 3 + cufflink_version_offset
@@ -320,76 +347,76 @@ module Bio
320
347
  fpkm_second_idx = 7 + cufflink_version_offset
321
348
  fold_idx = 8 + cufflink_version_offset
322
349
  significant_idx = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
323
-
350
+
324
351
  #Notes:
325
352
  # for each line of the diff, store the expression value of every test,
326
353
  # i.e. the fpkm and whether it is significant or not
327
354
 
328
355
  f.each_line do |line|
329
356
  data=line.split
330
-
357
+
331
358
  #fix comparison t-test, remove negative symbol and invert comparison: if fold change q1 vs q2 <0 abs(foldchange) & swap q1,q2
332
- # puts data[fold_idx].to_f
333
- #delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
359
+ # puts data[fold_idx].to_f
360
+ #delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
334
361
  if data[fold_idx].to_f<0
335
362
  data[fold_idx]=data[fold_idx][1..-1] #.sub(/-/,"") remove the minus symbol from the number, the values q1, q2 and their fpkm will be reorganized into the data structure
336
- else
337
- # puts "ciao"
363
+ else
364
+ # puts "ciao"
338
365
  data[fpkm_first_idx],data[fpkm_second_idx]=data[fpkm_second_idx],data[fpkm_first_idx]
339
366
  data[q_first_idx],data[q_second_idx]=data[q_second_idx],data[q_first_idx]
340
- #delete puts "#{q_first_idx},#{q_second_idx}"
367
+ #delete puts "#{q_first_idx},#{q_second_idx}"
341
368
  end
342
- #delete puts "#{q_first_idx},#{q_second_idx}"
343
- #delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
344
- #delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
369
+ #delete puts "#{q_first_idx},#{q_second_idx}"
370
+ #delete puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
371
+ #delete puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
345
372
 
346
373
 
347
374
  #0 TCONS
348
375
  #4 name sample is the max diff for the item
349
376
  #5 name sample is the less diff for the item
350
- #9 is the fold
377
+ #9 is the fold
351
378
  dict_samples[data[q_first_idx]]
352
379
  dict_samples[data[q_second_idx]]
353
380
 
354
381
  #7 is the fpkm value of max pop/sample
355
382
  #8 is the fpkm value of min pop/sample
356
383
  k_reference = data[test_id_idx].to_sym #This can be TCONS if isoforms or XLOC if genes
357
-
384
+
358
385
  unless dict[k_reference].key?(:values)
359
386
  dict[k_reference][:values]={}
360
387
  dict[k_reference][:gene_name]=data[gene_name_idx]
361
- end
388
+ end
362
389
  dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
363
- dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
364
-
390
+ dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
391
+
365
392
  if ((only_significative==true && data[significant_idx]=="yes") || ((data[significant_idx]=="yes"||force_not_significative) && data[fold_idx].to_f>=fold)) && data[fpkm_first_idx].to_f>=min_fpkm && data[fpkm_second_idx].to_f>=min_fpkm
366
-
367
- ###### puts data.join(" ") if k_reference == :XLOC_017497
393
+
394
+ ###### puts data.join(" ") if k_reference == :XLOC_017497
368
395
  #TODO refactor: this can be done using lambda
369
396
  k_sample = ""
370
397
  if regulated==:up
371
398
 
372
399
  k_sample = data[q_first_idx].to_sym
373
- #delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
400
+ #delete puts "#{k_sample} #{data[q_second_idx].to_sym}"
374
401
  dict[k_reference][k_sample]<<data[q_second_idx].to_sym
375
- #delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
402
+ #delete puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
376
403
  k_sample
377
404
  elsif regulated==:down
378
405
  k_sample = data[q_second_idx].to_sym
379
406
  dict[k_reference][k_sample]<<data[q_first_idx].to_sym
380
- k_sample
407
+ k_sample
381
408
  end
382
-
383
- #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
409
+
410
+ #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
384
411
  #delete puts dict.inspect
385
- #store fpkm values as well for each pop/sample it should be
412
+ #store fpkm values as well for each pop/sample it should be
386
413
  if dict[k_reference][k_sample].size >= min_samples
387
414
  (dict[k_reference][:winner] << k_sample).uniq!
388
415
  end
389
- #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
416
+ #delete puts dict[k_reference].inspect if k_reference == :XLOC_017497
390
417
  else
391
418
  # k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
392
- #
419
+ #
393
420
  # unless dict[k_reference].key?(:values)
394
421
  # dict[k_reference][:values]={}
395
422
  # end
@@ -398,19 +425,19 @@ module Bio
398
425
  # dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
399
426
  # #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
400
427
  end
401
- #delete puts dict[k_reference].inspect
402
-
428
+ #delete puts dict[k_reference].inspect
429
+
403
430
  end #each line
404
- #example structure
405
- #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
431
+ #example structure
432
+ #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
406
433
  end #file.open
407
434
 
408
435
  file_lines =[]
409
436
  dict.each do |diff_reference, dict_info|
410
-
437
+
411
438
  if dict_info.key?(:winner)
412
439
  #puts dict_info.inspect
413
-
440
+
414
441
  #BAD PERFORMANCES use lambda
415
442
  valz = case z_scores
416
443
  when true
@@ -419,13 +446,13 @@ module Bio
419
446
  stdev = items.standard_deviation
420
447
  items.map do |fpkm|
421
448
  (fpkm-average)/stdev
422
- end
449
+ end
423
450
  when false
424
451
  dict_info[:values].sort.map{|sample| sample[1]}
425
452
  end #case
426
453
 
427
454
  #TODO generalize to isoforms and genes now only isoforms
428
- # puts yield(dict_info, diff_reference, gtf_kb, valz) if diff_reference == :XLOC_017497
455
+ # puts yield(dict_info, diff_reference, gtf_kb, valz) if diff_reference == :XLOC_017497
429
456
  file_lines<< yield(dict_info, diff_reference, gtf_kb, valz) #fpkm_values
430
457
  #file_lines<<"#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:nearest_ref]}_#{gtf_kb[diff_reference][:gene_name]}\t#{valz.join("\t")}"
431
458
  else
@@ -451,42 +478,42 @@ module Bio
451
478
  # cuffcompare v1.0.2 (2335)
452
479
  # -----------------------------
453
480
  # Usage:
454
- # cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
455
- # [-o <outprefix>] [-p <cprefix>]
481
+ # cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
482
+ # [-o <outprefix>] [-p <cprefix>]
456
483
  # {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
457
- #
484
+ #
458
485
  # Cuffcompare provides classification, reference annotation mapping and various
459
486
  # statistics for Cufflinks transfrags.
460
487
  # Cuffcompare clusters and tracks transfrags across multiple samples, writing
461
488
  # matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
462
- # file <outprefix>.combined.gtf containing a nonredundant set of transcripts
489
+ # file <outprefix>.combined.gtf containing a nonredundant set of transcripts
463
490
  # across all input files (with a single representative transfrag chosen
464
491
  # for each clique of matching transfrags across samples).
465
- #
492
+ #
466
493
  # Options:
467
494
  # -i provide a text file with a list of Cufflinks GTF files to process instead
468
495
  # of expecting them as command line arguments (useful when a large number
469
496
  # of GTF files should be processed)
470
- #
471
- # -r a set of known mRNAs to use as a reference for assessing
497
+ #
498
+ # -r a set of known mRNAs to use as a reference for assessing
472
499
  # the accuracy of mRNAs or gene models given in <input.gtf>
473
- #
474
- # -R for -r option, reduce the set of reference transcripts to
500
+ #
501
+ # -R for -r option, reduce the set of reference transcripts to
475
502
  # only those found to overlap any of the input loci
476
503
  # -M discard (ignore) single-exon transfrags and reference transcripts
477
504
  # -N discard (ignore) single-exon reference transcripts
478
- #
479
- # -s <seq_path> can be a multi-fasta file with all the genomic sequences or
505
+ #
506
+ # -s <seq_path> can be a multi-fasta file with all the genomic sequences or
480
507
  # a directory containing multiple single-fasta files (one file per contig);
481
508
  # lower case bases will be used to classify input transcripts as repeats
482
- #
509
+ #
483
510
  # -d max distance (range) for grouping transcript start sites (100)
484
- # -p the name prefix to use for consensus transcripts in the
511
+ # -p the name prefix to use for consensus transcripts in the
485
512
  # <outprefix>.combined.gtf file (default: 'TCONS')
486
513
  # -C include the "contained" transcripts in the .combined.gtf file
487
514
  # -G generic GFF input file(s) (do not assume Cufflinks GTF)
488
515
  # -T do not generate .tmap and .refmap files for each input file
489
- # -V verbose processing mode (showing all GFF parsing warnings)
516
+ # -V verbose processing mode (showing all GFF parsing warnings)
490
517
  class Compare
491
518
  include Bio::Command::Wrapper
492
519
 
@@ -579,6 +606,145 @@ module Bio
579
606
  end #load_compare_kb
580
607
  end
581
608
  end #Compare
609
+
610
+ # cuffmerge takes two or more Cufflinks GTF files and merges them into a
611
+ # single unified transcript catalog. Optionally, you can provide the script
612
+ # with a reference GTF, and the script will use it to attach gene names and other
613
+ # metadata to the merged catalog.
614
+
615
+ # Usage:
616
+ # cuffmerge [Options] <assembly_GTF_list.txt>
617
+
618
+ # Options:
619
+ # -h/--help Prints the help message and exits
620
+ # -o <output_dir> Directory where merged assembly will be written [ default: ./merged_asm ]
621
+ # -g/--ref-gtf An optional "reference" annotation GTF.
622
+ # -s/--ref-sequence <seq_dir>/<seq_fasta> Genomic DNA sequences for the reference.
623
+ # --min-isoform-fraction <0-1.0> Discard isoforms with abundance below this [ default: 0.05 ]
624
+ # -p/--num-threads <int> Use this many threads to merge assemblies. [ default: 1 ]
625
+ # --keep-tmp Keep all intermediate files during merge
626
+ class Merge
627
+ include Bio::Command::Wrapper
628
+
629
+ set_program Bio::Ngs::Utils.binary("cuffmerge")
630
+
631
+ add_option "output-dir", :type => :string, :aliases => '-o', :default => "merged_asm"
632
+ add_option "ref-gtf", :type => :string, :aliases => '-g'
633
+ add_option "ref-sequence", :type => :string, :aliases => '-s'
634
+ add_option "min-isoform-fraction", :type => :numeric, :aliases => '-m'
635
+ add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 6
636
+ add_option "keep-tmp", :type => :boolean, :aliases => 't'
637
+ end #Merge
638
+
639
+ # gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>]
640
+ # [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]
641
+ # [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]
642
+ # [-i <maxintron>]
643
+ # Filters and/or converts GFF3/GTF2 records.
644
+ # <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin
645
+
646
+ # Options:
647
+ # -g full path to a multi-fasta file with the genomic sequences
648
+ # for all input mappings, OR a directory with single-fasta files
649
+ # (one per genomic sequence, with file names matching sequence names)
650
+ # -s <seq_info.fsize> is a tab-delimited file providing this info
651
+ # for each of the mapped sequences:
652
+ # <seq-name> <seq-length> <seq-description>
653
+ # (useful for -A option with mRNA/EST/protein mappings)
654
+ # -i discard transcripts having an intron larger than <maxintron>
655
+ # -r only show transcripts overlapping coordinate range <start>..<end>
656
+ # (on chromosome/contig <chr>, strand <strand> if provided)
657
+ # -R for -r option, discard all transcripts that are not fully
658
+ # contained within the given range
659
+ # -U discard single-exon transcripts
660
+ # -C coding only: discard mRNAs that have no CDS feature
661
+ # -F full GFF attribute preservation (all attributes are shown)
662
+ # -G only parse additional exon attributes from the first exon
663
+ # and move them to the mRNA level (useful for GTF input)
664
+ # -A use the description field from <seq_info.fsize> and add it
665
+ # as the value for a 'descr' attribute to the GFF record
666
+
667
+ # -O process also non-transcript GFF records (by default non-transcript
668
+ # records are ignored)
669
+ # -V discard any mRNAs with CDS having in-frame stop codons
670
+ # -H for -V option, check and adjust the starting CDS phase
671
+ # if the original phase leads to a translation with an
672
+ # in-frame stop codon
673
+ # -B for -V option, single-exon transcripts are also checked on the
674
+ # opposite strand
675
+ # -N discard multi-exon mRNAs that have any intron with a non-canonical
676
+ # splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)
677
+ # -J discard any mRNAs that either lack initial START codon
678
+ # or the terminal STOP codon, or have an in-frame stop codon
679
+ # (only print mRNAs with a full, valid CDS)
680
+
681
+ # -M/--merge : cluster the input transcripts into loci, collapsing matching
682
+ # transcripts (those with the same exact introns and fully contained)
683
+ # -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>
684
+ # --cluster-only: same as --merge but without collapsing matching transcripts
685
+ # -K for -M option: also collapse shorter, fully contained transcripts
686
+ # with fewer introns than the container
687
+ # -Q for -M option, remove the containment restriction:
688
+ # (multi-exon transcripts will be collapsed if just their introns match,
689
+ # while single-exon transcripts can partially overlap (80%))
690
+
691
+ # -E expose (warn about) duplicate transcript IDs and other potential
692
+ # problems with the given GFF/GTF records
693
+ # -Z merge close exons into a single exon (for intron size<4)
694
+ # -w write a fasta file with spliced exons for each GFF transcript
695
+ # -x write a fasta file with spliced CDS for each GFF transcript
696
+ # -W for -w and -x options, also write for each fasta record the exon
697
+ # coordinates projected onto the spliced sequence
698
+ # -y write a protein fasta file with the translation of CDS for each record
699
+ # -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)
700
+ # -m <chr_replace> is a reference (genomic) sequence replacement table with
701
+ # this format:
702
+ # <original_ref_ID> <new_ref_ID>
703
+ # GFF records on reference sequences that are not found among the
704
+ # <original_ref_ID> entries in this file will be filtered out
705
+ # -o the "filtered" GFF records will be written to <outfile.gff>
706
+ # (use -o- for printing to stdout)
707
+ # -t use <trackname> in the second column of each GFF output line
708
+ # -T -o option will output GTF format instead of GFF3
709
+ class GffRead
710
+ include Bio::Command::Wrapper
711
+
712
+ set_program Bio::Ngs::Utils.binary("gffread")
713
+ use_aliases
714
+
715
+ add_option "genomic-sequence", :type => :string, :aliases => '-g'
716
+ add_option "seq-info", :type => :string, :aliases => '-s'
717
+ add_option "discard-transcripts", :type => :numeric, :aliases => '-i'
718
+ add_option "orverlap-coords", :type => :string, :aliases => '-r'
719
+ add_option "discard-not-overlap", :type => :string, :aliases => '-R'
720
+ add_option "discard-single-exon", :type => :boolean, :aliases => '-U'
721
+ add_option "coding-only", :type => :boolean, :aliases => '-C'
722
+ add_option "full-attributes", :type => :boolean, :aliases => '-F'
723
+ add_option "partial-attributes", :type => :boolean, :aliases => '-G'
724
+ add_option "description-field", :type => :string, :aliases => '-A'
725
+ add_option "also-non-transcripts", :type => :boolean, :aliases => '-O'
726
+ add_option "discard-in-frame-stop", :type => :boolean, :aliases => '-V'
727
+ add_option "adjust-codon-phase", :type => :boolean, :aliases => '-H'
728
+ add_option "single-exon-check-opposite", :type => :boolean, :aliases => '-B'
729
+ add_option "discard-multi-exon", :type => :boolean, :aliases => '-N'
730
+ add_option "discard-wrong-codon", :type => :boolean, :aliases => '-J'
731
+ add_option "merge", :type => :boolean, :aliases => '-M'
732
+ add_option "output-collapsing", :type => :string, :aliases => '-d'
733
+ add_option "cluster-only", :type => :boolean, :aliases => '-c'
734
+ add_option "collaps-contained", :type => :boolean, :aliases => '-K'
735
+ add_option "remove-containment-restriction", :type => :boolean, :aliases => '-Q'
736
+ add_option "warnings", :type => :boolean, :aliases => '-E'
737
+ add_option "merge-close-exons", :type => :boolean, :aliases => '-Z'
738
+ add_option "write-exon-fasta", :type => :boolean, :aliases => '-w'
739
+ add_option "write-cds-fasta", :type => :boolean, :aliases => '-x'
740
+ add_option "write-coords", :type => :boolean, :aliases => '-W'
741
+ add_option "write-protein-fasta", :type => :boolean, :aliases => '-y'
742
+ add_option "ensembl-to-gff3", :type => :boolean, :aliases => '-L'
743
+ add_option "chr-replace", :type => :string, :aliases => '-m'
744
+ add_option "output", :type => :string, :aliases => '-o', :default => "outfile.gtf", :collapse=>true
745
+ add_option "track-name", :type => :string, :aliases => '-t'
746
+ add_option "output-gtf", :type => :boolean, :aliases => '-T'
747
+ end # GffRead
582
748
  end #Cufflinks
583
749
  end #Ngs
584
- end #Bio
750
+ end #Bio