bio-ngs 0.3.2.alpha.01
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
module Bio
  module Ngs
    # Command wrappers for the blastn and blastx programs.
    #
    # Blast mixes in Bio::Command::Wrapper; each subclass binds one binary
    # (resolved through Bio::Ngs::Utils.binary) and declares the command
    # line options it accepts. Every option is declared exactly once.
    class Blast

      include Bio::Command::Wrapper

      # Wrapper for the blastn binary.
      class BlastN < Blast
        set_program Bio::Ngs::Utils.binary("blastn")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "word_size", :type => :string, :desc => "Word size"
        add_option "task", :type => :string, :desc => "Task type", :default => "blastn"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end

      # Wrapper for the blastx binary.
      class BlastX < Blast
        set_program Bio::Ngs::Utils.binary("blastx")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#
|
2
|
+
# bowtie-inspect.rb - Wrapper for bowtie-inspect
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2011
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
# Usage: bowtie-inspect [options]* <ebwt_base>
|
12
|
+
# <ebwt_base> ebwt filename minus trailing .1.ebwt/.2.ebwt
|
13
|
+
#
|
14
|
+
# By default, prints FASTA records of the indexed nucleotide sequences to
|
15
|
+
# standard out. With -n, just prints names. With -s, just prints a summary of
|
16
|
+
# the index parameters and sequences. With -e, preserves colors if applicable.
|
17
|
+
#
|
18
|
+
# Options:
|
19
|
+
# -a/--across <int> Number of characters across in FASTA output (default: 60)
|
20
|
+
# -n/--names Print reference sequence names only
|
21
|
+
# -s/--summary Print summary incl. ref names, lengths, index properties
|
22
|
+
# -e/--ebwt-ref Reconstruct reference from ebwt (slow, preserves colors)
|
23
|
+
# -v/--verbose Verbose output (for debugging)
|
24
|
+
# -h/--help print detailed description of tool and its options
|
25
|
+
# --help print this usage message
|
26
|
+
|
27
|
+
|
28
|
+
module Bio
  module Ngs
    # Command wrapper for the bowtie-inspect tool; its output is taken
    # from standard output.
    class BowtieInspect

      include Bio::Command::Wrapper

      # Design notes from the author:
      # - the user should provide a complete path to the tool;
      # - it would be better to identify the program from just its name,
      #   looking into the ext/ directory or the host system path;
      # - why not derive the file name from the class name when none is
      #   specified?
      set_program Bio::Ngs::Utils.binary("bowtie-inspect")

      set_output :stdout

      add_option "across", :type => :numeric, :aliases => '-a'

      # The remaining switches are plain booleans, declared in this order.
      [
        ["names",    '-n'],
        ["summary",  '-s'],
        ["ebwt-ref", '-e'],
        ["verbose",  '-v']
      ].each do |switch, short|
        add_option switch, :type => :boolean, :aliases => short
      end
    end # BowtieInspect
  end # Ngs
end # Bio
|
@@ -0,0 +1,489 @@
|
|
1
|
+
#
|
2
|
+
# cufflinks.rb - Wrappers for cufflinks, cuffdiff and cuffcompare
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2011
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
|
12
|
+
module Bio
|
13
|
+
module Ngs
|
14
|
+
module Cufflinks
|
15
|
+
VERSION = "1.0.X"
|
16
|
+
# Version label of the Cufflinks release these wrappers target
# (the VERSION constant of this module).
def self.version
  VERSION
end
|
21
|
+
|
22
|
+
|
23
|
+
# cufflinks v1.0.2 (2335)
|
24
|
+
# linked against Boost version 104000
|
25
|
+
# -----------------------------
|
26
|
+
# Usage: cufflinks [options] <hits.sam>
|
27
|
+
# Options:
|
28
|
+
#
|
29
|
+
# -p/--num-threads number of threads used during analysis [ default: 1 ]
|
30
|
+
# -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
|
31
|
+
# -G/--GTF quantitate against reference transcript annotations
|
32
|
+
# -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.15 ]
|
33
|
+
# -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
|
34
|
+
# -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
|
35
|
+
# -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
|
36
|
+
# -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
|
37
|
+
# -M/--mask-file ignore all alignment within transcripts in this file
|
38
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
39
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
40
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
41
|
+
# -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
|
42
|
+
#
|
43
|
+
# Advanced Options:
|
44
|
+
#
|
45
|
+
# -N/--quartile-normalization use quartile normalization instead of total counts [ default: FALSE ]
|
46
|
+
# -a/--junc-alpha alpha for junction binomial test filter [ default: 0.01 ]
|
47
|
+
# -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.12 ]
|
48
|
+
# -m/--frag-len-mean the average fragment length [ default: 200 ]
|
49
|
+
# -s/--frag-len-std-dev the fragment length standard deviation [ default: 80 ]
|
50
|
+
# --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
|
51
|
+
# --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
|
52
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
53
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
54
|
+
# --library-type Library prep used for input reads [ default: below ]
|
55
|
+
# --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
|
56
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
57
|
+
# --min-intron-length minimum intron size allowed in genome [ default: 50 ]
|
58
|
+
# Supported library types:
|
59
|
+
# ff-firststrand
|
60
|
+
# ff-secondstrand
|
61
|
+
# ff-unstranded
|
62
|
+
# fr-firststrand
|
63
|
+
# fr-secondstrand
|
64
|
+
# fr-unstranded (default)
|
65
|
+
# transfrags
|
66
|
+
# Command wrapper for the cufflinks binary. The options mirror the usage
# text reproduced above this class.
class Quantification

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("cufflinks")

  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
  add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
  add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
  add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
  add_option "mask-file", :type => :string, :aliases => '-M'
  add_option "verbose", :type => :boolean, :aliases => '-v'
  add_option "quiet", :type => :boolean, :aliases => '-q'
  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
  add_option "reference-seq", :type => :string, :aliases => '-r'
  add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
  #TODO Check why with these defaults is not working properly
  # The option name must match the tool's --frag-len-mean switch.
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
  add_option "num-importance-samples", :type => :numeric#, :default => 1000
  add_option "max-mle-iterations", :type => :numeric#, :default => 5000
  add_option "library-type", :type => :string
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
  add_option "min-intron-length", :type => :numeric#, :default => 50
end #Quantification
|
100
|
+
|
101
|
+
# cuffdiff v1.0.2 (2336)
|
102
|
+
# -----------------------------
|
103
|
+
# Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
|
104
|
+
# Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
|
105
|
+
# General Options:
|
106
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
107
|
+
# -T/--time-series treat samples as a time-series [ default: FALSE ]
|
108
|
+
# -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
|
109
|
+
# --FDR False discovery rate used in testing [ default: 0.05 ]
|
110
|
+
# -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
|
111
|
+
# -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
|
112
|
+
# -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
|
113
|
+
# -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
|
114
|
+
# -L/--labels comma-separated list of condition labels
|
115
|
+
# -p/--num-threads number of threads used during quantification [ default: 1 ]
|
116
|
+
#
|
117
|
+
# Advanced Options:
|
118
|
+
# --library-type Library prep used for input reads [ default: below ]
|
119
|
+
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
120
|
+
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
121
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
122
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
123
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
|
124
|
+
# --total-hits-norm count all hits for normalization [ default: FALSE ]
|
125
|
+
# --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
|
126
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
127
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
128
|
+
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
129
|
+
# --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
|
130
|
+
#
|
131
|
+
# Supported library types:
|
132
|
+
# ff-firststrand
|
133
|
+
# ff-secondstrand
|
134
|
+
# ff-unstranded
|
135
|
+
# fr-firststrand
|
136
|
+
# fr-secondstrand
|
137
|
+
# fr-unstranded (default)
|
138
|
+
# transfrags
|
139
|
+
class Diff
|
140
|
+
# Command line definition for the cuffdiff binary. Options follow the
# usage text reproduced above this class; every alias carries its leading
# dash.
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffdiff")

add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
add_option "time-series", :type => :boolean, :aliases => '-T'
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
add_option "FDR", :type => :numeric, :aliases => '-F'
#TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
#TODO:FIX add_option "frag-bias-correct", :type =>
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
add_option "upper-quartile-norm", :type => :boolean, :aliases => '-N'
add_option "labels", :type => :array, :aliases => '-L'
add_option "num-threads", :type => :numeric, :aliases => '-p'
add_option "library-type", :type => :string, :aliases => '-l'
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
add_option "verbose", :type => :boolean, :aliases => '-v'
add_option "quiet", :type => :boolean, :aliases => '-q'
add_option "no-update-check", :type => :boolean, :aliases => '-j'
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
|
166
|
+
|
167
|
+
#Examples
|
168
|
+
#Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
|
169
|
+
#Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
|
170
|
+
|
171
|
+
class << self
|
172
|
+
|
173
|
+
# Return the version label of the CuffDiff release that produced a file,
# judged by the number of whitespace-separated columns in its header line.
#
# diff:: path to a CuffDiff output file
#
# Returns "0.9.X" for a 12-column header, Bio::Ngs::Cufflinks.version for
# a 14-column header and nil for any other layout.
# Raises EOFError when the file is empty.
def version(diff)
  # Block form guarantees the handle is closed even when readline raises.
  header = File.open(diff, 'r') { |f| f.readline }
  case header.split.size
  when 12
    "0.9.X"
  when 14
    Bio::Ngs::Cufflinks.version #latest
  end
end#version
|
186
|
+
|
187
|
+
|
188
|
+
# Column offset to apply when reading a CuffDiff file written by the given
# version label: 0 for "0.9.X", 1 for "1.0.X", nil for anything else.
def offset_by_version(cufflinks_version)
  if "0.9.X" == cufflinks_version
    0
  elsif "1.0.X" == cufflinks_version
    1
  end
end
|
196
|
+
|
197
|
+
# Write a file with the isoforms selected from a CuffDiff output.
# Each line holds the first winning sample, the "<nearest_ref>_<gene_name>"
# label taken from the GTF knowledge base and the tab separated values
# handed over by process_de. See process_de for the available options.
# Example: Bio::Ngs::Cufflinks::Diff.isoforms("DE/isoform_exp.diff",
#          "COMPARE/Compare.combined.gtf",
#          fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
def isoforms(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    winner = info[:winner].first
    label = "#{kb[reference][:nearest_ref]}_#{kb[reference][:gene_name]}"
    [winner, label, values.join("\t")].join("\t")
  end
end #isoform
|
207
|
+
|
208
|
+
# Write a file with the genes selected from a CuffDiff output.
# Each line holds the first winning sample, the gene name taken from the
# GTF knowledge base and the tab separated values handed over by
# process_de. See process_de for the available options.
# Example: Bio::Ngs::Cufflinks::Diff.genes("DE/gene_exp.diff",
#          "COMPARE/Compare.combined.gtf",
#          fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
def genes(diff, gtf, options={})
  process_de(diff, gtf, options) do |info, reference, kb, values|
    winner = info[:winner].first
    gene_name = kb[reference][:gene_name]
    [winner, gene_name, values.join("\t")].join("\t")
  end
end #genes
|
218
|
+
|
219
|
+
private
|
220
|
+
# Filter a CuffDiff output file and write the selected records to a text
# file created in the same directory as +diff+.
#
# diff:: path to a CuffDiff .diff file
# gtf::  path to the CuffCompare GTF; its knowledge base is loaded from the
#        existing dump when there is one, otherwise it is built
# options:: hash with
#   :fold (float, log2 threshold, default 0.0)
#   :min_samples (integer, default 0)
#   :min_fpkm (float, default 0.0)
#   :only_significative (boolean, default false)
#   :z_scores (boolean, default false)
#   :regulated (symbol :up or :down, default :up)
#
# For every reference that has a winner the block is called with
# (dict_info, diff_reference, gtf_kb, values) and must return the line to
# write; values are the FPKM of the reference ordered by sample name, or
# their z-scores when :z_scores is set.
#
# Returns the sorted array of written lines.
# Raises ArgumentError when :regulated is neither :up nor :down.
def process_de(diff, gtf, options={})
  fold = options[:fold] || 0.0
  min_samples = options[:min_samples] || 0
  min_fpkm = options[:min_fpkm] || 0.0
  only_significative = options[:only_significative] || false
  z_scores = options[:z_scores] || false
  regulated = options[:regulated] || :up
  # Any other value used to fall through silently and fill the result with nil keys.
  unless [:up, :down].include?(regulated)
    raise ArgumentError, "regulated must be :up or :down, got #{regulated.inspect}"
  end

  compare = Bio::Ngs::Cufflinks::Compare
  gtf_kb = compare.exists_kb?(gtf) ? compare.load_compare_kb(gtf) : compare.build_compare_kb(gtf)

  #convert log2 fold value into natural log value (internally computed by cuffdiff)
  fold_log2 = fold
  fold = fold==0 ? 0.0 : (fold*Math.log(2))

  # reference => { sample => [samples it was compared against], :values => {sample => fpkm}, :winner => [samples] }
  # NOTE: the inner default block creates an empty array for ANY missing key,
  # so presence must be tested with key? and never with [] or ||=.
  dict = Hash.new { |h, k| h[k] = Hash.new { |hh, kk| hh[kk] = [] } }
  # only the keys matter: the set of sample names seen in the file
  dict_samples = Hash.new { |h, k| h[k] = "" }

  #which offset may I consider to get data from cuffdiff?
  cufflink_version_offset = offset_by_version(version(diff))
  q_first = 3 + cufflink_version_offset
  q_second = 4 + cufflink_version_offset
  fpkm_first = 6 + cufflink_version_offset
  fpkm_second = 7 + cufflink_version_offset
  fold_position = 8 + cufflink_version_offset
  significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)

  File.open(diff, 'r') do |f|
    f.readline #skip header
    f.each_line do |line|
      data = line.split
      next if data.empty? # blank lines carry no record

      # Normalise the record: a non-positive fold only loses its sign,
      # a positive fold gets its two samples (names and FPKM) swapped.
      if data[fold_position].to_f <= 0
        data[fold_position] = data[fold_position].sub(/-/, "")
      else
        data[fpkm_first], data[fpkm_second] = data[fpkm_second], data[fpkm_first]
        data[q_first], data[q_second] = data[q_second], data[q_first]
      end

      # register both sample names, even for records filtered out below
      dict_samples[data[q_first]]
      dict_samples[data[q_second]]

      significant = data[significant_position] == "yes"
      fold_reached = only_significative == true || data[fold_position].to_f >= fold
      fpkm_reached = data[fpkm_first].to_f >= min_fpkm && data[fpkm_second].to_f >= min_fpkm
      #TODO add threshold value below min fpkm
      next unless significant && fold_reached && fpkm_reached

      k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
      first_sample = data[q_first].to_sym
      second_sample = data[q_second].to_sym
      k_sample, compared_with = regulated == :up ? [first_sample, second_sample] : [second_sample, first_sample]

      entry = dict[k_reference]
      entry[k_sample] << compared_with

      entry[:values] = {} unless entry.key?(:values)
      #store fpkm values as well for each pop/sample, each one under its own
      #name whatever the :regulated direction is
      values = entry[:values]
      values[first_sample] = data[fpkm_first].to_f unless values.key?(first_sample)
      values[second_sample] = data[fpkm_second].to_f unless values.key?(second_sample)

      entry[:winner] << k_sample if entry[k_sample].size >= min_samples
    end #each line
    #example structure
    #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
  end #file.open

  file_lines = []
  dict.each do |diff_reference, dict_info|
    #TODO not winner or number of min samples
    next unless dict_info.key?(:winner)

    fpkm_values = dict_info[:values].sort.map { |sample| sample[1] }
    valz = if z_scores
      # average and standard_deviation are not core Array methods;
      # presumably provided by the gem's Enumerable extension - TODO confirm
      average = fpkm_values.average
      stdev = fpkm_values.standard_deviation
      fpkm_values.map { |fpkm| (fpkm-average)/stdev }
    else
      fpkm_values
    end

    file_lines << yield(dict_info, diff_reference, gtf_kb, valz)
  end # dict_each

  file_name_output = File.join(File.dirname(diff), File.basename(diff, ".diff")+"-f#{fold_log2}_s#{min_samples}_fpkm#{min_fpkm}")
  file_name_output += "_z" if z_scores
  file_name_output += regulated.to_s
  file_name_output += ".txt"
  # The value of this block (the sorted lines) is the method's return value.
  File.open(file_name_output, 'w') do |odiff|
    odiff.puts "sample\thumanized_id\t#{dict_samples.keys.sort.join("\t")}"
    file_lines.sort.each do |file_line|
      odiff.puts file_line
    end#each sorted line
  end#open
end #process_de
|
351
|
+
end
|
352
|
+
|
353
|
+
end #Diff
|
354
|
+
|
355
|
+
|
356
|
+
# cuffcompare v1.0.2 (2335)
|
357
|
+
# -----------------------------
|
358
|
+
# Usage:
|
359
|
+
# cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
|
360
|
+
# [-o <outprefix>] [-p <cprefix>]
|
361
|
+
# {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
|
362
|
+
#
|
363
|
+
# Cuffcompare provides classification, reference annotation mapping and various
|
364
|
+
# statistics for Cufflinks transfrags.
|
365
|
+
# Cuffcompare clusters and tracks transfrags across multiple samples, writing
|
366
|
+
# matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
|
367
|
+
# file <outprefix>.combined.gtf containing a nonredundant set of transcripts
|
368
|
+
# across all input files (with a single representative transfrag chosen
|
369
|
+
# for each clique of matching transfrags across samples).
|
370
|
+
#
|
371
|
+
# Options:
|
372
|
+
# -i provide a text file with a list of Cufflinks GTF files to process instead
|
373
|
+
# of expecting them as command line arguments (useful when a large number
|
374
|
+
# of GTF files should be processed)
|
375
|
+
#
|
376
|
+
# -r a set of known mRNAs to use as a reference for assessing
|
377
|
+
# the accuracy of mRNAs or gene models given in <input.gtf>
|
378
|
+
#
|
379
|
+
# -R for -r option, reduce the set of reference transcripts to
|
380
|
+
# only those found to overlap any of the input loci
|
381
|
+
# -M discard (ignore) single-exon transfrags and reference transcripts
|
382
|
+
# -N discard (ignore) single-exon reference transcripts
|
383
|
+
#
|
384
|
+
# -s <seq_path> can be a multi-fasta file with all the genomic sequences or
|
385
|
+
# a directory containing multiple single-fasta files (one file per contig);
|
386
|
+
# lower case bases will be used to classify input transcripts as repeats
|
387
|
+
#
|
388
|
+
# -d max distance (range) for grouping transcript start sites (100)
|
389
|
+
# -p the name prefix to use for consensus transcripts in the
|
390
|
+
# <outprefix>.combined.gtf file (default: 'TCONS')
|
391
|
+
# -C include the "contained" transcripts in the .combined.gtf file
|
392
|
+
# -G generic GFF input file(s) (do not assume Cufflinks GTF)
|
393
|
+
# -T do not generate .tmap and .refmap files for each input file
|
394
|
+
# -V verbose processing mode (showing all GFF parsing warnings)
|
395
|
+
class Compare
|
396
|
+
# Command line definition for the cuffcompare binary. The short aliases
# are the switches listed in the usage text above this class.
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffcompare")
use_aliases
#TODO: add descriptions
add_option "outprefix", :type => :string, :aliases => '-o', :default => "Comparison"

# The remaining options have no default; declared in this order.
[
  ["gtf_combine_file",             :string,  '-i'],
  ["gtf_reference",                :string,  '-r'],
  ["only_overlap",                 :boolean, '-R'],
  ["discard_transfrags",           :boolean, '-M'],
  ["discard_ref_transcripts",      :boolean, '-N'],
  ["multi_fasta",                  :string,  '-s'],
  ["distance_tss",                 :numeric, '-d'],
  ["prefix_transcripts_consensus", :string,  '-p'],
  ["contained",                    :boolean, '-C'],
  ["GFF",                          :boolean, '-G'],
  ["no_map_files",                 :boolean, '-T']
].each do |option_name, option_type, short|
  add_option option_name, :type => option_type, :aliases => short
end
|
413
|
+
|
414
|
+
class << self
|
415
|
+
|
416
|
+
# Name of the knowledge-base dump that belongs to a GTF file: the trailing
# extension is replaced by ".kb" ("x.combined.gtf" -> "x.combined.kb").
# A path without an extension gets ".kb" appended, so the result is never
# the GTF path itself and the dump can never overwrite its own source.
def kb_name(gtf)
  extension = /\.[a-zA-Z0-9]*$/
  gtf =~ extension ? gtf.sub(extension, ".kb") : "#{gtf}.kb"
end
|
419
|
+
|
420
|
+
# True when the knowledge-base dump named by kb_name(gtf) is on disk.
def exists_kb?(gtf)
  # File.exist? works on every Ruby; File.exists? was removed in Ruby 3.2.
  File.exist?(kb_name(gtf))
end
|
423
|
+
|
424
|
+
# Build a hash of associations from a GTF file generated by CuffCompare,
# dump it with Marshal to the file named by kb_name(gtf) and return it.
#
# Every identifier found on a line becomes a key of the same flat hash:
# gene_id: gene_name and a :transcripts hash
#   example: :XLOC_000001=>{:gene_name=>:"RP11-304M2.1", :transcripts=>{:TCONS_00000001=>{:odi=>:ENST00000519787, :nearest_ref=>:ENST00000519787}}}
# the others are just plain hashes
# transcript_id: gene_id, gene_name, oid, nearest_ref
# gene_name: gene_id, transcript_id, oid, nearest_ref
# oid: gene_id, transcript_id, gene_name, nearest_ref
# nearest_ref: gene_id, transcript_id, gene_name, oid
# Note: the oId value is stored under the key :odi (historical spelling,
# kept so that existing dumps and callers keep working).
# Note: exons and coordinates are not saved.
#
# Lines without gene_id or transcript_id are skipped; a missing gene_name,
# oId or nearest_ref is stored as nil and gets no entry of its own.
# Returns nil (after a message on STDERR) when +gtf+ does not exist.
def build_compare_kb(gtf)
  unless File.exist?(gtf)
    STDERR.puts "File #{gtf} doesn't exist."
    return nil
  end

  # Value of one GTF attribute as a Symbol without quotes, nil when absent.
  attribute = lambda do |line, pattern|
    found = line.match(pattern)
    found && found[1].gsub(/"/, '').to_sym
  end

  dict = {} #build an hash with the combinations of data extracted from GTF file, XLOC, TCONS, ENST, SYMBOL
  File.open(gtf, 'r') do |f|
    f.each_line do |line|
      gene_id = attribute.call(line, /gene_id (.*?);/)
      transcript_id = attribute.call(line, /transcript_id (.*?);/)
      next unless gene_id && transcript_id

      gene_name = attribute.call(line, /gene_name (.*?);/)
      oid = attribute.call(line, /oId (.*?);/)
      nearest_ref = attribute.call(line, /nearest_ref (.*?);/)

      unless dict.key?(gene_id)
        dict[gene_id] = {:gene_name=>gene_name, :transcripts=>{}}
      end
      unless dict[gene_id][:transcripts].key?(transcript_id)
        dict[gene_id][:transcripts][transcript_id] = {:odi=>oid, :nearest_ref=>nearest_ref}
      end
      dict[transcript_id] = {:gene_id=>gene_id, :gene_name=>gene_name, :odi=>oid, :nearest_ref=>nearest_ref}
      dict[gene_name] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :nearest_ref=>nearest_ref} if gene_name
      dict[oid] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :gene_name=>gene_name, :nearest_ref=>nearest_ref} if oid
      dict[nearest_ref] = {:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :gene_name=>gene_name} if nearest_ref
    end #lines
  end #file

  # Marshal data is binary: write it in binary mode.
  File.open(kb_name(gtf), 'wb') do |fkb|
    Marshal.dump(dict, fkb)
  end #fkb
  dict
end #build_compare_kb
|
471
|
+
|
472
|
+
# Return the hash of associations stored in the dump named by kb_name(gtf).
# gene_id: gene_name and its transcripts
# transcript_id: gene_id, gene_name, oid, nearest_ref
# gene_name: gene_id, transcript_id, oid, nearest_ref
# oid: gene_id, transcript_id, gene_name, nearest_ref
# nearest_ref: gene_id, transcript_id, gene_name, oid
#
# Raises the File.open / Marshal.load errors unchanged when the dump is
# missing or unreadable.
def load_compare_kb(gtf)
  #TODO rescue Exceptions
  # NOTE(review): Marshal.load must only be used on trusted data; load
  # only dumps written by build_compare_kb.
  # Marshal data is binary: read it in binary mode.
  File.open(kb_name(gtf), 'rb') do |kb_dump|
    Marshal.load(kb_dump)
  end
end #load_compare_kb
|
485
|
+
end
|
486
|
+
end #Compare
|
487
|
+
end #Cufflinks
|
488
|
+
end #Ngs
|
489
|
+
end #Bio
|