bio-ngs 0.3.2.alpha.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
module Bio
  module Ngs
    # Base class for the command line wrappers of the `blastn` and `blastx`
    # programs. The option handling itself comes from Bio::Command::Wrapper;
    # the subclasses below only declare the binary to run and the options
    # that may be passed to it.
    class Blast

      include Bio::Command::Wrapper

      # Wrapper declaration for the `blastn` binary.
      class BlastN < Blast
        set_program Bio::Ngs::Utils.binary("blastn")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        # The "query" option used to be declared a second time here and
        # "word_size" carried the copy-pasted description "Query sequence".
        add_option "word_size", :type => :string, :desc => "Word size"
        add_option "task", :type => :string, :desc => "Task type", :default => "blastn"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end

      # Wrapper declaration for the `blastx` binary.
      class BlastX < Blast
        set_program Bio::Ngs::Utils.binary("blastx")
        add_option "evalue", :type => :string, :desc => "E-value cutoff"
        add_option "query", :type => :string, :desc => "Query sequence"
        add_option "db", :type => :string, :desc => "Database sequences"
        add_option "out", :type => :string, :desc => "Output file", :default => "blastout.xml"
        add_option "outfmt", :type => :numeric, :desc => "Output format type", :default => 5
        add_option "num_descriptions", :type => :numeric, :desc => "Number of HIT descriptions", :default => 1
        add_option "num_alignments", :type => :numeric, :desc => "Number of HIT alignments", :default => 1
        add_option "num_threads", :type => :numeric, :desc => "Number of threads", :default => 1
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#
|
|
2
|
+
# bowtie-inspect.rb - Wrapper for bowtie-inspect
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Usage: bowtie-inspect [options]* <ebwt_base>
|
|
12
|
+
# <ebwt_base> ebwt filename minus trailing .1.ebwt/.2.ebwt
|
|
13
|
+
#
|
|
14
|
+
# By default, prints FASTA records of the indexed nucleotide sequences to
|
|
15
|
+
# standard out. With -n, just prints names. With -s, just prints a summary of
|
|
16
|
+
# the index parameters and sequences. With -e, preserves colors if applicable.
|
|
17
|
+
#
|
|
18
|
+
# Options:
|
|
19
|
+
# -a/--across <int> Number of characters across in FASTA output (default: 60)
|
|
20
|
+
# -n/--names Print reference sequence names only
|
|
21
|
+
# -s/--summary Print summary incl. ref names, lengths, index properties
|
|
22
|
+
# -e/--ebwt-ref Reconstruct reference from ebwt (slow, preserves colors)
|
|
23
|
+
# -v/--verbose Verbose output (for debugging)
|
|
24
|
+
# -h/--help print detailed description of tool and its options
|
|
25
|
+
# --help print this usage message
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
module Bio
  module Ngs
    # Wrapper declaration for the bowtie-inspect program; the option names
    # and aliases mirror the usage text quoted at the top of this file.
    class BowtieInspect
      include Bio::Command::Wrapper

      # Design notes from the author:
      # - the user should provide a complete path to the tool;
      # - it would be better to identify the program from just its name,
      #   looking into the ext/ directory or the host system path;
      # - why not derive the file name from the class name when it is not
      #   specified?
      set_program Bio::Ngs::Utils.binary("bowtie-inspect")

      set_output :stdout

      add_option "across", :type => :numeric, :aliases => '-a'

      # The remaining switches are plain boolean flags; declare them in the
      # same order as before.
      [
        ["names",    '-n'],
        ["summary",  '-s'],
        ["ebwt-ref", '-e'],
        ["verbose",  '-v']
      ].each do |flag_name, short_flag|
        add_option flag_name, :type => :boolean, :aliases => short_flag
      end
    end # BowtieInspect
  end # Ngs
end # Bio
|
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
#
|
|
2
|
+
# cufflinks.rb - description
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Raoul Bonnal <r@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
module Bio
|
|
13
|
+
module Ngs
|
|
14
|
+
module Cufflinks
|
|
15
|
+
# Cufflinks release family this wrapper is written against.
VERSION = "1.0.X"

# Returns the VERSION constant declared above.
def self.version
  VERSION
end
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# cufflinks v1.0.2 (2335)
|
|
24
|
+
# linked against Boost version 104000
|
|
25
|
+
# -----------------------------
|
|
26
|
+
# Usage: cufflinks [options] <hits.sam>
|
|
27
|
+
# Options:
|
|
28
|
+
#
|
|
29
|
+
# -p/--num-threads number of threads used during analysis [ default: 1 ]
|
|
30
|
+
# -L/--label assembled transcripts have this ID prefix [ default: CUFF ]
|
|
31
|
+
# -G/--GTF quantitate against reference transcript annotations
|
|
32
|
+
# -F/--min-isoform-fraction suppress transcripts below this abundance level [ default: 0.15 ]
|
|
33
|
+
# -f/--min-intron-fraction filter spliced alignments below this level [ default: 0.05 ]
|
|
34
|
+
# -j/--pre-mrna-fraction suppress intra-intronic transcripts below this level [ default: 0.15 ]
|
|
35
|
+
# -I/--max-intron-length ignore alignments with gaps longer than this [ default: 300000 ]
|
|
36
|
+
# -Q/--min-map-qual ignore alignments with lower than this mapping qual [ default: 0 ]
|
|
37
|
+
# -M/--mask-file ignore all alignment within transcripts in this file
|
|
38
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
|
39
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
|
40
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
|
41
|
+
# -r/--reference-seq reference fasta file for sequence bias correction [ default: NULL ]
|
|
42
|
+
#
|
|
43
|
+
# Advanced Options:
|
|
44
|
+
#
|
|
45
|
+
# -N/--quartile-normalization use quartile normalization instead of total counts [ default: FALSE ]
|
|
46
|
+
# -a/--junc-alpha alpha for junction binomial test filter [ default: 0.01 ]
|
|
47
|
+
# -A/--small-anchor-fraction percent read overhang taken as 'suspiciously small' [ default: 0.12 ]
|
|
48
|
+
# -m/--frag-len-mean the average fragment length [ default: 200 ]
|
|
49
|
+
# -s/--frag-len-std-dev the fragment length standard deviation [ default: 80 ]
|
|
50
|
+
# --min-frags-per-transfrag minimum number of fragments needed for new transfrags [ default: 10 ]
|
|
51
|
+
# --overhang-tolerance number of terminal exon bp to tolerate in introns [ default: 8 ]
|
|
52
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
|
53
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
|
54
|
+
# --library-type Library prep used for input reads [ default: below ]
|
|
55
|
+
# --max-bundle-length maximum genomic length allowed for a given bundle [ default:3500000 ]
|
|
56
|
+
# --max-bundle-frags maximum fragments allowed in a bundle before skipping [ default: 500000 ]
|
|
57
|
+
# --min-intron-length minimum intron size allowed in genome [ default: 50 ]
|
|
58
|
+
# Supported library types:
|
|
59
|
+
# ff-firststrand
|
|
60
|
+
# ff-secondstrand
|
|
61
|
+
# ff-unstranded
|
|
62
|
+
# fr-firststrand
|
|
63
|
+
# fr-secondstrand
|
|
64
|
+
# fr-unstranded (default)
|
|
65
|
+
# transfrags
|
|
66
|
+
# Wrapper declaration for the `cufflinks` binary. Option names and short
# aliases follow the usage text quoted in the comment above this class.
class Quantification

  include Bio::Command::Wrapper

  set_program Bio::Ngs::Utils.binary("cufflinks")

  add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
  add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
  add_option "GTF", :type => :string, :aliases => '-G'
  add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
  add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
  add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
  add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
  add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
  add_option "mask-file", :type => :string, :aliases => '-M'
  add_option "verbose", :type => :boolean, :aliases => '-v'
  add_option "quiet", :type => :boolean, :aliases => '-q'
  add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
  add_option "reference-seq", :type => :string, :aliases => '-r'
  add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
  add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
  add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
  #TODO Check why with these defaults is not working properly
  # (the defaults listed in the usage text are left commented out on purpose)
  # Fixed: this option was declared as "farg-len-mean", which is not an
  # option listed in the cufflinks usage text (-m/--frag-len-mean).
  add_option "frag-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
  add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
  add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
  add_option "overhang-tolerance", :type => :numeric#, :default => 8
  add_option "num-importance-samples", :type => :numeric#, :default => 1000
  add_option "max-mle-iterations", :type => :numeric#, :default => 5000
  add_option "library-type", :type => :string
  add_option "max-bundle-length", :type => :numeric #, :default => 3500000
  add_option "max-bundle-frags", :type => :numeric #, :default => 500000
  add_option "min-intron-length", :type => :numeric#, :default => 50
end #Quantification
|
|
100
|
+
|
|
101
|
+
# cuffdiff v1.0.2 (2336)
|
|
102
|
+
# -----------------------------
|
|
103
|
+
# Usage: cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
|
|
104
|
+
# Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
|
|
105
|
+
# General Options:
|
|
106
|
+
# -o/--output-dir write all output files to this directory [ default: ./ ]
|
|
107
|
+
# -T/--time-series treat samples as a time-series [ default: FALSE ]
|
|
108
|
+
# -c/--min-alignment-count minimum number of alignments in a locus for testing [ default: 10 ]
|
|
109
|
+
# --FDR False discovery rate used in testing [ default: 0.05 ]
|
|
110
|
+
# -M/--mask-file ignore all alignment within transcripts in this file [ default: NULL ]
|
|
111
|
+
# -b/--frag-bias-correct use bias correction - reference fasta required [ default: NULL ]
|
|
112
|
+
# -u/--multi-read-correct use 'rescue method' for multi-reads (more accurate) [ default: FALSE ]
|
|
113
|
+
# -N/--upper-quartile-norm use upper-quartile normalization [ default: FALSE ]
|
|
114
|
+
# -L/--labels comma-separated list of condition labels
|
|
115
|
+
# -p/--num-threads number of threads used during quantification [ default: 1 ]
|
|
116
|
+
#
|
|
117
|
+
# Advanced Options:
|
|
118
|
+
# --library-type Library prep used for input reads [ default: below ]
|
|
119
|
+
# -m/--frag-len-mean average fragment length (unpaired reads only) [ default: 200 ]
|
|
120
|
+
# -s/--frag-len-std-dev fragment length std deviation (unpaired reads only) [ default: 80 ]
|
|
121
|
+
# --num-importance-samples number of importance samples for MAP restimation [ default: 1000 ]
|
|
122
|
+
# --max-mle-iterations maximum iterations allowed for MLE calculation [ default: 5000 ]
|
|
123
|
+
# --compatible-hits-norm count hits compatible with reference RNAs only [ default: TRUE ]
|
|
124
|
+
# --total-hits-norm count all hits for normalization [ default: FALSE ]
|
|
125
|
+
# --poisson-dispersion Don't fit fragment counts for overdispersion [ default: FALSE ]
|
|
126
|
+
# -v/--verbose log-friendly verbose processing (no progress bar) [ default: FALSE ]
|
|
127
|
+
# -q/--quiet log-friendly quiet processing (no progress bar) [ default: FALSE ]
|
|
128
|
+
# --no-update-check do not contact server to check for update availability[ default: FALSE ]
|
|
129
|
+
# --emit-count-tables print count tables used to fit overdispersion [ default: FALSE ]
|
|
130
|
+
#
|
|
131
|
+
# Supported library types:
|
|
132
|
+
# ff-firststrand
|
|
133
|
+
# ff-secondstrand
|
|
134
|
+
# ff-unstranded
|
|
135
|
+
# fr-firststrand
|
|
136
|
+
# fr-secondstrand
|
|
137
|
+
# fr-unstranded (default)
|
|
138
|
+
# transfrags
|
|
139
|
+
class Diff
|
|
140
|
+
# Option declarations for the `cuffdiff` binary (see the usage text quoted
# above the class).
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffdiff")

add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
add_option "time-series", :type => :boolean, :aliases => '-T'
add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
# NOTE(review): the quoted cuffdiff usage lists --FDR without a short form;
# '-F' is an alias invented by this wrapper.
add_option "FDR", :type => :numeric, :aliases => '-F'
#TODO:FIX add_option "mask-file", :type => :string, :aliases => '-M'
#TODO:FIX add_option "frag-bias-correct", :type =>
add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
# Fixed: the alias was 'N' (missing dash), unlike every other alias here.
add_option "upper-quartile-norm", :type => :boolean, :aliases => '-N'
add_option "labels", :type => :array, :aliases => '-L'
add_option "num-threads", :type => :numeric, :aliases => '-p'
add_option "library-type", :type => :string, :aliases => '-l'
add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
add_option "poisson-dispersion", :type => :boolean, :aliases => '-d'
add_option "verbose", :type => :boolean, :aliases => '-v'
add_option "quiet", :type => :boolean, :aliases => '-q'
add_option "no-update-check", :type => :boolean, :aliases => '-j'
add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
|
|
166
|
+
|
|
167
|
+
#Examples
|
|
168
|
+
#Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
|
|
169
|
+
#Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
|
|
170
|
+
|
|
171
|
+
class << self
|
|
172
|
+
|
|
173
|
+
#Return the version of CuffDiff used to produce the output
|
|
174
|
+
# Guess which Cufflinks release family produced a cuffdiff output file by
# counting the whitespace-separated columns of its header line.
#
# diff:: path of the .diff file to inspect.
#
# Returns "0.9.X" for a 12-column header, Bio::Ngs::Cufflinks.version for a
# 14-column header and nil for any other column count. Raises EOFError when
# the file is empty.
def version(diff)
  # Block form guarantees the handle is closed even when readline raises.
  header = File.open(diff, 'r') { |f| f.readline }
  case header.split.size
  when 12
    "0.9.X"
  when 14
    Bio::Ngs::Cufflinks.version #latest
  end
end#version
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# Translate a Cufflinks version family into the column shift process_de
# adds to its cuffdiff column indexes: 0 for "0.9.X", 1 for "1.0.X" and
# nil for anything else.
def offset_by_version(cufflinks_version)
  return 0 if "0.9.X" === cufflinks_version
  return 1 if "1.0.X" === cufflinks_version
  nil
end
|
|
196
|
+
|
|
197
|
+
#write a file with the information
|
|
198
|
+
#See process_de for options available
|
|
199
|
+
# Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
|
|
200
|
+
# "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
|
|
201
|
+
# fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
|
|
202
|
+
# Run process_de on an isoform-level cuffdiff file. Each reported line is
# "<first winner>\t<nearest_ref>_<gene_name>\t<tab separated fpkm values>",
# where nearest_ref and gene_name are read from the knowledge base entry of
# the row's identifier. The options hash is passed to process_de untouched.
def isoforms(diff, gtf, options={})
  process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
    winner = dict_info[:winner].first
    annotation = gtf_kb[diff_reference]
    humanized_id = "#{annotation[:nearest_ref]}_#{annotation[:gene_name]}"
    "#{winner}\t#{humanized_id}\t#{fpkm_values.join("\t")}"
  end
end #isoform
|
|
207
|
+
|
|
208
|
+
#write a file with the information
|
|
209
|
+
#See process_de for options available
|
|
210
|
+
# Example: Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp.diff",
|
|
211
|
+
# "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_PopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/ComparepPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8.combined.gtf",
|
|
212
|
+
# fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
|
|
213
|
+
# Run process_de on a gene-level cuffdiff file. Each reported line is
# "<first winner>\t<gene_name>\t<tab separated fpkm values>", where
# gene_name is read from the knowledge base entry of the row's identifier.
# The options hash is passed to process_de untouched.
def genes(diff, gtf, options={})
  process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
    winner = dict_info[:winner].first
    gene_name = gtf_kb[diff_reference][:gene_name]
    "#{winner}\t#{gene_name}\t#{fpkm_values.join("\t")}"
  end
end #genes
|
|
218
|
+
|
|
219
|
+
private
|
|
220
|
+
#Options hash
|
|
221
|
+
# :fold(float), :min_samples(integer), :min_fpkm(float), :only_significative(boolean, false), :z_scores(boolean, false)
|
|
222
|
+
# :regulated(symbol :up or :down default :up)
|
|
223
|
+
# Filter a cuffdiff output file and write the selected rows to a text file
# created next to +diff+.
#
# diff:: path of the cuffdiff file (isoform_exp.diff / gene_exp.diff).
# gtf::  path of the cuffcompare combined GTF; its knowledge base is loaded
#        when a dump exists, otherwise it is built.
# options::
#   :fold               minimum fold change, expressed in log2 (default 0.0)
#   :min_samples        minimum number of samples the candidate must be
#                       paired with before it is recorded as winner (default 0)
#   :min_fpkm           minimum FPKM required on both sides (default 0.0)
#   :only_significative when set, the fold threshold is not applied
#   :z_scores           when set, FPKM values are replaced by z-scores
#   :regulated          :up (default) or :down
#
# The block receives (dict_info, diff_reference, gtf_kb, values) for every
# identifier with at least one winner and must return the line to write.
# Returns the sorted array of written lines.
# Raises ArgumentError when :regulated is neither :up nor :down.
def process_de(diff, gtf, options={})
  fold_log2          = options[:fold] || 0.0
  min_samples        = options[:min_samples] || 0
  min_fpkm           = options[:min_fpkm] || 0.0
  only_significative = options[:only_significative] || false
  z_scores           = options[:z_scores] || false
  regulated          = options[:regulated] || :up

  # An unknown direction used to be accepted silently and produced nil winners.
  unless [:up, :down].include?(regulated)
    raise ArgumentError, "regulated must be :up or :down, got #{regulated.inspect}"
  end

  compare = Bio::Ngs::Cufflinks::Compare
  gtf_kb = compare.exists_kb?(gtf) ? compare.load_compare_kb(gtf) : compare.build_compare_kb(gtf)

  #convert log2 fold value into natural log value (internally computed by cuffdiff)
  fold = fold_log2 == 0 ? 0.0 : (fold_log2 * Math.log(2))

  # identifier => { sample => [paired samples], :values => {sample => fpkm}, :winner => [samples] }
  dict = Hash.new { |h, k| h[k] = Hash.new { |hh, kk| hh[kk] = [] } }
  sample_names = {}

  #which offset may I consider to get data from cuffdiff?
  cufflink_version_offset = offset_by_version(version(diff))

  q_first              = 3 + cufflink_version_offset
  q_second             = 4 + cufflink_version_offset
  fpkm_first           = 6 + cufflink_version_offset
  fpkm_second          = 7 + cufflink_version_offset
  fold_position        = 8 + cufflink_version_offset
  significant_position = 11 + cufflink_version_offset + (cufflink_version_offset == 1 ? 1 : 0)

  File.open(diff, 'r') do |f|
    f.readline #skip header

    f.each_line do |line|
      data = line.split
      next if data.empty? # blank lines used to crash on nil.sub

      # Normalise the row so that "first" is always the sample with the
      # higher value: a non-positive fold only loses its sign, a positive
      # one has its two samples (names and fpkm) swapped.
      if data[fold_position].to_f <= 0
        data[fold_position] = data[fold_position].sub(/-/, "")
      else
        data[fpkm_first], data[fpkm_second] = data[fpkm_second], data[fpkm_first]
        data[q_first], data[q_second] = data[q_second], data[q_first]
      end

      # Every sample seen in the file ends up in the output header, even
      # when the row itself is filtered out below.
      sample_names[data[q_first]] = true
      sample_names[data[q_second]] = true

      significant = data[significant_position] == "yes"
      fold_ok     = only_significative || data[fold_position].to_f >= fold
      expressed   = data[fpkm_first].to_f >= min_fpkm && data[fpkm_second].to_f >= min_fpkm
      #TODO add threshold value below min fpkm
      next unless significant && fold_ok && expressed

      k_reference  = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
      high_sample  = data[q_first].to_sym
      low_sample   = data[q_second].to_sym
      k_sample, paired_sample =
        regulated == :up ? [high_sample, low_sample] : [low_sample, high_sample]

      entry = dict[k_reference]
      entry[k_sample] << paired_sample

      # Store the fpkm of each sample under its own name. The previous code
      # keyed the higher value by k_sample, which for :down is the LOWER
      # sample, so the values were attributed to the wrong sample.
      entry[:values] = {} unless entry.key?(:values)
      entry[:values][high_sample] = data[fpkm_first].to_f unless entry[:values].key?(high_sample)
      entry[:values][low_sample] = data[fpkm_second].to_f unless entry[:values].key?(low_sample)

      entry[:winner] << k_sample if entry[k_sample].size >= min_samples
    end #each line
    #example structure
    #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
  end #file.open

  file_lines = []
  dict.each do |diff_reference, dict_info|
    #TODO not winner or number of min samples
    next unless dict_info.key?(:winner)

    fpkms = dict_info[:values].sort.map { |sample| sample[1] }
    valz = if z_scores
      # average / standard_deviation are not core Array methods; they are
      # expected to be provided by the project (lib/enumerable.rb -- confirm).
      average = fpkms.average
      stdev = fpkms.standard_deviation
      fpkms.map { |fpkm| (fpkm - average) / stdev }
    else
      fpkms
    end

    file_lines << yield(dict_info, diff_reference, gtf_kb, valz)
  end # dict_each

  file_name_output = File.join(File.dirname(diff), File.basename(diff, ".diff") + "-f#{fold_log2}_s#{min_samples}_fpkm#{min_fpkm}")
  file_name_output += "_z" if z_scores
  file_name_output += regulated.to_s
  file_name_output += ".txt"
  File.open(file_name_output, 'w') do |odiff|
    odiff.puts "sample\thumanized_id\t#{sample_names.keys.sort.join("\t")}"
    file_lines.sort.each { |file_line| odiff.puts file_line }
  end#open
end #process_de
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
end #Diff
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
# cuffcompare v1.0.2 (2335)
|
|
357
|
+
# -----------------------------
|
|
358
|
+
# Usage:
|
|
359
|
+
# cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>]
|
|
360
|
+
# [-o <outprefix>] [-p <cprefix>]
|
|
361
|
+
# {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}
|
|
362
|
+
#
|
|
363
|
+
# Cuffcompare provides classification, reference annotation mapping and various
|
|
364
|
+
# statistics for Cufflinks transfrags.
|
|
365
|
+
# Cuffcompare clusters and tracks transfrags across multiple samples, writing
|
|
366
|
+
# matching transcripts (intron chains) into <outprefix>.tracking, and a GTF
|
|
367
|
+
# file <outprefix>.combined.gtf containing a nonredundant set of transcripts
|
|
368
|
+
# across all input files (with a single representative transfrag chosen
|
|
369
|
+
# for each clique of matching transfrags across samples).
|
|
370
|
+
#
|
|
371
|
+
# Options:
|
|
372
|
+
# -i provide a text file with a list of Cufflinks GTF files to process instead
|
|
373
|
+
# of expecting them as command line arguments (useful when a large number
|
|
374
|
+
# of GTF files should be processed)
|
|
375
|
+
#
|
|
376
|
+
# -r a set of known mRNAs to use as a reference for assessing
|
|
377
|
+
# the accuracy of mRNAs or gene models given in <input.gtf>
|
|
378
|
+
#
|
|
379
|
+
# -R for -r option, reduce the set of reference transcripts to
|
|
380
|
+
# only those found to overlap any of the input loci
|
|
381
|
+
# -M discard (ignore) single-exon transfrags and reference transcripts
|
|
382
|
+
# -N discard (ignore) single-exon reference transcripts
|
|
383
|
+
#
|
|
384
|
+
# -s <seq_path> can be a multi-fasta file with all the genomic sequences or
|
|
385
|
+
# a directory containing multiple single-fasta files (one file per contig);
|
|
386
|
+
# lower case bases will be used to classify input transcripts as repeats
|
|
387
|
+
#
|
|
388
|
+
# -d max distance (range) for grouping transcript start sites (100)
|
|
389
|
+
# -p the name prefix to use for consensus transcripts in the
|
|
390
|
+
# <outprefix>.combined.gtf file (default: 'TCONS')
|
|
391
|
+
# -C include the "contained" transcripts in the .combined.gtf file
|
|
392
|
+
# -G generic GFF input file(s) (do not assume Cufflinks GTF)
|
|
393
|
+
# -T do not generate .tmap and .refmap files for each input file
|
|
394
|
+
# -V verbose processing mode (showing all GFF parsing warnings)
|
|
395
|
+
class Compare
|
|
396
|
+
# Option declarations for the `cuffcompare` binary (see the usage text
# quoted above the class).
include Bio::Command::Wrapper

set_program Bio::Ngs::Utils.binary("cuffcompare")
use_aliases
#TODO: add descriptions
add_option "outprefix", :type => :string, :aliases => '-o', :default => "Comparison"

# name, type and short alias of the remaining options, in declaration order
[
  ["gtf_combine_file",             :string,  '-i'],
  ["gtf_reference",                :string,  '-r'],
  ["only_overlap",                 :boolean, '-R'],
  ["discard_transfrags",           :boolean, '-M'],
  ["discard_ref_transcripts",      :boolean, '-N'],
  ["multi_fasta",                  :string,  '-s'],
  ["distance_tss",                 :numeric, '-d'],
  ["prefix_transcripts_consensus", :string,  '-p'],
  ["contained",                    :boolean, '-C'],
  ["GFF",                          :boolean, '-G'],
  ["no_map_files",                 :boolean, '-T']
].each do |option_name, option_type, short_flag|
  add_option option_name, :type => option_type, :aliases => short_flag
end
|
|
413
|
+
|
|
414
|
+
class << self
|
|
415
|
+
|
|
416
|
+
# Name of the knowledge base dump associated with a GTF file: the last
# extension of +gtf+ is replaced by ".kb".
#
# A path without any extension gets ".kb" appended. It used to be returned
# unchanged, which made build_compare_kb write its dump over the GTF itself.
def kb_name(gtf)
  extension = /\.[a-zA-Z0-9]*$/
  gtf =~ extension ? gtf.sub(extension, ".kb") : "#{gtf}.kb"
end
|
|
419
|
+
|
|
420
|
+
# True when the knowledge base dump named by kb_name(gtf) is present on disk.
def exists_kb?(gtf)
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported form.
  File.exist?(kb_name(gtf))
end
|
|
423
|
+
|
|
424
|
+
# Dump a hash of associations from a GTF file generated by CuffCompare
|
|
425
|
+
# gene_id: transcript_id, gene_name, oid, nearest_ref
|
|
426
|
+
# gene_id example: :XLOC_000001=>{:gene_name=>:RP11-304M2.1, :transcripts=>{:TCONS_00000001=>{:oid=>:ENST00000519787, :nearest_ref=>:ENST00000519787}}}
|
|
427
|
+
# the others are just plain hash
|
|
428
|
+
# transcript_id: gene_id, gene_name, oid, nearest_ref
|
|
429
|
+
# gene_name: gene_id, transcript_id, oid, nearest_ref
|
|
430
|
+
# oid: gene_id, transcript_id, gene_name, nearest_ref
|
|
431
|
+
# nearest_ref: gene_id, transcript_id, gene_name, oid
|
|
432
|
+
#Note:exons and coordinates are not saved.
|
|
433
|
+
# Build the knowledge base of a cuffcompare GTF file, dump it with Marshal
# to kb_name(gtf) and return it.
#
# gtf:: path of the GTF file.
#
# Returns nil (after printing a message on STDERR) when +gtf+ does not
# exist, otherwise a Hash keyed by symbols:
#   gene_id       => {:gene_name, :transcripts => {transcript_id => {:odi, :nearest_ref}}}
#   transcript_id => {:gene_id, :gene_name, :odi, :nearest_ref}
#   gene_name     => {:gene_id, :transcript_id, :odi, :nearest_ref}
#   oId           => {:gene_id, :transcript_id, :gene_name, :nearest_ref}
#   nearest_ref   => {:gene_id, :transcript_id, :odi, :gene_name}
#
# NOTE(review): the oId value is stored under the key :odi (sic). The key
# is kept as is because existing dumps and callers may rely on it.
#
# Lines without gene_id or transcript_id are skipped. A missing gene_name,
# oId or nearest_ref is stored as nil and is not used as an index key.
def build_compare_kb(gtf)
  # File.exists? was removed in Ruby 3.2.
  unless File.exist?(gtf)
    STDERR.puts "File #{gtf} doesn't exist."
    return nil
  end

  attribute_patterns = [
    /gene_id (.*?);/,
    /transcript_id (.*?);/,
    /gene_name (.*?);/,
    /oId (.*?);/,
    /nearest_ref (.*?);/
  ]

  dict = {} #build an hash with the combinations of data extracted from GTF file, XLOC, TCONS, ENST, SYMBOL
  File.open(gtf, 'r') do |f|
    # IO#lines with a block was removed in Ruby 3.0; each_line is equivalent.
    f.each_line do |line|
      gene_id, transcript_id, gene_name, oid, nearest_ref = attribute_patterns.map do |pattern|
        raw = line[pattern, 1]
        raw && raw.delete('"').to_sym
      end
      next unless gene_id && transcript_id

      unless dict.key?(gene_id)
        dict[gene_id] = {:gene_name => gene_name, :transcripts => {}}
      end
      unless dict[gene_id][:transcripts].key?(transcript_id)
        dict[gene_id][:transcripts][transcript_id] = {:odi => oid, :nearest_ref => nearest_ref}
      end

      dict[transcript_id] = {:gene_id => gene_id, :gene_name => gene_name, :odi => oid, :nearest_ref => nearest_ref}
      dict[gene_name] = {:gene_id => gene_id, :transcript_id => transcript_id, :odi => oid, :nearest_ref => nearest_ref} if gene_name
      dict[oid] = {:gene_id => gene_id, :transcript_id => transcript_id, :gene_name => gene_name, :nearest_ref => nearest_ref} if oid
      dict[nearest_ref] = {:gene_id => gene_id, :transcript_id => transcript_id, :odi => oid, :gene_name => gene_name} if nearest_ref
    end#lines
  end#file

  File.open(kb_name(gtf), 'w') do |fkb|
    Marshal.dump(dict, fkb)
  end #fkb
  dict
end #build_compare_kb
|
|
471
|
+
|
|
472
|
+
# Return the hash of associations
|
|
473
|
+
# gene_id: transcript_id, gene_name, oid, nearest_ref
|
|
474
|
+
# transcript_id: gene_id, gene_name, oid, nearest_ref
|
|
475
|
+
# gene_name: gene_id, transcript_id, oid, nearest_ref
|
|
476
|
+
# oid: gene_id, transcript_id, gene_name, nearest_ref
|
|
477
|
+
# nearest_ref: gene_id, transcript_id, gene_name, oid
|
|
478
|
+
# Read back the knowledge base dumped for +gtf+ (file kb_name(gtf)) and
# return the object stored in it.
# NOTE(review): Marshal.load must only be used on dumps written by this
# library; loading an untrusted file can execute arbitrary code.
def load_compare_kb(gtf)
  #TODO rescue Exceptions
  File.open(kb_name(gtf), 'r') { |dump| Marshal.load(dump) }
end #load_compare_kb
|
|
485
|
+
end
|
|
486
|
+
end #Compare
|
|
487
|
+
end #Cufflinks
|
|
488
|
+
end #Ngs
|
|
489
|
+
end #Bio
|