bio-ngs 0.3.2.alpha.01
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
@@ -0,0 +1,454 @@
|
|
1
|
+
#
|
2
|
+
# convert.thor - Main task for converting data between NGS formats
|
3
|
+
#
|
4
|
+
# Copyright:: Copyright (C) 2011
|
5
|
+
# Raoul J.P. Bonnal <r@bioruby.org>
|
6
|
+
# License:: The Ruby License
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
|
12
|
+
|
13
|
+
module Convert
|
14
|
+
|
15
|
+
class Bam < Thor
|
16
|
+
# Sort and index the input bam filename
|
17
|
+
# the sorted/indexed output is created in the same directory of the input file
|
18
|
+
desc "sort BAM [PREFIX]", "Sort and create and index for the BAM file name"
|
19
|
+
# Sort a BAM file and build the index of the sorted result.
#
# bam_fn - path of the BAM file to sort.
# prefix - base name (without ".bam") of the sorted file; when omitted it is
#          the input base name with its trailing ".bam" replaced by "_sort".
#
# The sorted file and its index are created in the directory of the input
# file. When bam_fn does not exist only a warning is printed.
def sort(bam_fn, prefix=nil)
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(bam_fn)
    # NOTE(review): the message mentions tophat although this task is generic.
    warn "[#{Time.now}] There was an error, tophat did not create any accepted_hit file "
    return
  end

  # Only the trailing extension is replaced; ".bam" may also occur inside the name.
  prefix = File.basename(bam_fn).sub(/\.bam\z/, '_sort') if prefix.nil?
  bam_sort_fn = File.join(File.dirname(bam_fn), prefix)

  Bio::DB::SAM::Tools.bam_sort(bam_fn, bam_sort_fn)
  # presumably bam_sort appends ".bam" to the prefix it receives - TODO confirm
  bam_sort_fn += ".bam"
  Bio::DB::SAM::Tools.bam_index_build(bam_sort_fn)
end #sort
|
34
|
+
|
35
|
+
# Merge several BAM files into a single one through the Samtools::Merge wrapper.
# The wrapper receives the task options as parameters and is run with the
# output file name followed by every input BAM as arguments.
desc "merge" ,"Merge multiple bams in a single one, BAMS separated by commmas"
method_option :input_bams, :type => :array, :required => true, :aliases => '-i'
# The key must be :required (it was misspelled :require, which Thor ignores);
# the output name is the first argument handed to the wrapper, so it is mandatory.
method_option :output, :type => :string, :required => true, :aliases => '-o'
Bio::Ngs::Samtools::Merge.new.thor_task(self, :merge) do |wrapper, task|
  wrapper.params = task.options
  wrapper.run :arguments => [task.options.output, task.options.input_bams].flatten
end
|
42
|
+
|
43
|
+
# Extract from a BAM file the reads falling in the regions of the given genes.
#
# BAM   - path of the input BAM file.
# GENES - comma separated list of gene names, resolved to coordinates
#         ("region:start-end") by querying Ensembl for the species and release
#         given in the options.
#
# Genes that cannot be found are reported with a warning and skipped. When the
# BAM file exists and at least one gene was resolved, the subset is written to
# --output (default: the input name with ".bam" replaced by "_subset.bam") and
# then passed to the :sort task.
desc "extract_genes BAM GENES", "Extract GENES from bam. It connects to Ensembl Humnan, release 61 and download the coordinates for the inserted genes"
method_option :output, :type => :string, :desc => "output file name"
method_option :ensembl_specie, :type => :string, :desc => "default homo_sapiens", :default => 'homo_sapiens'
method_option :ensembl_release, :type => :numeric, :desc => "ensembl release", :required => true
Bio::Ngs::Samtools::View.new.thor_task(self, :extract_genes) do |wrapper, task, bam_fn, gene_names|
  # Loaded lazily: only this task needs the ensembl library.
  require 'ensembl'
  ::Ensembl::Core::DBConnection.connect(task.options.ensembl_specie, task.options.ensembl_release)

  genes_str = gene_names.split(',').map do |gene|
    g = ::Ensembl::Core::Gene.find_by_name(gene)
    if g
      "#{g.seq_region.name}:#{g.seq_region_start}-#{g.seq_region_end}"
    else
      warn "Can't find gene #{gene} in Ensembl #{task.options.ensembl_specie}, release #{task.options.ensembl_release} "
      nil # unknown genes are dropped by the compact below
    end
  end.compact

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  if File.exist?(bam_fn) && !genes_str.empty?
    output_name = task.options.output || bam_fn.gsub(/\.bam/, "_subset.bam")
    wrapper.run :arguments => [output_name, bam_fn, genes_str]
    task.invoke :sort, [output_name]
    puts "Find your data in #{output_name} and #{output_name.gsub(/\.bam/,"_sort.bam")}"
  end
end
|
69
|
+
end # Bam
|
70
|
+
|
71
|
+
module Qseq
|
72
|
+
class Fastq < Thor
|
73
|
+
desc "by_file FIRST OUTPUT", "Convert a qseq file into fastq"
|
74
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format'
|
75
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
76
|
+
method_option :dir, :type => :string, :default=>".", :desc => 'Path to the working directory (data)'
|
77
|
+
# output is just a string I'll attach the fastq extension
|
78
|
+
# Convert one or more qseq files into a single fastq file plus a stats report.
#
# first  - a qseq file name, or an Array of qseq file names.
# output - base name of the results; "<output>.fastq" and "<output>.stats" are
#          written inside options.dir.
#
# Unless --append is given the output files are truncated once, before the
# first input is converted; every following input is appended, so converting
# several files aggregates all of their reads instead of keeping only the last
# file. One YAML document with the file name and the converter stats is written
# to the stats file per input.
def by_file(first, output)
  qseq = Bio::Ngs::Converter::Qseq.new(options.paired ? :pe : :se)
  fastq_path = File.join(options.dir, "#{output}.fastq")
  stats_path = File.join(options.dir, "#{output}.stats")

  # Array() accepts a single name, a list of names or nil (nothing to do).
  Array(first).each_with_index do |file_name, position|
    # Truncate only for the first input of a non-append run.
    mode = (options.append || position > 0) ? 'a' : 'w'

    # Block forms close both handles even when the conversion raises.
    File.open(file_name, 'r') do |qseq_file| #todo: dir is not used here it could be a bug
      qseq.buffer = qseq_file
      File.open(fastq_path, mode) do |fastq_file|
        qseq.to_fastq do |fastq|
          fastq_file.puts fastq if fastq
        end
      end
    end

    #Write the report
    File.open(stats_path, mode) do |file|
      file.puts({:file_name => file_name, :stats => qseq.stats}.to_yaml)
    end
  end #buffers
end #by_file
|
97
|
+
|
98
|
+
# This task is used to aggregate the data demultiplexed from Illumina OLB 1.9 and CASAVA 1.7.
|
99
|
+
# Demultiplexing software splits the reads in different subdirectories based on the tag index of the reads,
|
100
|
+
# usually the wet-lab puts a population in a single lane and tags it with different indexes. The demultiplexer
|
101
|
+
# behaviour is not so clear, so this task takes care of simplifying the aggregation for the final dataset.
|
102
|
+
# Output: 2 files
|
103
|
+
# 1) Forward fastq
|
104
|
+
# 2) Reverse fastq
|
105
|
+
desc "by_lane LANE OUTPUT", "Convert all the file in the current and descendant directories belonging to the specified lane in fastq. This command is specific for Illumina qseqs file s_#LANE_#STRAND_#TILE. Note UNKOWN directory is excluded by default."
|
106
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
107
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
108
|
+
method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
|
109
|
+
# output is just a string I'll attach the fastq extension
|
110
|
+
# Convert to fastq every qseq file of a lane found in the "00?" sub-directories
# of the working directory.
#
# lane   - lane number used in the qseq file names (s_<lane>_<strand>_*_qseq.txt).
# output - base name of the results; "_forward" / "_reverse" is appended.
#
# One daemon is started for the forward strand and, when --paired is set, a
# second one for the reverse strand. Each daemon invokes :by_file on the files
# of its strand.
def by_lane(lane, output)
  dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append

  # Number identifying the strand inside the qseq file names.
  strand_numbers = {:forward => 1, :reverse => 2}
  strands = paired ? [:forward, :reverse] : [:forward]

  strands.each do |strand|
    daemon_name = "#{strand}_#{lane}"
    daemon_options = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc(daemon_name, daemon_options) do
      qseq_files = Dir[File.join(dir, "00?/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")]
      invoke :by_file, [qseq_files, "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
    end
  end
end #by_lane
|
142
|
+
|
143
|
+
desc "by_lane_index LANE INDEX OUTPUT", "Convert the qseq from a line and index in a fastq file"
|
144
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
145
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
146
|
+
method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
|
147
|
+
# output is just a string I'll attach the fastq extension
|
148
|
+
# Convert to fastq the qseq files of one lane for one tag index.
#
# lane   - lane number used in the qseq file names (s_<lane>_<strand>_*_qseq.txt).
# index  - tag index; zero padded to three digits it names the sub-directory
#          of the working directory that holds the files.
# output - base name of the results; "_forward" / "_reverse" is appended.
#
# One daemon is started for the forward strand and, when --paired is set, a
# second one for the reverse strand. Each daemon invokes :by_file on the files
# of its strand.
def by_lane_index(lane, index, output)
  dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append
  index_str = "%03d" % index

  # Number identifying the strand inside the qseq file names.
  strand_numbers = {:forward => 1, :reverse => 2}
  strands = paired ? [:forward, :reverse] : [:forward]

  strands.each do |strand|
    daemon_name = "#{strand}_#{lane}_#{index_str}"
    daemon_options = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true,
      :dir_mode => :normal,
      :dir => dir}
    ::Daemons.run_proc(daemon_name, daemon_options) do
      qseq_files = Dir[File.join(dir, "#{index_str}/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")]
      invoke :by_file, [qseq_files, "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
    end
  end
end #by_lane_index
|
184
|
+
|
185
|
+
# SAMPLES = 1,2,3,4
|
186
|
+
# LANE = 1
|
187
|
+
# OUTPUT = File name prefix; the output file names will be OUTPUT-Sample_N....
|
188
|
+
desc "samples_by_lane SAMPLES LANE OUTPUT", "Convert the qseqs for each sample in a specific lane. SAMPLES is an array of index codes separated by commas lane is an integer"
|
189
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
190
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
191
|
+
# Start one daemon per sample; each daemon invokes :by_lane_index for the
# sample on the given lane.
#
# samples - comma separated list of index codes.
# lane    - lane number, handed over to :by_lane_index as received.
# output  - prefix of the results; "-Sample_<index>" is appended per sample.
#
# The current directory is passed to :by_lane_index as working directory.
def samples_by_lane(samples, lane, output)
  working_dir = Dir.pwd
  samples.split(",").each do |sample|
    sample_idx = sample.to_i
    daemon_name = "sample#{sample}_by_lane-#{lane}"
    daemon_options = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc(daemon_name, daemon_options) do
      invoke :by_lane_index, [lane, sample_idx, "#{output}-Sample_#{sample_idx}"], :paired => options.paired, :append => options.append, :dir => working_dir
    end
  end
end #samples_by_lane
|
202
|
+
|
203
|
+
end #Fastq
|
204
|
+
end #Qseq
|
205
|
+
|
206
|
+
module Bcl
|
207
|
+
class Qseq < Thor
|
208
|
+
desc "convert RUN OUTPUT [JOBS]", "Convert a bcl dataset in qseq"
|
209
|
+
# Convert a bcl dataset into qseq: first configure the run, then start the
# conversion.
#
# run_basecalls_root - root directory of the run.
# output             - output location handed to :configure_conversion.
# jobs               - number of jobs handed to :start_conversion (default 1).
def converts(run_basecalls_root, output, jobs=1)
  invoke :configure_conversion, [run_basecalls_root, output]
  # The second step is the start_conversion task; :run_bcl_to_qseq, the name
  # used before, is not defined in this class.
  invoke :start_conversion, [run_basecalls_root, jobs]
end #bcl_to_qseq
|
213
|
+
|
214
|
+
# Hidden task: point the Bclqseq wrapper at the run to convert and run it.
# NOTE(review): the OUTPUT argument is received but not used by this block;
# an earlier, disabled version passed it as "output-directory" - confirm.
desc "configure_conversion RUN_DIR OUTPUT ", "Configure the specific Run to be converted", :hide => true
Bio::Ngs::Bclqseq.new.thor_task(self, :configure_conversion) do |wrapper, task, run_dir, _output|
  task.options.base_calls_directory = run_dir
  wrapper.run
end #setup_bcl_conversion
|
221
|
+
|
222
|
+
desc "start_conversion RUN_DIR [JOBS] ", "Start the conversion", :hide => true
|
223
|
+
method_option :prova, :type => :string
|
224
|
+
# Hidden task: print the make command line that converts the run.
#
# run_basecalls_root - root directory of the run; the Makefile is expected in
#                      its Data/Intensities/BaseCalls sub-directory.
# jobs               - value of make's -j option (default 1).
#
# NOTE(review): the command is only printed, it is not executed here.
def start_conversion(run_basecalls_root, jobs=1)
  puts "make recursive -j #{jobs} -f #{run_basecalls_root}/Data/Intensities/BaseCalls/Makefile -C #{run_basecalls_root}/Data/Intensities/BaseCalls"
end #run_bcl_to_qseq
|
229
|
+
end #Qseq
|
230
|
+
end #Bcl
|
231
|
+
|
232
|
+
|
233
|
+
|
234
|
+
module Illumina
|
235
|
+
class Fastq < Thor
|
236
|
+
|
237
|
+
# Trim fastq sequences (Illumina format 1.5+):
|
238
|
+
# ------------------BBBBBBBBBBBBBBBBB
|
239
|
+
# ------------------
|
240
|
+
# First step: trailing Bs are removed, and the read is kept only if the remaining sequence is long enough.
|
241
|
+
# The user can specify the minimum length of the sequence and the number of Bs to search in the middle.
|
242
|
+
# If the user passes an output file name, it will be used as suffix for the other output files.
|
243
|
+
# If no file name is passed the input file name will be used as suffix.
|
244
|
+
# Output: 4 files
|
245
|
+
# 1) xxx_trim.fastq the trimmed sequences in fastq format
|
246
|
+
# 2) xxx_rejected.fastq
|
247
|
+
# 3) xxx_profile.csv the length distribution of the trimmed sequences
|
248
|
+
# 4) xxx_report.csv statistics on processed reads as total number of reads in input,
|
249
|
+
# trimmed, removed, untouched ( not trimmed)
|
250
|
+
# Note: removed reads are the ones which start with a B
|
251
|
+
# IMPORTANT: Data in FastQ format MUST NOT BE WRAPPED: sequence and quality MUST BE ON 1 LINE EACH
|
252
|
+
desc "trim_b FASTQ", "perform a trim on all the sequences on B qualities with Illumina's criteria. Ref to CASAVA manual."
|
253
|
+
#TODO, report the legth/profile of all the sequences.
|
254
|
+
#TODO: implement different strategies for trimming, N consecutive Bs ?
|
255
|
+
#TODO: implement min length for a trimmed sequnce to be reported as valid.
|
256
|
+
method_option :fileout, :type => :string
|
257
|
+
method_option :min_size, :type =>:numeric, :default => 20, :aliases => '-s', :desc => 'minimum length to consider a trimmed sequence as valid, otherwise it will be discarded'
|
258
|
+
# Trim the trailing run of 'B' qualities from every read of a fastq file.
#
# fastq - path of the input file; each record must span exactly 4 lines
#         (header, sequence, plus, quality).
#
# Options: :fileout (base name of the outputs, defaults to the input name) and
# :min_size (minimum length a read must keep to be reported).
#
# Four files are written, named by Bio::Ngs::Utils.tag_filename:
#   "trim"          kept reads, cut where the trailing Bs start
#   "trim_rejected" reads left shorter than the minimum size, unmodified
#   "trim_profile"  kept length,count pairs sorted by length
#   "trim_report"   processed, trimmed, removed and untouched read counts
#
# A kept read counts as trimmed only when it really had trailing Bs; kept reads
# without them are the "untouched" ones of the report.
def trim_b(fastq)
  output_filename_base = options[:fileout].nil? ? fastq : options[:fileout]
  count_total = 0
  count_trimmed = 0
  count_removed = 0
  sequences_profile = Hash.new(0)
  head = ""
  seq = ""
  # A read is kept when the index of its first trailing B is greater than min_size.
  min_size = (options[:min_size] > 1) ? (options[:min_size] - 1) : 0

  trimming_tail_pattern = /B*$/

  # Block forms close the three handles even when an error interrupts the run.
  File.open(fastq, 'r') do |reads|
    File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_rejected", "fastq"), 'w') do |r_rejected|
      File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim", "fastq"), 'w') do |f|
        # each_line replaces IO#lines, which was removed in Ruby 3.0.
        reads.each_line.with_index do |line, line_number|
          case line_number % 4
          when 0
            head = line
            count_total += 1
          when 1
            seq = line
          #2 is the plus sign
          when 3
            tail = trimming_tail_pattern.match(line)
            b_tail_idx = tail.begin(0)
            if b_tail_idx > min_size
              count_trimmed += 1 unless tail[0].empty?
              sequences_profile[b_tail_idx] += 1
              f.puts "#{head}#{seq[0..b_tail_idx-1]}\n+\n#{tail.pre_match}"
            else
              count_removed += 1
              r_rejected.puts "#{head}#{seq}+\n#{line}"
            end
          end #case
        end #read
      end #Write fastq
    end #Write rejected
  end

  #Profile
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_profile", "csv"), 'w') do |f_profile|
    f_profile.puts "Sequnce length,count"
    sequences_profile.sort.each do |read_size, read_number|
      f_profile.puts "#{read_size},#{read_number}"
    end
  end #Write profile

  #Report
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_report", "csv"), 'w') do |report|
    report.puts "Reads processed,Reads trimmed,Reads removed,Reads untouched"
    report.puts "#{count_total},#{count_trimmed},#{count_removed},#{count_total-count_trimmed-count_removed}"
  end #Write report
end #trim_b
|
312
|
+
end #Fastq
|
313
|
+
|
314
|
+
class Humanize < Thor
|
315
|
+
require 'json'
|
316
|
+
|
317
|
+
desc "build_compare_kb GTF", "Build the JSON file with the annoation from the GTF file used to humanize the results"
|
318
|
+
#TODO: create a zip file to optimize the space.
|
319
|
+
# Build the knowledge base of the annotations of a GTF file by delegating to
# Bio::Ngs::Cufflinks::Compare.build_compare_kb.
#
# gtf - path of the GTF file.
#
# A previous in-place implementation (which parsed gene_id, transcript_id,
# gene_name, oId and nearest_ref from each line and dumped the resulting hash
# with Marshal into a ".kb" file) was kept here only as disabled code.
def build_compare_kb(gtf)
  Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
end
|
351
|
+
|
352
|
+
desc "isoform_exp GTF ISOFORM", "tag the XLOC gathering information from GTF (ensembl)"
|
353
|
+
#TODO: open a zip file,KB to optimez performances
|
354
|
+
# Prefix every row of an isoform table with the nearest_ref stored in the
# knowledge base of a GTF file.
#
# gtf     - path of the GTF file; its knowledge base is the same path with the
#           extension replaced by ".kb" and is built through :build_compare_kb
#           when that file is missing.
# isoform - path of the table; its first line is a header and the first column
#           of every other line is the key looked up in the knowledge base.
#
# The result is written to "<isoform>_rich". Returns nil, after a message on
# STDERR, when one of the two input files does not exist.
def isoform_exp(gtf, isoform)
  [gtf, isoform].each do |path|
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    unless File.exist?(path)
      STDERR.puts "File #{path} doesn't exist."
      return nil
    end
  end

  kb_filename = gtf.sub(/\.[a-zA-Z0-9]*$/, ".kb")
  #build the kb
  invoke :build_compare_kb, [gtf] unless File.exist?(kb_filename)

  # The knowledge base was stored in a misspelled local (gtf_gkb) while the
  # loop below reads gtf_kb, which raised NameError on the first data row.
  gtf_kb = Bio::Ngs::Cufflinks::Compare.load_compare_kb(kb_filename)

  File.open("#{isoform}_rich", 'w') do |w|
    File.open(isoform, 'r') do |f|
      w.write("ensembl_transcript_id\t#{f.readline}") #skip header and write to output files
      f.each_line do |line|
        data = line.split
        w.write("#{gtf_kb[data[0].to_sym][:nearest_ref]}\t#{line}")
      end #line
    end #file read
  end #file write
end#isoform_exp
|
385
|
+
|
386
|
+
end #Humanize
|
387
|
+
|
388
|
+
class De < Thor
|
389
|
+
|
390
|
+
#./bin/biongs convert:illumina:de:isoform /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff /Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf --min_samples=5 --fold=2 --min_fpkm=0.5 --z_score | sort > /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp_diff.txt
|
391
|
+
|
392
|
+
|
393
|
+
#Extract data from differential expression made by Cuffdiff.
|
394
|
+
#The user can request to export the data in a tabular format with data in fpkm or z-score (computed by row)
|
395
|
+
#Is possible to filter the results in different manners:
|
396
|
+
#by fold change: log2 (internally Cuffdiff compute a fold change with natural logarithm, this task made an internal conversion)
|
397
|
+
#by number of elements for which the fold change is verified among the remaining populations/samples
|
398
|
+
#by fpkm: a population/sample is taken into account by further selection steps if its fpkm value is greater than or equal to...
|
399
|
+
#the output is written to a tab delimited table, sorted by the first column:sample-discriminator.
|
400
|
+
#Output file name isoform_exp-f1_s5_fpkm0.5_z.txt, the parameters are written in the file name, so is possible to keep track of them
|
401
|
+
desc "isoform DIFF GTF", "extract the transcripts"
|
402
|
+
method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
|
403
|
+
method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
|
404
|
+
method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
|
405
|
+
method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
|
406
|
+
method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
|
407
|
+
method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
|
408
|
+
# Hand the isoform differential expression file over to
# Bio::Ngs::Cufflinks::Diff.isoforms together with the filters taken from the
# task options.
#
# diff_file - path of the differential expression file.
# gtf       - path of the GTF file.
#
# NOTE(review): the :only_significative option is declared for this task but
# is not forwarded.
def isoform(diff_file, gtf)
  Bio::Ngs::Cufflinks::Diff.isoforms(diff_file,
                                     gtf,
                                     :fold => options.fold,
                                     :min_samples => options.min_samples,
                                     :min_fpkm => options.min_fpkm,
                                     :z_scores => options.z_scores,
                                     :regulated => (options.up ? :up : :down))
end #de_isoform
|
414
|
+
|
415
|
+
desc "gene DIFF GTF", "extract the transcripts"
|
416
|
+
method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
|
417
|
+
method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
|
418
|
+
method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
|
419
|
+
method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
|
420
|
+
method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
|
421
|
+
method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
|
422
|
+
# Hand the gene differential expression file over to
# Bio::Ngs::Cufflinks::Diff.genes together with the filters taken from the
# task options.
#
# diff_file - path of the differential expression file.
# gtf       - path of the GTF file.
#
# NOTE(review): the :only_significative option is declared for this task but
# is not forwarded.
def gene(diff_file, gtf)
  Bio::Ngs::Cufflinks::Diff.genes(diff_file,
                                  gtf,
                                  :fold => options.fold,
                                  :min_samples => options.min_samples,
                                  :min_fpkm => options.min_fpkm,
                                  :z_scores => options.z_scores,
                                  :regulated => (options.up ? :up : :down))
end #de_gene
|
428
|
+
|
429
|
+
#convert:illumina:de:rename_qs /Users/bonnalraoul/Desktop/RRep16giugno/DEpopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp-f0.5_s5_fpkm0.5_zup.txt\
|
430
|
+
# Naive,Th1,Th17,Th2,Treg,Tfh
|
431
|
+
desc "rename_qs DIFF_FILE NAMES", 'rename q1,...,qn with names provided by the user(comma separated)'
|
432
|
+
# Replace the generic labels q1, q2, ..., qn of a file with user provided names.
#
# diff_file - path of the input file; the result is written to
#             "<diff_file>_renamed".
# names     - comma separated names; the first one replaces q1, the second q2
#             and so on.
#
# Every label is matched with all of its digits and replaced in a single pass,
# so q1 never rewrites the beginning of q10 and an inserted name is never
# renamed again. Labels without a matching name (q0, or a number greater than
# the number of names) are left unchanged.
def rename_qs(diff_file, names)
  names_list = names.split(',')
  File.open(diff_file + "_renamed", 'w') do |w|
    File.open(diff_file, 'r') do |f|
      # The header needs no special handling: the same rule applies to every line.
      f.each_line do |line|
        renamed = line.gsub(/q\d+/) do |label|
          position = label[1..-1].to_i
          (position >= 1 && names_list[position - 1]) || label
        end
        w.puts renamed
      end #each_line
    end # open-read
  end #open-write
end
|
448
|
+
end #De
|
449
|
+
|
450
|
+
end #Illumina
|
451
|
+
|
452
|
+
|
453
|
+
end #Convert
|
454
|
+
# Add methods to Enumerable, which makes them available to Array
|