bio-ngs 0.3.2.alpha.01
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
#
|
|
2
|
+
# convert.thor - Main task for converting data between NGS formats
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2011
|
|
5
|
+
# Raoul J.P. Bonnal <r@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
#
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
module Convert
|
|
14
|
+
|
|
15
|
+
class Bam < Thor
|
|
16
|
+
# Sort and index the input bam filename
|
|
17
|
+
# the sorted/indexed output is created in the same directory of the input file
|
|
18
|
+
desc "sort BAM [PREFIX]", "Sort and create and index for the BAM file name"
|
|
19
|
+
def sort(bam_fn, prefix=nil)
|
|
20
|
+
if File.exists?(bam_fn)
|
|
21
|
+
dirname = File.dirname(bam_fn)
|
|
22
|
+
prefix = File.basename(bam_fn).gsub(/\.bam/,'_sort') if prefix.nil?
|
|
23
|
+
bam_sort_fn = File.join(dirname, prefix)
|
|
24
|
+
#bam sort
|
|
25
|
+
Bio::DB::SAM::Tools.bam_sort(bam_fn, bam_sort_fn)
|
|
26
|
+
bam_sort_fn += ".bam"
|
|
27
|
+
#bam index sorted file
|
|
28
|
+
Bio::DB::SAM::Tools.bam_index_build(bam_sort_fn)
|
|
29
|
+
else
|
|
30
|
+
warn "[#{Time.now}] There was an error, BAM file #{bam_fn} does not exist" # this branch is reached only when the input BAM is missing
|
|
31
|
+
end
|
|
32
|
+
#your tasks here
|
|
33
|
+
end #sort
|
|
34
|
+
|
|
35
|
+
desc "merge" ,"Merge multiple bams in a single one, BAMS separated by commmas"
|
|
36
|
+
method_option :input_bams, :type => :array, :required => true, :aliases => '-i'
|
|
37
|
+
method_option :output, :type => :string, :required => true, :aliases => '-o' # Thor's key is :required; a misspelt :require is ignored
|
|
38
|
+
Bio::Ngs::Samtools::Merge.new.thor_task(self, :merge) do |wrapper, task|
|
|
39
|
+
wrapper.params = task.options
|
|
40
|
+
wrapper.run :arguments => [task.options.output, task.options.input_bams].flatten
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
desc "extract_genes BAM GENES", "Extract GENES from bam. It connects to Ensembl Humnan, release 61 and download the coordinates for the inserted genes"
|
|
44
|
+
method_option :output, :type => :string, :desc => "output file name"
|
|
45
|
+
method_option :ensembl_specie, :type => :string, :desc => "default homo_sapiens", :default => 'homo_sapiens'
|
|
46
|
+
method_option :ensembl_release, :type => :numeric, :desc => "ensembl release", :required => true
|
|
47
|
+
Bio::Ngs::Samtools::View.new.thor_task(self, :extract_genes) do |wrapper, task, bam_fn, gene_names|
|
|
48
|
+
require 'ensembl'
|
|
49
|
+
# begin
|
|
50
|
+
::Ensembl::Core::DBConnection.connect(task.options.ensembl_specie, task.options.ensembl_release)
|
|
51
|
+
genes_str=gene_names.split(',').map do |gene|
|
|
52
|
+
g = ::Ensembl::Core::Gene.find_by_name(gene)
|
|
53
|
+
if g
|
|
54
|
+
coords = "#{g.seq_region.name}:#{g.seq_region_start}-#{g.seq_region_end}"
|
|
55
|
+
else
|
|
56
|
+
warn "Can't find gene #{gene} in Ensembl #{task.options.ensembl_specie}, release #{task.options.ensembl_release} "
|
|
57
|
+
end
|
|
58
|
+
end.compact
|
|
59
|
+
if File.exists?(bam_fn) && !genes_str.empty?
|
|
60
|
+
output_name = task.options.output || bam_fn.gsub(/\.bam/, "_subset.bam")
|
|
61
|
+
wrapper.run :arguments => [output_name, bam_fn, genes_str]
|
|
62
|
+
task.invoke :sort, [output_name]
|
|
63
|
+
puts "Find your data in #{output_name} and #{output_name.gsub(/\.bam/,"_sort.bam")}"
|
|
64
|
+
end
|
|
65
|
+
# rescue Exception => e
|
|
66
|
+
# warn "Bam file #{bam_fn} does not exsist or you don't have the rights to open it.#{e}"
|
|
67
|
+
# end
|
|
68
|
+
end
|
|
69
|
+
end # Bam
|
|
70
|
+
|
|
71
|
+
module Qseq
|
|
72
|
+
class Fastq < Thor
|
|
73
|
+
desc "by_file FIRST OUTPUT", "Convert a qseq file into fastq"
|
|
74
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format'
|
|
75
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
|
76
|
+
method_option :dir, :type => :string, :default=>".", :desc => 'Path to the working directory (data)'
|
|
77
|
+
# output is just a string I'll attach the fastq extension
|
|
78
|
+
def by_file(first, output)
|
|
79
|
+
qseq = Bio::Ngs::Converter::Qseq.new(options.paired ? :pe : :se)
|
|
80
|
+
buffers = [first] if first.kind_of? String
|
|
81
|
+
buffers = first if first.kind_of? Array
|
|
82
|
+
buffers.each do |file_name|
|
|
83
|
+
qseq.buffer = File.open(file_name,'r') #todo: dir is not used here it could be a bug
|
|
84
|
+
fastq_file = File.open(File.join(options.dir,"#{output}.fastq"), (options.append ? 'a' : 'w'))
|
|
85
|
+
qseq.to_fastq do |fastq|
|
|
86
|
+
fastq_file.puts fastq if fastq
|
|
87
|
+
end
|
|
88
|
+
qseq.buffer.close
|
|
89
|
+
fastq_file.close
|
|
90
|
+
#Write the report
|
|
91
|
+
File.open(File.join(options.dir,"#{output}.stats"), (options.append ? 'a' : 'w')) do |file|
|
|
92
|
+
file.puts ({:file_name=>file_name, :stats=>qseq.stats}.to_yaml)
|
|
93
|
+
end
|
|
94
|
+
end #buffers
|
|
95
|
+
# puts "Done #{file_name}"
|
|
96
|
+
end #by_file
|
|
97
|
+
|
|
98
|
+
# This task is used to aggregate the data demultiplexed from Illumina OLB 1.9 and CASAVA 1.7.
|
|
99
|
+
# Demultiplexing software splits the reads in different subdirectories based on the tag index of the reads,
|
|
100
|
+
# usually the wet-lab puts a population in a single lane and tags it with different indexes. The demultiplexer
|
|
101
|
+
# behaviour is not so clear, so this task takes care of simplifying the aggregation for the final dataset.
|
|
102
|
+
# Output: 2 files
|
|
103
|
+
# 1) Forward fastq
|
|
104
|
+
# 2) Reverse fastq
|
|
105
|
+
desc "by_lane LANE OUTPUT", "Convert all the file in the current and descendant directories belonging to the specified lane in fastq. This command is specific for Illumina qseqs file s_#LANE_#STRAND_#TILE. Note UNKOWN directory is excluded by default."
|
|
106
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
|
107
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
|
108
|
+
method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
|
|
109
|
+
# output is just a string I'll attach the fastq extension
|
|
110
|
+
def by_lane(lane, output)
|
|
111
|
+
dir = options.dir || Dir.pwd
|
|
112
|
+
|
|
113
|
+
paired = options.paired
|
|
114
|
+
append = options.append
|
|
115
|
+
strand_lambda = lambda do |dir, strand| #Forward
|
|
116
|
+
strand_number = case strand
|
|
117
|
+
when :forward then 1
|
|
118
|
+
when :reverse then 2
|
|
119
|
+
end
|
|
120
|
+
invoke :by_file, [Dir[File.join(dir,"00?/s_#{lane}_#{strand_number}_*_qseq.txt")], "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
forward_daemon_options = {
|
|
124
|
+
:app_name => "forward_#{lane}",
|
|
125
|
+
:ARGV => ['start'],
|
|
126
|
+
:log_output => true}
|
|
127
|
+
forward_task = ::Daemons.run_proc("forward_#{lane}",forward_daemon_options ) do
|
|
128
|
+
strand_lambda.call(dir,:forward)
|
|
129
|
+
end #daemon1
|
|
130
|
+
|
|
131
|
+
#Reverse
|
|
132
|
+
if options.paired
|
|
133
|
+
reverse_daemon_options = {
|
|
134
|
+
:app_name => "reverse_#{lane}",
|
|
135
|
+
:ARGV => ['start'],
|
|
136
|
+
:log_output => true}
|
|
137
|
+
reverse_task = ::Daemons.run_proc("reverse_#{lane}",reverse_daemon_options) do
|
|
138
|
+
strand_lambda.call(dir, :reverse)
|
|
139
|
+
end #daemon2
|
|
140
|
+
end #ifpaired
|
|
141
|
+
end #by_lane
|
|
142
|
+
|
|
143
|
+
desc "by_lane_index LANE INDEX OUTPUT", "Convert the qseq from a line and index in a fastq file"
|
|
144
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
|
145
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
|
146
|
+
method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
|
|
147
|
+
# output is just a string I'll attach the fastq extension
|
|
148
|
+
def by_lane_index(lane, index, output)
|
|
149
|
+
dir = options.dir || Dir.pwd
|
|
150
|
+
paired = options.paired
|
|
151
|
+
append = options.append
|
|
152
|
+
index_str = "%03d" % index
|
|
153
|
+
strand_lambda = lambda do |dir, strand| #Forward
|
|
154
|
+
strand_number = case strand
|
|
155
|
+
when :forward then 1
|
|
156
|
+
when :reverse then 2
|
|
157
|
+
end
|
|
158
|
+
invoke :by_file, [Dir[File.join(dir,"#{index_str}/s_#{lane}_#{strand_number}_*_qseq.txt")], "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
forward_daemon_options = {
|
|
162
|
+
:app_name => "forward_#{lane}_#{index_str}",
|
|
163
|
+
:ARGV => ['start'],
|
|
164
|
+
:log_output => true,
|
|
165
|
+
:dir_mode => :normal,
|
|
166
|
+
:dir => dir}
|
|
167
|
+
forward_task = ::Daemons.run_proc("forward_#{lane}_#{index_str}",forward_daemon_options ) do
|
|
168
|
+
strand_lambda.call(dir,:forward)
|
|
169
|
+
end #daemon1
|
|
170
|
+
|
|
171
|
+
#Reverse
|
|
172
|
+
if options.paired
|
|
173
|
+
reverse_daemon_options = {
|
|
174
|
+
:app_name => "reverse_#{lane}_#{index_str}",
|
|
175
|
+
:ARGV => ['start'],
|
|
176
|
+
:log_output => true,
|
|
177
|
+
:dir_mode => :normal,
|
|
178
|
+
:dir => dir}
|
|
179
|
+
reverse_task = ::Daemons.run_proc("reverse_#{lane}_#{index_str}",reverse_daemon_options) do
|
|
180
|
+
strand_lambda.call(dir, :reverse)
|
|
181
|
+
end #daemon2
|
|
182
|
+
end #ifpaired
|
|
183
|
+
end #by_lane_index
|
|
184
|
+
|
|
185
|
+
# SAMPLES = 1,2,3,4
|
|
186
|
+
# LANE = 1
|
|
187
|
+
# OUTPUT = File name prefix, output file name will be OUTPUT-Sample_N....
|
|
188
|
+
desc "samples_by_lane SAMPLES LANE OUTPUT", "Convert the qseqs for each sample in a specific lane. SAMPLES is an array of index codes separated by commas lane is an integer"
|
|
189
|
+
method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
|
|
190
|
+
method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
|
|
191
|
+
def samples_by_lane(samples, lane, output)
|
|
192
|
+
dir = Dir.pwd
|
|
193
|
+
samples.split(",").each do |sample|
|
|
194
|
+
sample_idx = sample.to_i
|
|
195
|
+
::Daemons.run_proc("sample#{sample}_by_lane-#{lane}", {:app_name => "sample#{sample}_by_lane-#{lane}",
|
|
196
|
+
:ARGV => ['start'],
|
|
197
|
+
:log_output => true}) do
|
|
198
|
+
invoke :by_lane_index, [lane, sample_idx, "#{output}-Sample_#{sample_idx}"], :paired => options.paired, :append =>options.append, :dir => dir
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
end #samples_by_lane
|
|
202
|
+
|
|
203
|
+
end #Fastq
|
|
204
|
+
end #Qseq
|
|
205
|
+
|
|
206
|
+
module Bcl
|
|
207
|
+
class Qseq < Thor
|
|
208
|
+
desc "convert RUN OUTPUT [JOBS]", "Convert a bcl dataset in qseq"
|
|
209
|
+
def converts (run_basecalls_root, output, jobs=1)
|
|
210
|
+
invoke :configure_conversion, [run_basecalls_root, output]
|
|
211
|
+
invoke :start_conversion, [run_basecalls_root, jobs] # the task defined in this class is start_conversion; :run_bcl_to_qseq does not exist
|
|
212
|
+
end #bcl_to_qseq
|
|
213
|
+
|
|
214
|
+
desc "configure_conversion RUN_DIR OUTPUT ", "Configure the specific Run to be converted", :hide => true
|
|
215
|
+
Bio::Ngs::Bclqseq.new.thor_task(self, :configure_conversion) do |wrapper, task, run_basecalls_root, output|
|
|
216
|
+
#wrapper.params={"base-calls-directory" => "#{run_basecalls_root}/Data/Intensities/BaseCalls", "output-directory" => output}
|
|
217
|
+
task.options.base_calls_directory=run_basecalls_root
|
|
218
|
+
#puts "Test parametri #{task.inspect}"
|
|
219
|
+
wrapper.run
|
|
220
|
+
end #setup_bcl_conversion
|
|
221
|
+
|
|
222
|
+
desc "start_conversion RUN_DIR [JOBS] ", "Start the conversion", :hide => true
|
|
223
|
+
method_option :prova, :type => :string
|
|
224
|
+
def start_conversion(run_basecalls_root, jobs=1)
|
|
225
|
+
# puts jobs
|
|
226
|
+
# puts basecalls
|
|
227
|
+
puts "make recursive -j #{jobs} -f #{run_basecalls_root}/Data/Intensities/BaseCalls/Makefile -C #{run_basecalls_root}/Data/Intensities/BaseCalls"
|
|
228
|
+
end #run_bcl_to_qseq
|
|
229
|
+
end #Qseq
|
|
230
|
+
end #Bcl
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
module Illumina
|
|
235
|
+
class Fastq < Thor
|
|
236
|
+
|
|
237
|
+
# Trim fastq sequences (Illumina format 1.5+):
|
|
238
|
+
# ------------------BBBBBBBBBBBBBBBBB
|
|
239
|
+
# ------------------
|
|
240
|
+
# First step: trailing Bs are removed; the read is kept only if the remaining sequence is long enough.
|
|
241
|
+
# The user can specify the minimum length of the sequence and the number of Bs to search in the middle.
|
|
242
|
+
# If the user passes an output file name, that will be used as suffix for the other output files.
|
|
243
|
+
# If no file name is passed the input file name will be used as suffix.
|
|
244
|
+
# Output: 4 files
|
|
245
|
+
# 1) xxx_trim.fastq the trimmed sequences in fastq format
|
|
246
|
+
# 2) xxx_rejected.fastq
|
|
247
|
+
# 3) xxx_profile.csv the length distribution of the trimmed sequences
|
|
248
|
+
# 4) xxx_report.csv statistics on processed reads as total number of reads in input,
|
|
249
|
+
# trimmed, removed, untouched ( not trimmed)
|
|
250
|
+
# Note: removed reads are the ones which start with a B
|
|
251
|
+
# IMPORTANT: Data in FastQ format MUST NOT BE WRAPPED: sequence and quality MUST BE ON 1 LINE EACH
|
|
252
|
+
desc "trim_b FASTQ", "perform a trim on all the sequences on B qualities with Illumina's criteria. Ref to CASAVA manual."
|
|
253
|
+
#TODO: report the length/profile of all the sequences.
|
|
254
|
+
#TODO: implement different strategies for trimming, N consecutive Bs ?
|
|
255
|
+
#TODO: implement min length for a trimmed sequence to be reported as valid.
|
|
256
|
+
method_option :fileout, :type => :string
|
|
257
|
+
method_option :min_size, :type =>:numeric, :default => 20, :aliases => '-s', :desc => 'minimum length to consider a trimmed sequence as valid, otherwise it will be discarded'
|
|
258
|
+
def trim_b(fastq)
|
|
259
|
+
reads = File.open(fastq,'r')
|
|
260
|
+
output_filename_base = options[:fileout].nil? ? fastq : options.fileout
|
|
261
|
+
count_total = 0
|
|
262
|
+
count_trimmed = 0
|
|
263
|
+
count_removed = 0
|
|
264
|
+
sequences_profile=Hash.new(0)
|
|
265
|
+
fastq=0
|
|
266
|
+
head =""
|
|
267
|
+
seq=""
|
|
268
|
+
qual=""
|
|
269
|
+
min_size = (options[:min_size] > 1) ? (options[:min_size]-1) : 0
|
|
270
|
+
|
|
271
|
+
trimming_tail_patter = /B*$/
|
|
272
|
+
|
|
273
|
+
r_rejected = File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_rejected","fastq"), 'w')
|
|
274
|
+
|
|
275
|
+
File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim", "fastq"), 'w') do |f|
|
|
276
|
+
reads.lines do |line|
|
|
277
|
+
case (fastq % 4 )
|
|
278
|
+
when 0 then
|
|
279
|
+
head = line
|
|
280
|
+
count_total+=1
|
|
281
|
+
when 1 then seq=line
|
|
282
|
+
#2 is the plus sign
|
|
283
|
+
when 3 then
|
|
284
|
+
b_tail_idx=(line=~trimming_tail_patter)
|
|
285
|
+
if (b_tail_idx > min_size )
|
|
286
|
+
count_trimmed+=1
|
|
287
|
+
f.puts "#{head}#{seq[0..b_tail_idx-1]}\n+\n#{$`}" #remaining_line}"#line[0..b_tail_idx]
|
|
288
|
+
else
|
|
289
|
+
count_removed+=1
|
|
290
|
+
r_rejected.puts "#{head}#{seq}+\n#{line}"
|
|
291
|
+
end
|
|
292
|
+
end #case
|
|
293
|
+
fastq+=1
|
|
294
|
+
end#read
|
|
295
|
+
end #Write fastq
|
|
296
|
+
r_rejected.close
|
|
297
|
+
#Profile
|
|
298
|
+
File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_profile", "csv"), 'w') do |f_profile|
|
|
299
|
+
f_profile.puts "Sequnce length,count"
|
|
300
|
+
sequences_profile.sort.each do |profile|
|
|
301
|
+
read_size = profile[0]
|
|
302
|
+
read_number = profile[1]
|
|
303
|
+
f_profile.puts "#{read_size},#{read_number}"
|
|
304
|
+
end
|
|
305
|
+
end #Write profile
|
|
306
|
+
#Report
|
|
307
|
+
File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_report", "csv"), 'w') do |report|
|
|
308
|
+
report.puts "Reads processed,Reads trimmed,Reads removed,Reads untouched"
|
|
309
|
+
report.puts "#{count_total},#{count_trimmed},#{count_removed},#{count_total-count_trimmed-count_removed}"
|
|
310
|
+
end #Write report
|
|
311
|
+
end #trim_b
|
|
312
|
+
end #Fastq
|
|
313
|
+
|
|
314
|
+
class Humanize < Thor
|
|
315
|
+
require 'json'
|
|
316
|
+
|
|
317
|
+
desc "build_compare_kb GTF", "Build the JSON file with the annoation from the GTF file used to humanize the results"
|
|
318
|
+
#TODO: create a zip file to optimize the space.
|
|
319
|
+
def build_compare_kb(gtf)
|
|
320
|
+
Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
|
|
321
|
+
# unless File.exists?(gtf)
|
|
322
|
+
# STDERR.puts "File #{gtf} doesn't exist."
|
|
323
|
+
# return nil
|
|
324
|
+
# end
|
|
325
|
+
# dict = {} #build an hash with the combinations of data extracted from GTF file, XLOC, TCONS, ENST, SYMBOL
|
|
326
|
+
# File.open(gtf,'r') do |f|
|
|
327
|
+
# f.lines do |line|
|
|
328
|
+
# line=~/gene_id (.*?);/
|
|
329
|
+
# gene_id = $1.gsub(/"/,'').to_sym
|
|
330
|
+
# line=~/transcript_id (.*?);/
|
|
331
|
+
# transcript_id = $1.gsub(/"/,'').to_sym
|
|
332
|
+
# line=~/gene_name (.*?);/
|
|
333
|
+
# gene_name = $1.gsub(/"/,'').to_sym
|
|
334
|
+
# line=~/oId (.*?);/
|
|
335
|
+
# oid=$1.gsub(/"/,'').to_sym
|
|
336
|
+
# line=~/nearest_ref (.*?);/
|
|
337
|
+
# nearest_ref = $1.gsub(/"/,'').to_sym
|
|
338
|
+
# dict[gene_id]={:transcript_id=>transcript_id, :gene_name=>gene_name, :odi=>oid, :nearest_ref=>nearest_ref}
|
|
339
|
+
# dict[transcript_id]={:gene_id=>gene_id, :gene_name=>gene_name, :odi=>oid, :nearest_ref=>nearest_ref}
|
|
340
|
+
# dict[gene_name]={:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :nearest_ref=>nearest_ref}
|
|
341
|
+
# dict[oid]={:gene_id=>gene_id, :transcript_id=>transcript_id, :gene_name=>gene_name, :nearest_ref=>nearest_ref}
|
|
342
|
+
# dict[nearest_ref]={:gene_id=>gene_id, :transcript_id=>transcript_id, :odi=>oid, :gene_name=>gene_name}
|
|
343
|
+
# end#lines
|
|
344
|
+
# end#file
|
|
345
|
+
# kb_filename = gtf.sub(/\.[a-zA-Z0-9]*$/,".kb")
|
|
346
|
+
# File.open(kb_filename,'w') do |fkb|
|
|
347
|
+
# #fkb.write(dict.to_json)
|
|
348
|
+
# Marshal.dump(dict,fkb)
|
|
349
|
+
# end #fkb
|
|
350
|
+
end
|
|
351
|
+
|
|
352
|
+
desc "isoform_exp GTF ISOFORM", "tag the XLOC gathering information from GTF (ensembl)"
|
|
353
|
+
#TODO: open a zip file, KB to optimize performance
|
|
354
|
+
def isoform_exp(gtf, isoform)
|
|
355
|
+
unless File.exists?(gtf)
|
|
356
|
+
STDERR.puts "File #{gtf} doesn't exist."
|
|
357
|
+
return nil
|
|
358
|
+
end
|
|
359
|
+
|
|
360
|
+
unless File.exists?(isoform)
|
|
361
|
+
STDERR.puts "File #{isoform} doesn't exist."
|
|
362
|
+
return nil
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
unless File.exists?(kb_filename = gtf.sub(/\.[a-zA-Z0-9]*$/,".kb"))
|
|
366
|
+
#build the kb
|
|
367
|
+
invoke :build_compare_kb, [gtf]
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
gtf_kb = Bio::Ngs::Cufflinks::Compare.load_compare_kb(kb_filename) # name must match the gtf_kb lookup in the write loop below
|
|
371
|
+
# gtf_kb = File.open(kb_filename,'r') do |kb_dump|
|
|
372
|
+
# Marshal.load(kb_dump)
|
|
373
|
+
# end
|
|
374
|
+
|
|
375
|
+
File.open("#{isoform}_rich", 'w') do |w|
|
|
376
|
+
File.open(isoform,'r') do |f|
|
|
377
|
+
w.write("ensembl_transcript_id\t#{f.readline}") #skip header and write to output files
|
|
378
|
+
f.each_line do |line|
|
|
379
|
+
data = line.split
|
|
380
|
+
w.write("#{gtf_kb[data[0].to_sym][:nearest_ref]}\t#{line}")
|
|
381
|
+
end #line
|
|
382
|
+
end #file read
|
|
383
|
+
end #file write
|
|
384
|
+
end#isoform_exp
|
|
385
|
+
|
|
386
|
+
end #Humanize
|
|
387
|
+
|
|
388
|
+
class De < Thor
|
|
389
|
+
|
|
390
|
+
#./bin/biongs convert:illumina:de:isoform /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff /Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf --min_samples=5 --fold=2 --min_fpkm=0.5 --z_score | sort > /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp_diff.txt
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
#Extract data from differential expression made by Cuffdiff.
|
|
394
|
+
#The user can request to export the data in a tabular format with data in fpkm or z-score (computed by row)
|
|
395
|
+
#Is possible to filter the results in different manners:
|
|
396
|
+
#by fold change: log2 (internally Cuffdiff compute a fold change with natural logarithm, this task made an internal conversion)
|
|
397
|
+
#by number of elements for which the fold change is verified among the remaining populations/samples
|
|
398
|
+
#by fpkm: a population/sample is taken into account by further selection steps if its fpkm value is greater than or equal to...
|
|
399
|
+
#the output is written to a tab delimited table, sorted by the first column:sample-discriminator.
|
|
400
|
+
#Output file name isoform_exp-f1_s5_fpkm0.5_z.txt, the parameters are written in the file name, so is possible to keep track of them
|
|
401
|
+
desc "isoform DIFF GTF", "extract the transcripts"
|
|
402
|
+
method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
|
|
403
|
+
method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
|
|
404
|
+
method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
|
|
405
|
+
method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
|
|
406
|
+
method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
|
|
407
|
+
method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
|
|
408
|
+
def isoform(diff_file, gtf)
|
|
409
|
+
how_regulated = options.up ? :up : :down
|
|
410
|
+
Bio::Ngs::Cufflinks::Diff.isoforms(diff_file,
|
|
411
|
+
gtf,
|
|
412
|
+
fold:options.fold,min_samples:options.min_samples,min_fpkm:options.min_fpkm,z_scores:options.z_scores, regulated:how_regulated)
|
|
413
|
+
end #de_isoform
|
|
414
|
+
|
|
415
|
+
desc "gene DIFF GTF", "extract the transcripts"
|
|
416
|
+
method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
|
|
417
|
+
method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
|
|
418
|
+
method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
|
|
419
|
+
method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
|
|
420
|
+
method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
|
|
421
|
+
method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
|
|
422
|
+
def gene(diff_file, gtf)
|
|
423
|
+
how_regulated = options.up ? :up : :down
|
|
424
|
+
Bio::Ngs::Cufflinks::Diff.genes(diff_file,
|
|
425
|
+
gtf,
|
|
426
|
+
fold:options.fold,min_samples:options.min_samples,min_fpkm:options.min_fpkm,z_scores:options.z_scores, regulated:how_regulated)
|
|
427
|
+
end #de_isoform
|
|
428
|
+
|
|
429
|
+
#convert:illumina:de:rename_qs /Users/bonnalraoul/Desktop/RRep16giugno/DEpopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp-f0.5_s5_fpkm0.5_zup.txt\
|
|
430
|
+
# Naive,Th1,Th17,Th2,Treg,Tfh
|
|
431
|
+
desc "rename_qs DIFF_FILE NAMES", 'rename q1,...,qn with names provided by the user(comma separated)'
|
|
432
|
+
def rename_qs(diff_file, names)
|
|
433
|
+
names_list = names.split(',')
|
|
434
|
+
File.open(diff_file+"_renamed",'w') do |w|
|
|
435
|
+
File.open(diff_file, 'r') do |f|
|
|
436
|
+
header = f.readline
|
|
437
|
+
names_list.each_with_index{|name,idx| header.gsub!(/q#{idx+1}(?!\d)/,name)} # (?!\d) keeps q1 from matching the prefix of q10, q11, ...
|
|
438
|
+
w.puts header
|
|
439
|
+
f.each_line do |line|
|
|
440
|
+
line.scan(/q\d+/).each do |q|
|
|
441
|
+
line.gsub!(/#{q}(?!\d)/,names_list[q.tr('q','').to_i-1]) # (?!\d) keeps q1 from matching the prefix of q10, q11, ...
|
|
442
|
+
end #scan
|
|
443
|
+
w.puts line
|
|
444
|
+
end #each_line
|
|
445
|
+
end# open-read
|
|
446
|
+
end #open-write
|
|
447
|
+
end
|
|
448
|
+
end #De
|
|
449
|
+
|
|
450
|
+
end #Illumina
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
end #Convert
|
|
454
|
+
# Add methods to Enumerable, which makes them available to Array
|