bio-ngs 0.3.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,454 @@
1
+ #
2
+ # convert.thor - Main task for converting data between NGS formats
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul J.P. Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+
12
+
13
+ module Convert
14
+
15
+ class Bam < Thor
16
+ # Sort and index the input bam filename
17
+ # the sorted/indexed output is created in the same directory of the input file
18
+ desc "sort BAM [PREFIX]", "Sort and create and index for the BAM file name"
# Sort a BAM file and build the index of the sorted file.
#
# bam_fn - path of the BAM file to sort.
# prefix - optional name (without extension) for the sorted file; when nil it
#          is derived from the input name by replacing the trailing ".bam"
#          with "_sort".
#
# The sorted/indexed output is created in the same directory as the input.
# Returns nil (after printing a warning) when the input file does not exist.
def sort(bam_fn, prefix=nil)
  unless File.exist?(bam_fn)
    warn "[#{Time.now}] There was an error, BAM file #{bam_fn} does not exist."
    return
  end

  # Only rewrite the extension, not every ".bam" occurring inside the name.
  prefix = File.basename(bam_fn).sub(/\.bam\z/, '_sort') if prefix.nil?
  bam_sort_fn = File.join(File.dirname(bam_fn), prefix)

  # bam_sort receives the output name without extension; the sorted file is
  # then expected at "<prefix>.bam", which is the file that gets indexed.
  Bio::DB::SAM::Tools.bam_sort(bam_fn, bam_sort_fn)
  Bio::DB::SAM::Tools.bam_index_build("#{bam_sort_fn}.bam")
end #sort
34
+
# Merge several BAM files into a single one through the samtools merge wrapper.
# The output name is passed first, followed by every input BAM.
desc "merge" ,"Merge multiple bams in a single one, BAMS separated by commmas"
method_option :input_bams, :type => :array, :required => true, :aliases => '-i'
# NOTE: the key is :required (it was misspelled :require, so the output was never enforced)
method_option :output, :type => :string, :required => true, :aliases => '-o'
Bio::Ngs::Samtools::Merge.new.thor_task(self, :merge) do |wrapper, task|
  wrapper.params = task.options
  wrapper.run :arguments => [task.options.output, task.options.input_bams].flatten
end
42
+
43
+ desc "extract_genes BAM GENES", "Extract GENES from bam. It connects to Ensembl Humnan, release 61 and download the coordinates for the inserted genes"
44
+ method_option :output, :type => :string, :desc => "output file name"
45
+ method_option :ensembl_specie, :type => :string, :desc => "default homo_sapiens", :default => 'homo_sapiens'
46
+ method_option :ensembl_release, :type => :numeric, :desc => "ensembl release", :required => true
# Extract the reads overlapping GENES (comma separated names) from a BAM file.
# Gene coordinates are looked up in Ensembl (species and release come from the
# task options); the subset BAM is then sorted and indexed through the :sort task.
Bio::Ngs::Samtools::View.new.thor_task(self, :extract_genes) do |wrapper, task, bam_fn, gene_names|
  if File.exist?(bam_fn)
    # Loaded lazily: the ensembl library is only needed by this task.
    require 'ensembl'
    ::Ensembl::Core::DBConnection.connect(task.options.ensembl_specie, task.options.ensembl_release)

    # One "region:start-end" string per gene found; unknown genes are reported and skipped.
    genes_str = gene_names.split(',').map do |gene|
      g = ::Ensembl::Core::Gene.find_by_name(gene)
      if g
        "#{g.seq_region.name}:#{g.seq_region_start}-#{g.seq_region_end}"
      else
        warn "Can't find gene #{gene} in Ensembl #{task.options.ensembl_specie}, release #{task.options.ensembl_release} "
        nil
      end
    end.compact

    unless genes_str.empty?
      output_name = task.options.output || bam_fn.gsub(/\.bam/, "_subset.bam")
      # NOTE(review): genes_str is passed as a nested array, unlike merge which flattens — confirm the wrapper accepts it.
      wrapper.run :arguments => [output_name, bam_fn, genes_str]
      task.invoke :sort, [output_name]
      puts "Find your data in #{output_name} and #{output_name.gsub(/\.bam/,"_sort.bam")}"
    end
  else
    # Previously a missing file was silently ignored after connecting to Ensembl.
    warn "Bam file #{bam_fn} does not exist or you don't have the rights to open it."
  end
end
69
+ end # Bam
70
+
71
+ module Qseq
72
+ class Fastq < Thor
73
+ desc "by_file FIRST OUTPUT", "Convert a qseq file into fastq"
74
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format'
75
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
76
+ method_option :dir, :type => :string, :default=>".", :desc => 'Path to the working directory (data)'
77
+ # output is just a string I'll attach the fastq extension
# Convert one or more qseq files into a single fastq file.
#
# first  - a qseq file name (String) or a list of file names (Array).
# output - output name without extension; "<output>.fastq" and
#          "<output>.stats" are written inside options.dir.
#
# Input file names are used as given (options.dir is NOT prepended to them:
# the by_lane tasks already pass paths built from the working directory).
# The first input truncates the outputs unless options.append is set; every
# following input is appended, so a list of files ends up in one dataset
# instead of each file overwriting the previous one.
def by_file(first, output)
  qseq = Bio::Ngs::Converter::Qseq.new(options.paired ? :pe : :se)
  fastq_path = File.join(options.dir, "#{output}.fastq")
  stats_path = File.join(options.dir, "#{output}.stats")

  Array(first).each_with_index do |file_name, position|
    mode = (options.append || position > 0) ? 'a' : 'w'

    # Block forms guarantee both handles are closed even if the conversion fails.
    File.open(file_name, 'r') do |qseq_file|
      qseq.buffer = qseq_file
      File.open(fastq_path, mode) do |fastq_file|
        qseq.to_fastq do |fastq|
          fastq_file.puts fastq if fastq
        end
      end
    end

    #Write the report
    File.open(stats_path, mode) do |file|
      file.puts({:file_name => file_name, :stats => qseq.stats}.to_yaml)
    end
  end #buffers
end #by_file
97
+
98
+ # This task is used to aggregate the data demultiplexed from Illumina OLB 1.9 and CASAVA 1.7.
99
+ # Demultiplexing software splits the reads in different subdirectories based on the tag index of the reads,
100
+ # usually the wet-lab puts a population in a single lane and tags it with different indexes. The demultiplexer
101
+ # behaviour is not so clear, so this task takes care of simplifying the aggregation for the final dataset.
102
+ # Output: 2 files
103
+ # 1) Forward fastq
104
+ # 2) Reverse fastq
105
+ desc "by_lane LANE OUTPUT", "Convert all the file in the current and descendant directories belonging to the specified lane in fastq. This command is specific for Illumina qseqs file s_#LANE_#STRAND_#TILE. Note UNKOWN directory is excluded by default."
106
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
107
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
108
+ method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
109
+ # output is just a string I'll attach the fastq extension
# Convert every qseq file of a lane into fastq, one background daemon per strand.
#
# lane   - lane number used in the qseq file names (s_<lane>_<strand>_<tile>_qseq.txt).
# output - output prefix; "_forward" / "_reverse" is appended for each strand.
#
# Files are searched in the "00?" sub-directories of options.dir (or of the
# current directory when no dir is given). The reverse strand is processed
# only when the paired option is set.
def by_lane(lane, output)
  work_dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append
  strand_numbers = {:forward => 1, :reverse => 2}

  # Runs the by_file conversion for all the tiles of one strand.
  convert_strand = lambda do |strand|
    qseq_glob = File.join(work_dir, "00?/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")
    invoke :by_file, [Dir[qseq_glob], "#{output}_#{strand}"], :paired => paired, :append => append, :dir => work_dir
  end

  # Starts a daemon named "<strand>_<lane>" wrapping the conversion of that strand.
  start_daemon = lambda do |strand|
    daemon_options = {
      :app_name => "#{strand}_#{lane}",
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc("#{strand}_#{lane}", daemon_options) do
      convert_strand.call(strand)
    end
  end

  start_daemon.call(:forward)
  start_daemon.call(:reverse) if options.paired
end #by_lane
142
+
143
+ desc "by_lane_index LANE INDEX OUTPUT", "Convert the qseq from a line and index in a fastq file"
144
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
145
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
146
+ method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
147
+ # output is just a string I'll attach the fastq extension
# Convert the qseq files of one lane and one index (sample) into fastq,
# one background daemon per strand.
#
# lane   - lane number used in the qseq file names.
# index  - index number; it is zero padded to three digits to obtain the
#          sub-directory name (e.g. 5 -> "005").
# output - output prefix; "_forward" / "_reverse" is appended for each strand.
#
# The daemons are started with :dir_mode => :normal and :dir set to the
# working directory. The reverse strand is processed only when the paired
# option is set.
def by_lane_index(lane, index, output)
  work_dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append
  index_str = "%03d" % index
  strand_numbers = {:forward => 1, :reverse => 2}

  # Runs the by_file conversion for all the tiles of one strand.
  convert_strand = lambda do |strand|
    qseq_glob = File.join(work_dir, "#{index_str}/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")
    invoke :by_file, [Dir[qseq_glob], "#{output}_#{strand}"], :paired => paired, :append => append, :dir => work_dir
  end

  # Starts a daemon named "<strand>_<lane>_<index>" wrapping the conversion of that strand.
  start_daemon = lambda do |strand|
    daemon_options = {
      :app_name => "#{strand}_#{lane}_#{index_str}",
      :ARGV => ['start'],
      :log_output => true,
      :dir_mode => :normal,
      :dir => work_dir}
    ::Daemons.run_proc("#{strand}_#{lane}_#{index_str}", daemon_options) do
      convert_strand.call(strand)
    end
  end

  start_daemon.call(:forward)
  start_daemon.call(:reverse) if options.paired
end #by_lane_index
184
+
185
+ # SAMPLES = 1,2,3,4
186
+ # LANE = 1
187
+ # OUTPUT = file name prefix; output file names will be OUTPUT-Sample_N....
188
+ desc "samples_by_lane SAMPLES LANE OUTPUT", "Convert the qseqs for each sample in a specific lane. SAMPLES is an array of index codes separated by commas lane is an integer"
189
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
190
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
# Convert the qseq files of several samples of one lane, one daemon per sample.
#
# samples - comma separated list of index codes (e.g. "1,2,3,4").
# lane    - lane number.
# output  - output prefix; each sample is written as "<output>-Sample_<N>".
#
# Every daemon invokes by_lane_index using the current directory as working directory.
def samples_by_lane(samples, lane, output)
  work_dir = Dir.pwd
  samples.split(",").each do |sample|
    sample_idx = sample.to_i
    daemon_options = {
      :app_name => "sample#{sample}_by_lane-#{lane}",
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc("sample#{sample}_by_lane-#{lane}", daemon_options) do
      invoke :by_lane_index, [lane, sample_idx, "#{output}-Sample_#{sample_idx}"], :paired => options.paired, :append => options.append, :dir => work_dir
    end
  end
end #samples_by_lane
202
+
203
+ end #Fastq
204
+ end #Qseq
205
+
206
+ module Bcl
207
+ class Qseq < Thor
208
+ desc "convert RUN OUTPUT [JOBS]", "Convert a bcl dataset in qseq"
# Convert a bcl dataset into qseq: first configure the run, then start the conversion.
#
# run_basecalls_root - root directory of the run.
# output             - output directory handed to the configuration step.
# jobs               - number of parallel jobs handed to the conversion step.
#
# NOTE(review): the task description advertises "convert" while the method
# (and so the task) is named "converts" — confirm which one is intended.
def converts(run_basecalls_root, output, jobs=1)
  invoke :configure_conversion, [run_basecalls_root, output]
  # The conversion task is start_conversion; :run_bcl_to_qseq is not defined in this class.
  invoke :start_conversion, [run_basecalls_root, jobs]
end #bcl_to_qseq
213
+
214
+ desc "configure_conversion RUN_DIR OUTPUT ", "Configure the specific Run to be converted", :hide => true
# Hidden task: configure the conversion of a run. The run directory is stored
# in the base_calls_directory option before the wrapper is run; the OUTPUT
# argument is accepted but not used by this block.
Bio::Ngs::Bclqseq.new.thor_task(self, :configure_conversion) do |wrapper, task, run_basecalls_root, output|
  # Previous approach, kept for reference:
  #wrapper.params={"base-calls-directory" => "#{run_basecalls_root}/Data/Intensities/BaseCalls", "output-directory" => output}
  task.options.base_calls_directory = run_basecalls_root
  wrapper.run
end #setup_bcl_conversion
221
+
222
+ desc "start_conversion RUN_DIR [JOBS] ", "Start the conversion", :hide => true
223
+ method_option :prova, :type => :string
# Print the make command that converts the run found under
# <run_basecalls_root>/Data/Intensities/BaseCalls using the given number of jobs.
# The command is only printed, it is not executed here.
def start_conversion(run_basecalls_root, jobs=1)
  basecalls_dir = "#{run_basecalls_root}/Data/Intensities/BaseCalls"
  puts "make recursive -j #{jobs} -f #{basecalls_dir}/Makefile -C #{basecalls_dir}"
end #run_bcl_to_qseq
229
+ end #Qseq
230
+ end #Bcl
231
+
232
+
233
+
234
+ module Illumina
235
+ class Fastq < Thor
236
+
237
+ # Trim fastq sequences (Illumina format 1.5+):
238
+ # ------------------BBBBBBBBBBBBBBBBB
239
+ # ------------------
240
+ # As a first step trailing Bs are removed; the read is kept only if the remaining sequence is long enough.
241
+ # The user can specify the minimum length of the sequence and the number of Bs to search in the middle.
242
+ # If the user passes an output file name, that will be used as suffix for the other output files.
243
+ # If no file name is passed the input file name will be used as suffix.
244
+ # Output: 4 files
245
+ # 1) xxx_trim.fastq the trimmed sequences in fastq format
246
+ # 2) xxx_rejected.fastq
247
+ # 3) xxx_profile.csv the length distribution of the trimmed sequences
248
+ # 4) xxx_report.csv statistics on processed reads as total number of reads in input,
249
+ # trimmed, removed, untouched ( not trimmed)
250
+ # Note: removed reads are the ones which start with a B
251
+ # IMPORTANT: Data in FastQ format MUST NOT BE WRAPPED: sequence and quality MUST BE ON 1 LINE EACH
252
+ desc "trim_b FASTQ", "perform a trim on all the sequences on B qualities with Illumina's criteria. Ref to CASAVA manual."
253
+ #TODO, report the legth/profile of all the sequences.
254
+ #TODO: implement different strategies for trimming, N consecutive Bs ?
255
+ #TODO: implement min length for a trimmed sequnce to be reported as valid.
256
+ method_option :fileout, :type => :string
257
+ method_option :min_size, :type =>:numeric, :default => 20, :aliases => '-s', :desc => 'minimum length to consider a trimmed sequence as valid, otherwise it will be discarded'
# Trim the trailing run of 'B' qualities from every read of a fastq file.
#
# fastq - path of the input file; every record must span exactly 4 lines
#         (header, sequence, "+", quality).
#
# Options: :fileout (base name for the outputs, defaults to the input name)
# and :min_size (minimum length of a read after trimming).
#
# Four files are written, named through Bio::Ngs::Utils.tag_filename:
#   trim          - reads kept, cut at the start of the trailing B run
#   trim_rejected - reads whose remaining length is below the minimum
#   trim_profile  - length distribution of the kept reads
#   trim_report   - reads processed / trimmed / removed / untouched
# A read counts as "trimmed" only when at least one B was cut away;
# kept reads without trailing Bs are reported as "untouched".
def trim_b(fastq)
  output_filename_base = options[:fileout].nil? ? fastq : options.fileout
  count_total = 0
  count_trimmed = 0
  count_untouched = 0
  count_removed = 0
  sequences_profile = Hash.new(0)
  head = ""
  seq = ""
  # A read is kept when the index of its trailing B run is greater than this threshold.
  min_size = (options[:min_size] > 1) ? (options[:min_size] - 1) : 0
  # Always matches: either the trailing Bs or the empty string at the end of the line.
  trailing_b_pattern = /B*\z/
  line_number = 0

  File.open(fastq, 'r') do |reads|
    File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_rejected", "fastq"), 'w') do |r_rejected|
      File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim", "fastq"), 'w') do |f|
        reads.each_line do |raw_line|
          # Line terminators (LF or CRLF) are dropped so they never hide the trailing Bs.
          line = raw_line.chomp
          case line_number % 4
          when 0
            head = line
            count_total += 1
          when 1
            seq = line
          #2 is the plus sign
          when 3
            b_tail_idx = (line =~ trailing_b_pattern)
            if b_tail_idx > min_size
              if b_tail_idx < line.length
                count_trimmed += 1
              else
                count_untouched += 1
              end
              sequences_profile[b_tail_idx] += 1
              f.puts head, seq[0, b_tail_idx], "+", line[0, b_tail_idx]
            else
              count_removed += 1
              r_rejected.puts head, seq, "+", line
            end
          end #case
          line_number += 1
        end #read
      end #Write fastq
    end #Write rejected
  end #Read fastq

  #Profile
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_profile", "csv"), 'w') do |f_profile|
    f_profile.puts "Sequnce length,count"
    sequences_profile.sort.each do |read_size, read_number|
      f_profile.puts "#{read_size},#{read_number}"
    end
  end #Write profile

  #Report
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_report", "csv"), 'w') do |report|
    report.puts "Reads processed,Reads trimmed,Reads removed,Reads untouched"
    report.puts "#{count_total},#{count_trimmed},#{count_removed},#{count_untouched}"
  end #Write report
end #trim_b
312
+ end #Fastq
313
+
314
+ class Humanize < Thor
315
+ require 'json'
316
+
317
+ desc "build_compare_kb GTF", "Build the JSON file with the annoation from the GTF file used to humanize the results"
318
+ #TODO: create a zip file to optimize the space.
# Build the knowledge base for a GTF file by delegating to
# Bio::Ngs::Cufflinks::Compare.build_compare_kb.
#
# gtf - path of the GTF file.
#
# The previous inline implementation (it parsed gene_id, transcript_id,
# gene_name, oId and nearest_ref from every GTF line, cross-indexed them in a
# Hash and Marshal-dumped it to "<gtf without extension>.kb") was commented
# out in favour of the library call and has been dropped from here.
def build_compare_kb(gtf)
  Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
end
351
+
352
+ desc "isoform_exp GTF ISOFORM", "tag the XLOC gathering information from GTF (ensembl)"
353
+ #TODO: open a zip file,KB to optimez performances
# Prepend to every row of an isoform table the nearest reference found in the
# knowledge base built from a GTF file.
#
# gtf     - path of the GTF file; its knowledge base is expected at the same
#           path with the extension replaced by ".kb" and is built through the
#           build_compare_kb task when missing.
# isoform - path of the table to annotate; the first line is the header and the
#           first column of every other line is the id looked up in the knowledge base.
#
# The result is written to "<isoform>_rich" with an extra first column named
# "ensembl_transcript_id". Ids missing from the knowledge base get an empty value.
# Returns nil after printing a message when one of the input files does not exist.
def isoform_exp(gtf, isoform)
  [gtf, isoform].each do |path|
    unless File.exist?(path)
      STDERR.puts "File #{path} doesn't exist."
      return nil
    end
  end

  kb_filename = gtf.sub(/\.[a-zA-Z0-9]*$/, ".kb")
  #build the kb
  invoke :build_compare_kb, [gtf] unless File.exist?(kb_filename)

  # presumably a Hash keyed by symbols whose values expose :nearest_ref — verify against load_compare_kb
  gtf_kb = Bio::Ngs::Cufflinks::Compare.load_compare_kb(kb_filename)

  File.open("#{isoform}_rich", 'w') do |w|
    File.open(isoform, 'r') do |f|
      w.write("ensembl_transcript_id\t#{f.readline}") #skip header and write to output files
      f.each_line do |line|
        id = line.split.first
        entry = id && gtf_kb[id.to_sym]
        nearest_ref = entry && entry[:nearest_ref]
        w.write("#{nearest_ref}\t#{line}")
      end #line
    end #file read
  end #file write
end#isoform_exp
385
+
386
+ end #Humanize
387
+
388
+ class De < Thor
389
+
390
+ #./bin/biongs convert:illumina:de:isoform /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff /Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf --min_samples=5 --fold=2 --min_fpkm=0.5 --z_score | sort > /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp_diff.txt
391
+
392
+
393
+ #Extract data from differential expression made by Cuffdiff.
394
+ #The user can request to export the data in a tabular format with data in fpkm or z-score (computed by row)
395
+ #Is possible to filter the results in different manners:
396
+ #by fold change: log2 (internally Cuffdiff compute a fold change with natural logarithm, this task made an internal conversion)
397
+ #by number of elements for which the fold change is verified among the remaining populations/samples
398
+ #by fpkm: a population/sample is taken into account by further selection steps if its fpkm value is greater than or equal to...
399
+ #the output is written to a tab delimited table, sorted by the first column: sample-discriminator.
400
+ #Output file name isoform_exp-f1_s5_fpkm0.5_z.txt, the parameters are written in the file name, so is possible to keep track of them
401
+ desc "isoform DIFF GTF", "extract the transcripts"
402
+ method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
403
+ method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
404
+ method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
405
+ method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
406
+ method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
407
+ method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
# Extract the differentially expressed isoforms from a Cuffdiff output.
#
# diff_file - path of the isoform differential expression file.
# gtf       - path of the GTF file.
#
# The fold, min_samples, min_fpkm and z_scores options are forwarded as they
# are; the boolean "up" option is translated into :up / :down.
def isoform(diff_file, gtf)
  Bio::Ngs::Cufflinks::Diff.isoforms(
    diff_file,
    gtf,
    fold: options.fold,
    min_samples: options.min_samples,
    min_fpkm: options.min_fpkm,
    z_scores: options.z_scores,
    regulated: (options.up ? :up : :down)
  )
end #de_isoform
414
+
415
+ desc "gene DIFF GTF", "extract the transcripts"
416
+ method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
417
+ method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
418
+ method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
419
+ method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
420
+ method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
421
+ method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
# Extract the differentially expressed genes from a Cuffdiff output.
#
# diff_file - path of the gene differential expression file.
# gtf       - path of the GTF file.
#
# The fold, min_samples, min_fpkm and z_scores options are forwarded as they
# are; the boolean "up" option is translated into :up / :down.
def gene(diff_file, gtf)
  Bio::Ngs::Cufflinks::Diff.genes(
    diff_file,
    gtf,
    fold: options.fold,
    min_samples: options.min_samples,
    min_fpkm: options.min_fpkm,
    z_scores: options.z_scores,
    regulated: (options.up ? :up : :down)
  )
end #de_gene
428
+
429
+ #convert:illumina:de:rename_qs /Users/bonnalraoul/Desktop/RRep16giugno/DEpopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp-f0.5_s5_fpkm0.5_zup.txt\
430
+ # Naive,Th1,Th17,Th2,Treg,Tfh
431
+ desc "rename_qs DIFF_FILE NAMES", 'rename q1,...,qn with names provided by the user(comma separated)'
# Replace the generic sample labels q1,...,qn with user supplied names.
#
# diff_file - path of the file to process; the result is written to
#             "<diff_file>_renamed".
# names     - comma separated names; the first replaces q1, the second q2, ...
#
# Every line (header included) is rewritten in a single pass, matching the
# whole number after "q": q10 is never mistaken for q1 followed by "0", and a
# name that itself looks like a label is not renamed a second time.
# Labels without a corresponding name (q0 or beyond the list) are left untouched.
def rename_qs(diff_file, names)
  names_list = names.split(',')
  File.open(diff_file + "_renamed", 'w') do |w|
    File.open(diff_file, 'r') do |f|
      f.each_line do |line|
        renamed = line.gsub(/q(\d+)/) do |label|
          position = Regexp.last_match(1).to_i
          (position >= 1 && names_list[position - 1]) ? names_list[position - 1] : label
        end
        w.puts renamed
      end #each_line
    end # open-read
  end #open-write
end
448
+ end #De
449
+
450
+ end #Illumina
451
+
452
+
453
+ end #Convert
454
+ # Add methods to Enumerable, which makes them available to Array