bio-ngs 0.3.2.alpha.01

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,454 @@
1
+ #
2
+ # convert.thor - Main task for converting data between NGS formats
3
+ #
4
+ # Copyright:: Copyright (C) 2011
5
+ # Raoul J.P. Bonnal <r@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ #
9
+
10
+
11
+
12
+
13
+ module Convert
14
+
15
+ class Bam < Thor
16
+ # Sort and index the input bam filename
17
+ # the sorted/indexed output is created in the same directory of the input file
18
+ desc "sort BAM [PREFIX]", "Sort and create and index for the BAM file name"
19
# Sort a BAM file and build an index for the sorted result.
#
# The sorted file is written in the directory of the input file. When
# +prefix+ is nil it is derived from the input file name by replacing
# ".bam" with "_sort". The code assumes the sort step appends ".bam" to
# the prefix it is given, so the index is built on "<prefix>.bam".
#
# bam_fn:: path of the BAM file to sort
# prefix:: optional base name (without extension) of the sorted file
#
# Warns and returns nil when +bam_fn+ does not exist.
def sort(bam_fn, prefix=nil)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  unless File.exist?(bam_fn)
    warn "[#{Time.now}] There was an error, BAM file #{bam_fn} does not exist"
    return
  end

  prefix = File.basename(bam_fn).gsub(/\.bam/, '_sort') if prefix.nil?
  bam_sort_fn = File.join(File.dirname(bam_fn), prefix)
  Bio::DB::SAM::Tools.bam_sort(bam_fn, bam_sort_fn)
  # index the freshly sorted file
  Bio::DB::SAM::Tools.bam_index_build("#{bam_sort_fn}.bam")
end #sort
34
+
35
# Thor task "merge": registers a task through the thor_task helper of
# Bio::Ngs::Samtools::Merge. The block forwards the task options to the
# wrapper and runs it with the output file followed by the input BAMs.
desc "merge" ,"Merge multiple bams in a single one, BAMS separated by commmas"
method_option :input_bams, :type => :array, :required => true, :aliases => '-i'
# Thor's key for a mandatory option is :required; the original :require was
# a typo, so a missing --output was never reported.
method_option :output, :type => :string, :required => true, :aliases => '-o'
Bio::Ngs::Samtools::Merge.new.thor_task(self, :merge) do |wrapper, task|
  wrapper.params = task.options
  # output name first, then every input BAM as its own argument
  wrapper.run :arguments => [task.options.output, task.options.input_bams].flatten
end
42
+
43
# Thor task "extract_genes": looks up the coordinates of the requested
# genes in Ensembl, passes them with the BAM file to the
# Bio::Ngs::Samtools::View wrapper, then invokes the :sort task on the
# resulting file.
#
# GENES is a comma separated list of gene names. Genes that cannot be found
# are reported with a warning and skipped. Nothing is run when the BAM file
# is missing or no gene could be resolved.
desc "extract_genes BAM GENES", "Extract GENES from bam. It connects to Ensembl Humnan, release 61 and download the coordinates for the inserted genes"
method_option :output, :type => :string, :desc => "output file name"
method_option :ensembl_specie, :type => :string, :desc => "default homo_sapiens", :default => 'homo_sapiens'
method_option :ensembl_release, :type => :numeric, :desc => "ensembl release", :required => true
Bio::Ngs::Samtools::View.new.thor_task(self, :extract_genes) do |wrapper, task, bam_fn, gene_names|
  require 'ensembl' # loaded lazily: only this task needs the Ensembl API
  ::Ensembl::Core::DBConnection.connect(task.options.ensembl_specie, task.options.ensembl_release)

  # One "region:start-end" string per gene that Ensembl knows about.
  genes_str = gene_names.split(',').map do |gene|
    g = ::Ensembl::Core::Gene.find_by_name(gene)
    if g
      "#{g.seq_region.name}:#{g.seq_region_start}-#{g.seq_region_end}"
    else
      warn "Can't find gene #{gene} in Ensembl #{task.options.ensembl_specie}, release #{task.options.ensembl_release} "
      nil # dropped by compact below
    end
  end.compact

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  if File.exist?(bam_fn) && !genes_str.empty?
    output_name = task.options.output || bam_fn.gsub(/\.bam/, "_subset.bam")
    # NOTE(review): genes_str is passed as a nested array, unlike the merge
    # task which flattens its arguments - confirm the wrapper accepts this.
    wrapper.run :arguments => [output_name, bam_fn, genes_str]
    task.invoke :sort, [output_name]
    puts "Find your data in #{output_name} and #{output_name.gsub(/\.bam/,"_sort.bam")}"
  end
  # TODO: the original carried a disabled (commented-out) rescue that was
  # meant to report an unreadable or missing BAM file.
end
69
+ end # Bam
70
+
71
+ module Qseq
72
+ class Fastq < Thor
73
+ desc "by_file FIRST OUTPUT", "Convert a qseq file into fastq"
74
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format'
75
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
76
+ method_option :dir, :type => :string, :default=>".", :desc => 'Path to the working directory (data)'
77
+ # output is just a string I'll attach the fastq extension
78
# Convert one or more qseq files into a single fastq file and write a YAML
# report next to it.
#
# first::  a qseq file name, or an Array of file names
# output:: base name of the results; "<output>.fastq" and "<output>.stats"
#          are created inside options.dir
#
# Unless options.append is set, the output files are truncated once, before
# the first input file is processed. Every following input file is appended,
# so a list of files ends up aggregated in the same output (the original
# reopened the files with 'w' for each input and kept only the last one).
def by_file(first, output)
  qseq = Bio::Ngs::Converter::Qseq.new(options.paired ? :pe : :se)
  fastq_path = File.join(options.dir, "#{output}.fastq")
  stats_path = File.join(options.dir, "#{output}.stats")

  Array(first).each_with_index do |file_name, position|
    mode = (options.append || position > 0) ? 'a' : 'w'
    # TODO: options.dir is not applied to the input path, it could be a bug
    File.open(file_name, 'r') do |input|
      qseq.buffer = input
      File.open(fastq_path, mode) do |fastq_file|
        qseq.to_fastq do |fastq|
          fastq_file.puts fastq if fastq
        end
      end
    end
    # Write the report
    File.open(stats_path, mode) do |file|
      file.puts({:file_name=>file_name, :stats=>qseq.stats}.to_yaml)
    end
  end
end #by_file
97
+
98
+ # This tasks is used to aggregate the data demultiplexed from Illumina OLB 1.9 and CASAVA 1.7.
99
+ # Demultiplexing software splits the reads in different subdirectories based on the tag index of the reads,
100
+ # usually the wet-lab puts a population in a single lane and tags it with different indexes. The demultiplexer
101
+ # behaviour is not so clear, so this task takes care of simplifying the aggregation for the final dataset.
102
+ # Output: 2 files
103
+ # 1) Forward fastq
104
+ # 2) Reverse fastq
105
+ desc "by_lane LANE OUTPUT", "Convert all the file in the current and descendant directories belonging to the specified lane in fastq. This command is specific for Illumina qseqs file s_#LANE_#STRAND_#TILE. Note UNKOWN directory is excluded by default."
106
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
107
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
108
+ method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
109
+ # output is just a string I'll attach the fastq extension
110
# Convert every qseq file of a lane into fastq, one output per strand.
#
# lane::   lane number used in the file pattern s_<lane>_<strand>_*_qseq.txt
# output:: base name; "_forward" / "_reverse" is appended per strand
#
# Files are searched in the "00?" sub-directories of options.dir (the
# current directory when the option is not given). Each strand is handed to
# ::Daemons.run_proc, which calls the :by_file task; the reverse strand is
# processed only when options.paired is set.
def by_lane(lane, output)
  dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append
  strand_numbers = { :forward => 1, :reverse => 2 }

  launch = lambda do |strand|
    daemon_name = "#{strand}_#{lane}"
    daemon_opts = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc(daemon_name, daemon_opts) do
      qseq_files = Dir[File.join(dir, "00?/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")]
      invoke :by_file, [qseq_files, "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
    end
  end

  launch.call(:forward)
  launch.call(:reverse) if options.paired
end #by_lane
142
+
143
+ desc "by_lane_index LANE INDEX OUTPUT", "Convert the qseq from a line and index in a fastq file"
144
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
145
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
146
+ method_option :dir, :type => :string, :desc => 'Path to the working directory (data)'
147
+ # output is just a string I'll attach the fastq extension
148
# Convert the qseq files of one lane and one index (sample) into fastq.
#
# lane::   lane number used in the file pattern s_<lane>_<strand>_*_qseq.txt
# index::  index number; formatted as a three digit directory name ("%03d")
# output:: base name; "_forward" / "_reverse" is appended per strand
#
# Files are searched in the index sub-directory of options.dir (the current
# directory when the option is not given). Each strand is handed to
# ::Daemons.run_proc with :dir_mode => :normal and :dir set to that
# directory; the reverse strand is processed only when options.paired is set.
def by_lane_index(lane, index, output)
  dir = options.dir || Dir.pwd
  paired = options.paired
  append = options.append
  index_str = "%03d" % index
  strand_numbers = { :forward => 1, :reverse => 2 }

  launch = lambda do |strand|
    daemon_name = "#{strand}_#{lane}_#{index_str}"
    daemon_opts = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true,
      :dir_mode => :normal,
      :dir => dir}
    ::Daemons.run_proc(daemon_name, daemon_opts) do
      qseq_files = Dir[File.join(dir, "#{index_str}/s_#{lane}_#{strand_numbers[strand]}_*_qseq.txt")]
      invoke :by_file, [qseq_files, "#{output}_#{strand}"], :paired => paired, :append => append, :dir => dir
    end
  end

  launch.call(:forward)
  launch.call(:reverse) if options.paired
end #by_lane_index
184
+
185
+ # SAMPLES = 1,2,3,4
186
+ # LANE = 1
187
+ # OUTPUT = File name prefix; the output file names will be OUTPUT-Sample_N....
188
+ desc "samples_by_lane SAMPLES LANE OUTPUT", "Convert the qseqs for each sample in a specific lane. SAMPLES is an array of index codes separated by commas lane is an integer"
189
+ method_option :paired, :type => :boolean, :default => false, :desc => 'Convert the reads in the paired format searching in the directories.'
190
+ method_option :append, :type => :boolean, :default => false, :desc => 'Append this convertion to the output file if exists'
191
# Convert the qseq files of several samples belonging to the same lane.
#
# samples:: comma separated index codes, e.g. "1,2,3,4"
# lane::    lane number
# output::  file name prefix; each sample is written as
#           "<output>-Sample_<index>"
#
# For every sample the :by_lane_index task is invoked inside
# ::Daemons.run_proc, using the current directory as working directory.
def samples_by_lane(samples, lane, output)
  dir = Dir.pwd
  samples.split(",").each do |sample|
    sample_idx = sample.to_i
    daemon_name = "sample#{sample}_by_lane-#{lane}"
    daemon_opts = {
      :app_name => daemon_name,
      :ARGV => ['start'],
      :log_output => true}
    ::Daemons.run_proc(daemon_name, daemon_opts) do
      invoke :by_lane_index, [lane, sample_idx, "#{output}-Sample_#{sample_idx}"], :paired => options.paired, :append =>options.append, :dir => dir
    end
  end
end #samples_by_lane
202
+
203
+ end #Fastq
204
+ end #Qseq
205
+
206
+ module Bcl
207
+ class Qseq < Thor
208
+ desc "convert RUN OUTPUT [JOBS]", "Convert a bcl dataset in qseq"
209
# Convert a bcl dataset in qseq: configure the run, then start the
# conversion.
#
# run_basecalls_root:: root directory of the run
# output::             output directory handed to :configure_conversion
# jobs::               number of jobs handed to :start_conversion
#
# NOTE(review): the task usage string says "convert" while the method (and
# therefore the task) is named "converts" - confirm which one is intended.
def converts(run_basecalls_root, output, jobs=1)
  invoke :configure_conversion, [run_basecalls_root, output]
  # The original invoked :run_bcl_to_qseq, which is not defined in this
  # class; the task that starts the conversion is :start_conversion.
  invoke :start_conversion, [run_basecalls_root, jobs]
end #bcl_to_qseq
213
+
214
# Hidden Thor task "configure_conversion", registered through the thor_task
# helper of Bio::Ngs::Bclqseq. It stores the run directory in the task
# options as base_calls_directory and runs the wrapper.
desc "configure_conversion RUN_DIR OUTPUT ", "Configure the specific Run to be converted", :hide => true
Bio::Ngs::Bclqseq.new.thor_task(self, :configure_conversion) do |bcl2qseq, thor, run_basecalls_root, output|
  # NOTE(review): +output+ is received but never forwarded; an earlier
  # (disabled) version set "base-calls-directory" to
  # "<run>/Data/Intensities/BaseCalls" and "output-directory" to +output+.
  thor.options.base_calls_directory = run_basecalls_root
  bcl2qseq.run
end #setup_bcl_conversion
221
+
222
+ desc "start_conversion RUN_DIR [JOBS] ", "Start the conversion", :hide => true
223
+ method_option :prova, :type => :string
224
# Print the make command that converts the run located in
# run_basecalls_root, using the Makefile found in its
# Data/Intensities/BaseCalls directory. The command is only printed, it is
# not executed here.
#
# run_basecalls_root:: root directory of the run
# jobs::               value passed to make's -j option (default 1)
def start_conversion(run_basecalls_root, jobs=1)
  basecalls_dir = "#{run_basecalls_root}/Data/Intensities/BaseCalls"
  puts "make recursive -j #{jobs} -f #{basecalls_dir}/Makefile -C #{basecalls_dir}"
end #run_bcl_to_qseq
229
+ end #Qseq
230
+ end #Bcl
231
+
232
+
233
+
234
+ module Illumina
235
+ class Fastq < Thor
236
+
237
+ # Trim fastq sequences (Illumina format 1.5+):
238
+ # ------------------BBBBBBBBBBBBBBBBB
239
+ # ------------------
240
+ # First step: trailing Bs are removed; the read is kept only if the remaining sequence is long enough.
241
+ # The user can specify the minimum length of the sequence and the number of Bs to search in the middle.
242
+ # If the user passes an output file name, that will be used as suffix for the other output files.
243
+ # If no file name is passed the input file name will be used as suffix.
244
+ # Output: 4 files
245
+ # 1) xxx_trim.fastq the trimmed sequences in fastq format
246
+ # 2) xxx_rejected.fastq
247
+ # 3) xxx_profile.csv the length distribution of the trimmed sequnces
248
+ # 4) xxx_report.csv statistics on processed reads as total number of reads in input,
249
+ # trimmed, removed, untouched ( not trimmed)
250
+ # Note: removed reads are the ones which start with a B
251
+ # IMPORTANT: Data in FastQ format MUST NOT BE WRAPPED: sequence and quality MUST BE ON 1 LINE EACH
252
+ desc "trim_b FASTQ", "perform a trim on all the sequences on B qualities with Illumina's criteria. Ref to CASAVA manual."
253
+ #TODO, report the legth/profile of all the sequences.
254
+ #TODO: implement different strategies for trimming, N consecutive Bs ?
255
+ #TODO: implement min length for a trimmed sequnce to be reported as valid.
256
+ method_option :fileout, :type => :string
257
+ method_option :min_size, :type =>:numeric, :default => 20, :aliases => '-s', :desc => 'minimum length to consider a trimmed sequence as valid, otherwise it will be discarded'
258
# Trim the trailing run of "B" qualities from every read of a fastq file.
#
# fastq:: path of the input file; each record must span exactly 4 lines
#         (header, sequence, plus sign, quality), i.e. not wrapped
#
# Output names are built with Bio::Ngs::Utils.tag_filename from
# options[:fileout], or from the input name when that option is not given:
# * "trim" fastq          reads kept after removing the trailing Bs
# * "trim_rejected" fastq reads whose remaining length is below
#                         options[:min_size]
# * "trim_profile" csv    length distribution of the kept reads
# * "trim_report" csv     reads processed, trimmed, removed and untouched
#
# A kept read counts as "trimmed" only when at least one trailing B was
# removed; kept reads without a B tail are reported as "untouched".
def trim_b(fastq)
  output_filename_base = options[:fileout].nil? ? fastq : options.fileout
  # the kept length must be strictly greater than this threshold
  min_size = (options[:min_size] > 1) ? (options[:min_size]-1) : 0
  trimming_tail_pattern = /B*$/
  count_total = 0
  count_trimmed = 0
  count_removed = 0
  sequences_profile = Hash.new(0)
  head = ""
  seq = ""

  # Block forms close every handle, also when an error is raised.
  File.open(fastq, 'r') do |reads|
    File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_rejected","fastq"), 'w') do |r_rejected|
      File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim", "fastq"), 'w') do |f|
        # IO#lines was removed in Ruby 3.0; each_line is the replacement
        reads.each_line.with_index do |line, line_number|
          case line_number % 4
          when 0
            head = line
            count_total += 1
          when 1 then seq = line
          #2 is the plus sign
          when 3
            # position where the trailing run of Bs starts, which is also
            # the length of the read that is kept
            b_tail_idx = (line =~ trimming_tail_pattern)
            tail = Regexp.last_match
            if b_tail_idx > min_size
              count_trimmed += 1 unless tail[0].empty?
              sequences_profile[b_tail_idx] += 1
              f.puts "#{head}#{seq[0..b_tail_idx-1]}\n+\n#{tail.pre_match}"
            else
              count_removed += 1
              r_rejected.puts "#{head}#{seq}+\n#{line}"
            end
          end #case
        end #read
      end #Write fastq
    end #Write rejected
  end #Read fastq

  #Profile
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_profile", "csv"), 'w') do |f_profile|
    f_profile.puts "Sequnce length,count"
    sequences_profile.sort.each do |read_size, read_number|
      f_profile.puts "#{read_size},#{read_number}"
    end
  end #Write profile

  #Report
  File.open(Bio::Ngs::Utils.tag_filename(output_filename_base, "trim_report", "csv"), 'w') do |report|
    report.puts "Reads processed,Reads trimmed,Reads removed,Reads untouched"
    report.puts "#{count_total},#{count_trimmed},#{count_removed},#{count_total-count_trimmed-count_removed}"
  end #Write report
end #trim_b
312
+ end #Fastq
313
+
314
+ class Humanize < Thor
315
+ require 'json'
316
+
317
+ desc "build_compare_kb GTF", "Build the JSON file with the annoation from the GTF file used to humanize the results"
318
+ #TODO: create a zip file to optimize the space.
319
# Build the knowledge base for +gtf+ by delegating to
# Bio::Ngs::Cufflinks::Compare.build_compare_kb.
#
# gtf:: path of the GTF file
#
# NOTE(review): a disabled inline version of this task indexed gene_id,
# transcript_id, gene_name, oId and nearest_ref of every GTF line and
# dumped the result with Marshal to "<gtf without extension>.kb";
# presumably the library method does the same - confirm before relying on
# that file format.
def build_compare_kb(gtf)
  Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
end
351
+
352
+ desc "isoform_exp GTF ISOFORM", "tag the XLOC gathering information from GTF (ensembl)"
353
+ #TODO: open a zip file,KB to optimez performances
354
# Prefix every row of an isoform expression table with the :nearest_ref
# value found in the knowledge base built from a GTF file.
#
# gtf::     path of the GTF file; its knowledge base is expected at the
#           same path with the extension replaced by ".kb" and is built
#           through the :build_compare_kb task when missing
# isoform:: path of the tab delimited table; the first line is the header,
#           the first column of every other line is the lookup key
#
# The result is written to "<isoform>_rich" with an extra first column
# named "ensembl_transcript_id". Rows whose key is not in the knowledge
# base get an empty value in that column.
#
# Prints a message on STDERR and returns nil when an input file is missing.
def isoform_exp(gtf, isoform)
  [gtf, isoform].each do |path|
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version
    next if File.exist?(path)
    STDERR.puts "File #{path} doesn't exist."
    return nil
  end

  kb_filename = gtf.sub(/\.[a-zA-Z0-9]*$/,".kb")
  invoke :build_compare_kb, [gtf] unless File.exist?(kb_filename)

  # The original stored the KB in gtf_gkb but read gtf_kb (NameError).
  gtf_kb = Bio::Ngs::Cufflinks::Compare.load_compare_kb(kb_filename)

  File.open("#{isoform}_rich", 'w') do |w|
    File.open(isoform,'r') do |f|
      w.write("ensembl_transcript_id\t#{f.readline}") #skip header and write to output files
      f.each_line do |line|
        key = line.split.first
        entry = key && gtf_kb[key.to_sym]
        w.write("#{entry && entry[:nearest_ref]}\t#{line}")
      end #line
    end #file read
  end #file write
end #isoform_exp
385
+
386
+ end #Humanize
387
+
388
+ class De < Thor
389
+
390
+ #./bin/biongs convert:illumina:de:isoform /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff /Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf --min_samples=5 --fold=2 --min_fpkm=0.5 --z_score | sort > /Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp_diff.txt
391
+
392
+
393
+ #Extract data from differential expression made by Cuffdiff.
394
+ #The user can request to export the data in a tabular format with data in fpkm or z-score (computed by row)
395
+ #Is possible to filter the results in different manners:
396
+ #by fold change: log2 (internally Cuffdiff compute a fold change with natural logarithm, this task made an internal conversion)
397
+ #by number of elements for which the fold change is verified among the remaining populations/samples
398
+ #by fpkm: a population/sample is taken into account by further selection steps if its fpkm value is greater than or equal to...
399
+ #the output is written to a tab delimited table, sorted by the first column: sample-discriminator.
400
+ #Output file name isoform_exp-f1_s5_fpkm0.5_z.txt, the parameters are written in the file name, so is possible to keep track of them
401
+ desc "isoform DIFF GTF", "extract the transcripts"
402
+ method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
403
+ method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
404
+ method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
405
+ method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
406
+ method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
407
+ method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
408
# Extract isoforms from a differential expression file by delegating to
# Bio::Ngs::Cufflinks::Diff.isoforms with the filters taken from the task
# options (fold, min_samples, min_fpkm, z_scores).
#
# diff_file:: path of the differential expression file
# gtf::       path of the GTF file
#
# options.up selects the direction passed as :regulated (:up when true,
# :down otherwise).
def isoform(diff_file, gtf)
  direction = if options.up
                :up
              else
                :down
              end
  Bio::Ngs::Cufflinks::Diff.isoforms(
    diff_file,
    gtf,
    fold: options.fold,
    min_samples: options.min_samples,
    min_fpkm: options.min_fpkm,
    z_scores: options.z_scores,
    regulated: direction
  )
end #de_isoform
414
+
415
+ desc "gene DIFF GTF", "extract the transcripts"
416
+ method_option :fold, :type => :numeric, :desc => "DE fold change log2", :default=>0.0
417
+ method_option :only_significative, :type => :boolean, :aliases=>'-s', :default=>false
418
+ method_option :min_samples, :type=>:numeric, :aliases=>"-m", :desc=>"Niminim number of item for the the fold must be verified or significative"
419
+ method_option :min_fpkm, :type => :numeric, :aliases => "-f", :default=> 0.0, :desc => "Store a value if its fpkm is at least"
420
+ method_option :z_scores, :type => :boolean, :aliases => "-z", :default=> false, :desc=> "Return a matrix of Z-scores other than fpkm"
421
+ method_option :up, :type => :boolean, :aliases => '-u', :default => true, :desc => "Up regulated (true), down regulated (false)"
422
# Extract genes from a differential expression file by delegating to
# Bio::Ngs::Cufflinks::Diff.genes with the filters taken from the task
# options (fold, min_samples, min_fpkm, z_scores).
#
# diff_file:: path of the differential expression file
# gtf::       path of the GTF file
#
# options.up selects the direction passed as :regulated (:up when true,
# :down otherwise).
def gene(diff_file, gtf)
  direction = if options.up
                :up
              else
                :down
              end
  Bio::Ngs::Cufflinks::Diff.genes(
    diff_file,
    gtf,
    fold: options.fold,
    min_samples: options.min_samples,
    min_fpkm: options.min_fpkm,
    z_scores: options.z_scores,
    regulated: direction
  )
end #de_gene
428
+
429
+ #convert:illumina:de:rename_qs /Users/bonnalraoul/Desktop/RRep16giugno/DEpopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/gene_exp-f0.5_s5_fpkm0.5_zup.txt\
430
+ # Naive,Th1,Th17,Th2,Treg,Tfh
431
+ desc "rename_qs DIFF_FILE NAMES", 'rename q1,...,qn with names provided by the user(comma separated)'
432
# Replace the labels q1, q2, ..., qn of a file with user supplied names.
#
# diff_file:: path of the input file; the result is written to
#             "<diff_file>_renamed"
# names::     comma separated names; the first one replaces q1, the second
#             q2, and so on
#
# Each line is rewritten in a single pass that matches a whole "q<number>"
# label at a time. Therefore q1 never rewrites the prefix of q10, inserted
# names are never renamed again, and labels without a supplied name (or q0)
# are left unchanged instead of raising an error.
def rename_qs(diff_file, names)
  names_list = names.split(',')
  rename = lambda do |text|
    text.gsub(/q(\d+)/) do |label|
      position = Regexp.last_match(1).to_i - 1
      (position >= 0 && names_list[position]) || label
    end
  end

  File.open(diff_file+"_renamed",'w') do |w|
    File.open(diff_file, 'r') do |f|
      w.puts rename.call(f.readline) # header
      f.each_line { |line| w.puts rename.call(line) }
    end # open-read
  end #open-write
end
448
+ end #De
449
+
450
+ end #Illumina
451
+
452
+
453
+ end #Convert
454
+ # Add methods to Enumerable, which makes them available to Array