bio-ngs 0.4.6.alpha.01 → 0.4.6.alpha.02

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/Gemfile +4 -2
  2. data/Gemfile.lock +21 -21
  3. data/README.rdoc +51 -4
  4. data/VERSION +1 -1
  5. data/bin/biongs +1 -0
  6. data/bio-ngs.gemspec +36 -8
  7. data/features/cufflinks_gtf_parser.feature +22 -0
  8. data/features/cufflinks_gtf_parser_indexing.feature +20 -0
  9. data/features/step_definitions/cufflinks_gtf.rb +30 -0
  10. data/features/step_definitions/cufflinks_gtf_parser_indexing.rb +53 -0
  11. data/features/support/env.rb +2 -0
  12. data/lib/bio-ngs.rb +19 -5
  13. data/lib/bio/appl/ngs/cufflinks.rb +447 -281
  14. data/lib/bio/appl/ngs/cufflinks/gtf/gtf.rb +23 -0
  15. data/lib/bio/appl/ngs/cufflinks/gtf/gtf_parser.rb +248 -0
  16. data/lib/bio/appl/ngs/cufflinks/gtf/transcript.rb +154 -0
  17. data/lib/bio/ngs/fs.rb +46 -0
  18. data/lib/bio/ngs/illumina/fastq.rb +176 -0
  19. data/lib/bio/ngs/illumina/illumina.rb +64 -0
  20. data/lib/bio/ngs/illumina/project.rb +81 -0
  21. data/lib/bio/ngs/illumina/sample.rb +85 -0
  22. data/lib/bio/ngs/task.rb +1 -1
  23. data/lib/bio/ngs/utils.rb +124 -112
  24. data/lib/meta.rb +162 -0
  25. data/lib/tasks/convert.thor +14 -14
  26. data/lib/tasks/filter.thor +158 -23
  27. data/lib/tasks/quality.thor +24 -4
  28. data/lib/tasks/rna.thor +26 -0
  29. data/lib/wrapper.rb +28 -0
  30. data/spec/bio/ngs/fs_spec.rb +70 -0
  31. data/spec/bio/ngs/illumina/fastq_spec.rb +52 -0
  32. data/spec/bio/ngs/illumina/illumina_spec.rb +21 -0
  33. data/spec/bio/ngs/illumina/project_spec.rb +0 -0
  34. data/spec/bio/ngs/illumina/sample_spec.rb +0 -0
  35. data/spec/bio/ngs/illumina/samples_spec.rb +0 -0
  36. data/spec/filter_spec.rb +25 -0
  37. data/spec/fixture/table_filter_list.txt +3 -0
  38. data/spec/fixture/table_filter_list_first_column.txt +2 -0
  39. data/spec/fixture/table_filter_source.tsv +44 -0
  40. data/spec/fixture/test-filtered-reference.fastq.gz +0 -0
  41. data/spec/fixture/test-merged-reference.fastq.gz +0 -0
  42. data/spec/fixture/test.fastq.gz +0 -0
  43. data/spec/meta_spec.rb +117 -0
  44. data/spec/spec_helper.rb +1 -1
  45. metadata +97 -69
@@ -0,0 +1,162 @@
require 'json'
require 'securerandom'

# Small metadata containers: Meta::Data is a named bag of tags, Meta::Pool is
# a named collection of Data-like elements that can be queried by name, tag
# or value.
module Meta

  # A named set of metadata tags. The name itself is stored inside the
  # metadata hash under the :name key.
  class Data
    attr_accessor :metadata

    # name     - identifier stored under metadata[:name].
    # metadata - optional Hash of tag => value pairs merged on top of the
    #            name; a :name key in this hash overrides +name+.
    def initialize(name, metadata={})
      @metadata = { :name => name }
      @metadata.merge! metadata
    end

    # Returns the value stored under the :name tag.
    def name
      @metadata[:name]
    end

    # Stores +val+ under the :name tag.
    def name=(val)
      @metadata[:name] = val
    end

    # Two elements are equal when both name and metadata match. Objects that
    # do not expose name/metadata (nil, strings, ...) compare as not equal
    # instead of raising NoMethodError.
    def ==(other)
      other.respond_to?(:name) && other.respond_to?(:metadata) &&
        name == other.name && metadata == other.metadata
    end

    # True when +tag+ is a key of the metadata hash.
    def has_tag?(tag)
      metadata.key? tag
    end

    # True when at least one metadata value equals +val+.
    def has_value?(val)
      metadata.value? val
    end

    # Returns the value stored under +tag+, or nil when the tag is missing.
    def [](tag)
      metadata[tag]
    end

    # Serializes the element as a JSON object with the keys "json_class",
    # "name" and "metadata".
    def to_json(*a)
      {
        "json_class" => self.class.name,
        "name" => name,
        "metadata" => metadata
      }.to_json(*a)
    end

    # Sketch of a planned File class, kept from the original source:
    #   class File
    #     include Data
    #TODO: make this class generic and available to other classes
    #TODO: include or subclass original class File, I need to borrow most of its methods. File.exists? File.open File.read
    #TODO: configure a generic classifier to add any kind of tag passing a block do/yield?
  end #Data

  #TODO: this class could be generalized
  # A named collection of elements indexed by element name. A Pool is itself
  # a Data, so pools can be nested inside other pools.
  class Pool < Data
    include Enumerable
    attr_accessor :pool

    # name - pool identifier; a random UUID when omitted.
    def initialize(name=SecureRandom.uuid)
      super(name)
      @pool = {}
    end

    # Yields every member of the pool. Without a block an Enumerator is
    # returned, following the usual Enumerable convention.
    def each(&block)
      return enum_for(:each) unless block_given?
      @pool.each_value(&block)
    end

    # TODO implement <=>

    # Adds +element+ to the pool, keyed by its name. nil is ignored. When an
    # element with the same name is already present, the metadata of the new
    # element is merged into the stored one.
    def add(element)
      return if element.nil?
      if @pool.key? element.name #TODO I don't know if this is correct.
        @pool[element.name].metadata.merge! element.metadata
      else
        @pool[element.name] = element
      end
    end
    alias :<< :add

    # True when the pool has no members.
    def empty?
      @pool.empty?
    end

    # Names of all members.
    def names
      @pool.keys
    end

    # Looks +name_or_tag_or_value+ up, in order, as a member name, as a tag,
    # as a value and finally inside nested pools. The first lookup that
    # finds something wins; nil is returned when nothing matches.
    def get(name_or_tag_or_value=nil)
      # TODO implement recursive query or passing multiple values as a hash, intersection, etc.
      get_by_name(name_or_tag_or_value) ||
        get_by_tag(name_or_tag_or_value) ||
        get_by_value(name_or_tag_or_value) ||
        get_down_to_childer(name_or_tag_or_value)
    end #get

    # Returns the member called +name+, or nil.
    def get_by_name(name)
      @pool[name]
    end #get_by_name

    # Returns a new Pool with the members carrying +tag+, or nil when none do.
    def get_by_tag(tag)
      get_generic :tag, tag
    end #get_by_tag

    # Returns a new Pool with the members holding +val+, or nil when none do.
    def get_by_value(val)
      get_generic :value, val
    end #get_by_value

    # Returns a new Pool with the members whose +tag+ equals +val+. A member
    # that does not match but is itself a pool is searched recursively and
    # its result (a Pool) is added once. Returns nil when nothing matches.
    def get_by_tag_and_value(tag, val)
      ret_pool = Pool.new
      @pool.each_value do |member|
        if member.has_tag?(tag) && member[tag] == val
          ret_pool.add member
        elsif member.respond_to?(:get_by_tag_and_value) && member.respond_to?(:pool)
          # Recurse only into this member; rescanning the whole pool here
          # would add the same nested results once per non-matching member.
          ret_pool.add member.get_by_tag_and_value(tag, val)
        end
      end
      ret_pool unless ret_pool.empty?
    end #get_by_tag_and_value

    # Runs #get with +x+ on every nested pool and collects the results in a
    # new Pool. Returns nil when no nested pool finds anything.
    def get_down_to_childer(x)
      ret_pool = Pool.new
      @pool.each_value do |element|
        ret_pool.add element.get(x) if element.respond_to?(:get) && element.respond_to?(:pool)
      end
      ret_pool unless ret_pool.empty?
    end

    private

    # Shared implementation of get_by_tag / get_by_value.
    # type - :tag or :value (anything else raises ArgumentError).
    # data - the tag or value to look for.
    # Returns a new Pool with the matching members, or nil when none match.
    def get_generic(type, data)
      type = type.to_sym
      unless [:tag, :value].include? type
        raise ArgumentError, "#{type} is not a valid parameter, use only tag or value"
      end
      ret_pool = Pool.new
      # type is restricted to the two symbols above, so only has_tag? or
      # has_value? can be dispatched here.
      @pool.each_value do |member|
        ret_pool.add(member) if member.send("has_#{type}?", data)
      end
      ret_pool unless ret_pool.empty?
    end #get_generic

  end #Pool
end #Meta
@@ -451,20 +451,20 @@ module Convert
451
451
  end #Illumina
452
452
 
453
453
 
454
- desc "list2table list", "reorganize a list of pairs key value in a table of key values. Tabular is the default separator"
455
- def list2table(list)
456
- dict = Hash.new{|h,k| h[k]=[]}
457
- File.open(ARGV[0],'r') do |f|
458
- f.each_line do |l|
459
- key, value = l.split
460
- dict[key]<<value
461
- end
462
- end
463
-
464
- dict.each_pair do |key, values|
465
- puts "#{key} #{values.join(' ')}"
466
- end
467
- end
454
+ # desc "list2table LIST", "reorganize a list of pairs key value in a table of key values. Tabular is the default separator"
455
+ # def list2table(list)
456
+ # dict = Hash.new{|h,k| h[k]=[]}
457
+ # File.open(ARGV[0],'r') do |f|
458
+ # f.each_line do |l|
459
+ # key, value = l.split
460
+ # dict[key]<<value
461
+ # end
462
+ # end
463
+
464
+ # dict.each_pair do |key, values|
465
+ # puts "#{key} #{values.join(' ')}"
466
+ # end
467
+ # end
468
468
 
469
469
  end #Convert
470
470
  # Add methods to Enumerable, which makes them available to Array
@@ -1,19 +1,106 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../bio/ngs/utils')
2
+ require File.expand_path(File.dirname(__FILE__) + '/../wrapper')
3
+ require File.expand_path(File.dirname(__FILE__) + '/../bio/appl/ngs/cufflinks')
1
4
  class Filter < Thor
2
5
 
# Thor commands that filter and export transcripts from a Cufflinks GTF file.
class Cufflinks < Thor
  #TODO method_option :ucsc, :type => :boolean, :aliases => '-u', :desc => "use chr as UCSC a prefix for chromosomes, otherwise uses ENSEMBL notation without chr"

  desc "transcripts [GTF]", "Extract transcripts from Cufflinks' GTF"
  method_option :brand_new, :type => :boolean, :aliases => '-b', :desc => "get only brand new transcripts, no overlap with any annotation feature"
  method_option :new, :type => :boolean, :aliases => '-n', :desc => "get only new transcripts, overlapping annotations are accepted"
  method_option :annotated, :type => :boolean, :aliases => '-a', :desc => "get only annotated transcripts"
  method_option :mono_exon, :type => :boolean, :aliases => '-s', :desc => "get mono exon transcripts"
  method_option :multi_exons, :type => :boolean, :aliases => '-m', :desc => "get multi exons transcripts"
  method_option :length, :type => :numeric, :aliases => '-l', :desc => "transcripts with a length gt"
  method_option :coverage, :type => :numeric, :aliases => '-c', :desc => "transcripts with a coverage gt"
  method_option :bed, :type => :boolean, :aliases => '-t', :desc => "output data in bed format"
  method_option :count, :type => :boolean, :aliases => '-x', :desc => "counts the selected transcripts"
  method_option :discover, :type => :boolean, :aliases => '-d', :desc => "discovers transcripts.gtf files from within the current directory"
  method_option :split, :type => :boolean, :aliases => '-j', :desc => "split each transcript in a file"
  method_option :output, :type => :string, :aliases => '-o', :desc => "save the results in the output file"
  # Applies the selected filters to +gtf+ and writes the surviving
  # transcripts as text, as BED, as one BED file per transcript, or as a
  # count.
  #
  # gtf - path to a GTF file. When omitted and --discover is set, every
  #       transcripts.gtf found below the current directory is processed.
  #
  # Raises ArgumentError when +gtf+ is missing or does not exist.
  def transcripts(gtf=nil)
    if gtf.nil? && options[:discover]
      # Each discovered file is handled by the explicit-path branch below,
      # so the discover branch is never re-entered by the recursion.
      Dir.glob("**/transcripts.gtf").each do |gtf_file|
        transcripts(gtf_file)
      end
    elsif !gtf.nil? && File.exist?(gtf)
      data = Bio::Ngs::Cufflinks::Gtf.new gtf
      data.set_lazy
      data.brand_new_isoforms if options[:brand_new]
      data.new_isoforms if options[:new]
      data.annotated_isoforms if options[:annotated]
      # The option is declared as :mono_exon (singular), so that is the key
      # that must be read here.
      data.mono_exons if options[:mono_exon]
      data.multi_exons if options[:multi_exons]
      data.length_gt(options[:length]) if options[:length]
      data.coverage_gt(options[:coverage]) if options[:coverage]

      out = (options[:output] && File.open(options[:output], 'w')) || $stdout
      begin
        if options[:bed] && options[:split]
          # One file per transcript, named after its transcript_id.
          data.to_bed do |t, bed_exons|
            File.open(t.attributes[:transcript_id], 'w') do |w|
              w.puts bed_exons
            end
          end
        elsif options[:bed]
          data.to_bed do |t, bed_exons|
            out.puts bed_exons
          end
        elsif options[:count]
          out.puts "#{gtf}:\t#{data.count}"
        elsif options[:output]
          data.save(options[:output])
        else
          data.each_transcript do |t|
            out.puts t
          end
        end
      ensure
        # Close the handle opened above; never close the shared $stdout.
        out.close unless out.equal?($stdout)
      end
    else
      raise ArgumentError, "file #{gtf} doesn't exist"
    end
  end

  desc "tra_at_idx GTF IDX", "Extract transcripts from Cufflinks' GTF at specific location, print filename in output"
  method_option :split, :type => :boolean, :aliases => '-j', :desc => "split each transcript in a file"
  method_option :extract, :type => :numeric, :aliases => '-e', :desc => "extract the n-th transcript"
  method_option :ucsc, :type => :boolean, :aliases => '-u', :desc => "use chr as UCSC a prefix for chromosomes, otherwise uses ENSEMBL notation without chr"
  method_option :exons, :type => :boolean, :aliases => '-x', :desc => "proved in output only exons without transcripts", :default => true
  # Writes the transcript found at position +idx+ of +gtf+ to a BED file
  # named "<gene_id>-<transcript_id>.bed" and prints that file name.
  #
  # gtf - path to the GTF file.
  # idx - position of the transcript; converted with to_i.
  def tra_at_idx(gtf, idx)
    data = Bio::Ngs::Cufflinks::Gtf.new gtf
    t = data[idx.to_i]
    t.set_ucsc_notation if options[:ucsc]
    fn = "#{t.attributes[:gene_id]}-#{t.attributes[:transcript_id]}.bed"
    File.open(fn, 'w') do |f|
      f.puts t.to_bed(options[:exons]) #by default only the exons
    end
    puts fn
  end

end #Cufflinks
86
+
87
+
3
88
  # Assume that this is a plain list of elements, with just one column. In the future it could be
4
89
  # a table as well.
5
90
  desc "by_list TABLE LIST", "Extract from TABLE the row with a key in LIST"
6
- method_option :exclude, :type => :boolean, :aliases => '-e', :desc => "return the elements in TABLE which are not listed in LIST"
7
- method_option :tablekey, :type => :numeric, :aliases => '-k', :desc =>"which field is the key to consider, start from 0"
8
- method_option :listkey, :type => :numeric, :aliases => '-l', :desc =>"which field is the key to consider, start from 0"
9
- method_option :delimiter, :type => :string, :default => " ", :aliases => '-d'
10
- method_option :skip_table_header, :type => :boolean, :default => true, :aliases => '-h', :desc => 'Skip first line, usually the header'
11
- method_option :skip_list_header, :type => :boolean, :default => true, :aliases => '-j', :desc => 'Skip first line, usually the header'
12
- method_option :skip_table_lines, :type => :numeric, :aliases => '-n', :desc => 'Skip Ns line before start'
13
- method_option :skip_list_lines, :type => :numeric, :aliases => '-m', :desc => 'Skip Ns line before start'
14
- method_option :output, :type => :string, :aliases => '-o', :desc => 'Output results to file'
91
+ method_option :exclude, :type => :boolean, :aliases => '-e', :desc => "return the elements in TABLE which are not listed in LIST"
92
+ method_option :tablekey, :type => :numeric, :aliases => '-k', :desc =>"which field is the key to consider, start from 0"
93
+ method_option :listkey, :type => :numeric, :aliases => '-l', :desc =>"which field is the key to consider, start from 0"
94
+ method_option :delimiter, :type => :string, :default => " ", :aliases => '-d'
95
+ method_option :skip_table_header, :type => :boolean, :default => true, :aliases => '-h', :desc => 'Skip first line, usually the header'
96
+ method_option :skip_list_header, :type => :boolean, :default => true, :aliases => '-j', :desc => 'Skip first line, usually the header'
97
+ method_option :skip_table_lines, :type => :numeric, :aliases => '-n', :desc => 'Skip Ns line before start'
98
+ method_option :skip_list_lines, :type => :numeric, :aliases => '-m', :desc => 'Skip Ns line before start'
99
+ method_option :output, :type => :string, :aliases => '-o', :desc => 'Output results to file'
15
100
  method_option :keep_skipped_lines, :type => :boolean, :default => false, :aliases => '-g', :desc => 'Write on output skipped lines from the TABLE file, header and number of lines skipped using option skip_table_line'
16
- method_option :zero_index_system, :type => :boolean, :default => true, :aliases => '-s', :desc => 'Starts Index from ZERO ? Otherwise starts from ONE'
101
+ method_option :zero_index_system, :type => :boolean, :default => true, :aliases => '-s', :desc => 'Starts Index from ZERO ? Otherwise starts from ONE'
102
+ method_option :fuse, :type => :boolean, :default => false, :aliases => '-f', :desc => 'JOIN two input file using a specific key'
103
+ method_option :in_column_delimiter, :type => :string, :aliases => '-i', :desc => 'Define a delimiter for table key, if setted we assume to split the key columns by this separator'
17
104
  def by_list(table, list)
18
105
  unless File.exists?(table)
19
106
  STDERR.puts "by_list: #{table} does not exist."
@@ -25,7 +112,9 @@ class Filter < Thor
25
112
  end
26
113
  table_key_idx = options[:tablekey] || 0 # by default the first element of the table.
27
114
  list_key_idx = options[:listkey] || 0
28
- #increment indexes in case the use wants to start from 1 and not from 0
115
+ fuse = options[:fuse] || false
116
+ #increment indexes in case user wants to start from 1 and not from 0
117
+ #TODO: fix not increment but decrement, user will pass a +1 value
29
118
  unless options[:zero_index_system]
30
119
  table_key_idx+=1
31
120
  list_key_idx+=1
@@ -38,17 +127,31 @@ class Filter < Thor
38
127
  if (nlines = options[:skip_list_lines])
39
128
  nlines.times.each{|i| flist.readline}
40
129
  end
41
- flist.readline unless options[:skip_list_header]
130
+ flist.readline if options[:skip_list_header]
42
131
  list_dictionary = Hash.new {|hash,key| hash[key] = :fool}
43
132
 
44
- flist.each_line do |line|
45
- #split row
46
- #store the list key
47
- #populate an hash wich keys
48
- list_dictionary[line.split(delimiter)[list_key_idx]]
133
+ #TODO: refactor, find a smarter way to distinguish between fuse or not
134
+ if fuse
135
+ flist.each_line do |line|
136
+ #split row
137
+ #store the list key
138
+ #populate an hash wich keys
139
+ list_line = line.split(delimiter)
140
+ #save the line but remove the key
141
+ list_key = list_line[list_key_idx]
142
+ list_line.delete_at(list_key_idx)
143
+ list_dictionary[list_key]=list_line
144
+ end
145
+ else
146
+ flist.each_line do |line|
147
+ #split row
148
+ #store the list key
149
+ #populate an hash wich keys
150
+ list_dictionary[line.split(delimiter)[list_key_idx]]=:fool
151
+ end
49
152
  end
50
153
  flist.close
51
-
154
+
52
155
  ftable = File.open(table, 'r')
53
156
  #skip header/lines if required
54
157
  #keep skipped line in case it's a proprietary format
@@ -56,20 +159,52 @@ class Filter < Thor
56
159
  if (nlines = options[:skip_table_lines])
57
160
  nlines.times.each{|i| skipped_lines << ftable.readline}
58
161
  end
59
- skipped_lines << ftable.readline unless options[:skip_table_header]
162
+
163
+ skipped_lines << ftable.readline if options[:skip_table_header]
164
+
60
165
  #list_dictionary = Hash.new {|hash,key| hash[key] = :fool}
61
166
 
62
- fout = (output_name=options[:output]).nil? ? STDOUT : File.open(output_name,'w')
167
+ fout = (output_name=options[:output]).nil? ? $stdout : File.open(output_name,'w')
63
168
  fout.puts skipped_lines if keep_skipped_lines
169
+
170
+ fuse_lambda = if fuse
171
+ lambda {|table_line, list_dict, key| "#{table_line.chomp}#{delimiter}#{list_dict[key].join(delimiter)}" }
172
+ #don't know if need to chomp
173
+ else
174
+ lambda {|table_line, list_dict, key| table_line}
175
+ end
64
176
  ftable.each_line do |line|
65
177
  #search for a key in the dictionary/list
66
- if list_dictionary.key?(line.split(delimiter)[table_key_idx]) || options[:exclude]
178
+ #if list_dictionary.key?(line.split(delimiter)[table_key_idx]) || options[:exclude]
179
+ if find_key_in_dictionary(line.split(delimiter)[table_key_idx], list_dictionary, options[:in_column_delimiter]) || options[:exclude]
67
180
  fout.puts line
68
181
  end
69
182
  end
70
183
  ftable.close
71
- fout.close
184
+ fout.close unless options[:output].nil?
185
+ end
186
+
187
+
188
+
189
+ private
190
+
# Tells whether +key+ is present in +dict+.
#
# key       - the key read from the table row; may be nil when the row has
#             no column at the requested index.
# dict      - Hash whose keys are the accepted identifiers.
# split_key - optional separator; when given, +key+ is split on it and a
#             match on any of the resulting pieces is enough.
#
# Returns true or false.
def find_key_in_dictionary(key, dict, split_key=nil)
  # A nil key cannot be split, so it is looked up as is instead of raising
  # NoMethodError.
  return dict.key?(key) if split_key.nil? || key.nil?
  key.split(split_key).any? { |part| dict.key?(part) }
end
73
206
 
74
207
 
75
- end
208
+
209
+
210
+ end
@@ -45,7 +45,6 @@ class Quality < Thor
45
45
  desc "fastq_stats FASTQ", "Reports quality of FASTQ file"
46
46
  method_option :output, :type=>:string, :aliases =>"-o", :desc => "Output file name. default is input file_name with .txt."
47
47
  def fastq_stats(fastq)
48
-
49
48
  output_file = options.output || "#{fastq.gsub(/\.fastq\.gz/,'')}_stats.txt"
50
49
  stats = Bio::Ngs::Fastx::FastqStats.new
51
50
  if fastq=~/\.gz/
@@ -60,12 +59,33 @@ class Quality < Thor
60
59
  [:reads_coverage,[output_file]],
61
60
  [:nucleotide_distribution,[output_file]]]
62
61
  Parallel.map(go_in_parallel, in_processes:go_in_parallel.size) do |graph|
63
- invoke graph.first, graph.last
62
+ send graph.first, graph.last
64
63
  end
65
- #invoke :boxplot, [output_file]
66
- #invoke :reads_coverage, [output_file]
67
64
  end
68
65
 
66
+ desc "illumina_projects_stats", "Reports quality of FASTQ files in an Illumina project directory"
67
+ method_option :cpus, :type=>:numeric, :default=>4, :aliases=>'-c', :desc=>'Number of processes to use.'
# Runs fastq_stats on every left/right reads file of the projects built from
# +directory+, using options[:cpus] parallel processes. Prints a message on
# STDERR when +directory+ is not a directory or is not recognised as an
# Illumina project directory.
def illumina_projects_stats(directory=".")
  unless File.directory?(directory) && Bio::Ngs::Illumina.project_directory?(directory)
    STDERR.puts "illumina_projects_stats: Not an Illumina directory"
    return
  end

  fastq_paths = []
  Bio::Ngs::Illumina.build(directory).each do |_project_name, project|
    project.samples_path.each do |reads|
      # reads is a hash keyed by :left and/or :right; single-end reads are
      # not handled yet.
      #TODO: refactor these calls
      [:left, :right].each do |side|
        fastq_paths << File.join(directory, reads[side]) if reads.key?(side)
      end
    end
  end

  Parallel.map(fastq_paths, in_processes:options[:cpus]) do |fastq|
    fastq_stats fastq
  end
end
88
+
69
89
  desc "boxplot FASTQ_QUALITY_STATS", "plot reads quality as boxplot"
70
90
  method_option :title, :type=>:string, :aliases =>"-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
71
91
  method_option :output, :type=>:string, :aliases =>"-o", :desc => "Output file name. default is input file_name with .txt."