bio-ngs 0.4.6.alpha.01 → 0.4.6.alpha.02

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/Gemfile +4 -2
  2. data/Gemfile.lock +21 -21
  3. data/README.rdoc +51 -4
  4. data/VERSION +1 -1
  5. data/bin/biongs +1 -0
  6. data/bio-ngs.gemspec +36 -8
  7. data/features/cufflinks_gtf_parser.feature +22 -0
  8. data/features/cufflinks_gtf_parser_indexing.feature +20 -0
  9. data/features/step_definitions/cufflinks_gtf.rb +30 -0
  10. data/features/step_definitions/cufflinks_gtf_parser_indexing.rb +53 -0
  11. data/features/support/env.rb +2 -0
  12. data/lib/bio-ngs.rb +19 -5
  13. data/lib/bio/appl/ngs/cufflinks.rb +447 -281
  14. data/lib/bio/appl/ngs/cufflinks/gtf/gtf.rb +23 -0
  15. data/lib/bio/appl/ngs/cufflinks/gtf/gtf_parser.rb +248 -0
  16. data/lib/bio/appl/ngs/cufflinks/gtf/transcript.rb +154 -0
  17. data/lib/bio/ngs/fs.rb +46 -0
  18. data/lib/bio/ngs/illumina/fastq.rb +176 -0
  19. data/lib/bio/ngs/illumina/illumina.rb +64 -0
  20. data/lib/bio/ngs/illumina/project.rb +81 -0
  21. data/lib/bio/ngs/illumina/sample.rb +85 -0
  22. data/lib/bio/ngs/task.rb +1 -1
  23. data/lib/bio/ngs/utils.rb +124 -112
  24. data/lib/meta.rb +162 -0
  25. data/lib/tasks/convert.thor +14 -14
  26. data/lib/tasks/filter.thor +158 -23
  27. data/lib/tasks/quality.thor +24 -4
  28. data/lib/tasks/rna.thor +26 -0
  29. data/lib/wrapper.rb +28 -0
  30. data/spec/bio/ngs/fs_spec.rb +70 -0
  31. data/spec/bio/ngs/illumina/fastq_spec.rb +52 -0
  32. data/spec/bio/ngs/illumina/illumina_spec.rb +21 -0
  33. data/spec/bio/ngs/illumina/project_spec.rb +0 -0
  34. data/spec/bio/ngs/illumina/sample_spec.rb +0 -0
  35. data/spec/bio/ngs/illumina/samples_spec.rb +0 -0
  36. data/spec/filter_spec.rb +25 -0
  37. data/spec/fixture/table_filter_list.txt +3 -0
  38. data/spec/fixture/table_filter_list_first_column.txt +2 -0
  39. data/spec/fixture/table_filter_source.tsv +44 -0
  40. data/spec/fixture/test-filtered-reference.fastq.gz +0 -0
  41. data/spec/fixture/test-merged-reference.fastq.gz +0 -0
  42. data/spec/fixture/test.fastq.gz +0 -0
  43. data/spec/meta_spec.rb +117 -0
  44. data/spec/spec_helper.rb +1 -1
  45. metadata +97 -69
@@ -0,0 +1,23 @@
1
module Bio
  module Ngs
    module Cufflinks
      # Thin wrapper around an open GTF file. All parsing, filtering and
      # indexing behaviour is mixed in from GtfParser, which reads the
      # underlying handle through @fh / #source.
      class Gtf
        # include MarkCall
        include GtfParser

        # Opens +file+ (resolved to an absolute path) with the default
        # read mode and keeps the handle for later use. The handle is not
        # closed by this class.
        def initialize(file)
          absolute = File.absolute_path(file)
          @fh = File.open(absolute)
        end

        # Returns the IO object currently backing this instance.
        def source
          @fh
        end

        # Replaces the backing IO object with +src+.
        def source=(src)
          @fh = src
        end
      end # Gtf
    end
  end
end
@@ -0,0 +1,248 @@
1
+ # TODO:
2
+ # * when select or first each transcript create an index. Be aware to return/create the right index for the requested filtering.
3
+ # issue: filtering is applied but the index is created and saved for the original source file.
4
+
5
+
6
module Bio
  module Ngs
    module Cufflinks
      # TODO use a specific class for each block (transcript)
      #
      # Parsing, filtering and indexing of a GTF file made of "transcript"
      # records, each followed by its "exon" records.
      #
      # The including class must keep the open file in @fh and expose it
      # through #source (Gtf does both).
      module GtfParser
        attr_accessor :lazy
        require 'tempfile'

        # Iterates over the transcripts, calling the block with
        # (transcript, lineno).
        #
        # * transcript is ONE Transcript instance that is cleared and refilled
        #   for every record; copy what you need before the next iteration.
        # * lineno is the reader's line counter once the record is complete:
        #   the line of the next transcript record, or one past the last line
        #   for the final transcript.
        #
        # Lines that are neither transcript nor exon records are skipped, as
        # are lines found before the first transcript record.
        #
        # When filters were queued in lazy mode (see #select) they are all
        # applied first and then discarded; iteration runs on the filtered
        # result and yields nothing if no transcript passes.
        def each_transcript(&block)
          if @blocks.nil? || @blocks.empty?
            transcript = Transcript.new
            pending = false
            @fh.rewind
            @fh.each_line do |line|
              if line =~ /\ttranscript\t/
                # A new record starts: hand over the one collected so far.
                block.call(transcript, @fh.lineno) if pending
                transcript.clear
                transcript.tra = line
                pending = true
              elsif pending && line =~ /\texon\t/
                transcript.exons << line
              end
            end
            # The last record has no following transcript line to trigger it.
            block.call(transcript, @fh.lineno + 1) if pending
          else # lazy
            filters = @blocks
            @blocks = []
            not_lazy
            begin
              filtered = select do |transcript|
                filters.all? { |filter| filter.call(transcript) }
              end
            ensure
              # Restore lazy mode even if a filter raised.
              set_lazy
            end
            # select returns nil when nothing matched.
            filtered.each_transcript(&block) if filtered
          end # lazy or not?
        end

        # Filters transcripts with the given block.
        #
        # In lazy mode the block is only queued and self is returned, so calls
        # can be chained; the queue runs on the next #each_transcript.
        #
        # Otherwise the matching transcripts are written to a temporary file
        # and a new Gtf reading that file is returned, or nil when no
        # transcript matched.
        def select(&block)
          if is_lazy?
            @blocks ||= []
            @blocks << block
            self
          else
            # Find out how to concatenate multiple selections
            file = Tempfile.new("transcripts")
            each_transcript do |transcript|
              file.write(transcript.to_s) if block.call(transcript)
            end
            file.flush
            if file.size == 0
              file.close!
              nil
            else
              gtf = Gtf.new(file.path)
              # Keep the Tempfile referenced by the result: once it is garbage
              # collected the file is unlinked and path based operations on
              # the new Gtf (save, dump_idx, load_idx) would point nowhere.
              gtf.instance_variable_set(:@tempfile, file)
              gtf
            end
          end
        end #select

        # Transcripts with more than one exon, longer than +length+ and with
        # a coverage above +coverage+. (The method name keeps its historical
        # spelling; a correctly spelled alias is defined below.)
        def multi_exon_with_lengh_and_coverage(length, coverage)
          select do |transcript|
            transcript.multi_exons? && (transcript.size > length) && (transcript.attributes[:cov] > coverage)
          end
        end
        alias :multi_exon_with_length_and_coverage :multi_exon_with_lengh_and_coverage

        # Transcripts for which Transcript#multi_exons? is true.
        def multi_exons
          select { |transcript| transcript.multi_exons? }
        end

        # Transcripts for which Transcript#mono_exon? is true.
        def mono_exon
          select { |transcript| transcript.mono_exon? }
        end

        # Transcripts whose size is greater than +length+.
        def length_gt(length)
          select { |transcript| transcript.size > length }
        end

        # Transcripts for which Transcript#brand_new_isoform? is true.
        def brand_new_isoforms
          select { |transcript| transcript.brand_new_isoform? }
        end

        # Transcripts for which Transcript#new_isoform? is true.
        def new_isoforms
          select { |transcript| transcript.new_isoform? }
        end

        # Transcripts for which Transcript#annotated_isoform? is true.
        def annotated_isoforms
          select { |transcript| transcript.annotated_isoform? }
        end

        # Transcripts whose :cov attribute is greater than +size+.
        def coverage_gt(size)
          select { |transcript| transcript.attributes[:cov] > size }
        end

        # Runs GffRead on <path>/transcripts.gtf, asking for
        # <path>/transcripts.gff3 as output.
        #
        # Raises ArgumentError when the GTF file is missing.
        def to_gff3(path=".")
          gtf_file = File.join(path, "transcripts.gtf")
          unless File.exist?(gtf_file)
            raise ArgumentError, "transcripts.gtf doesn't exists in #{path}"
          end
          gffread = GffRead.new
          gffread.params = {output: File.join(path, "transcripts.gff3")}
          gffread.run :arguments=>[gtf_file], :separator=>''
        end #to_gff3

        # Calls the block with (transcript, bed_text) for every transcript,
        # bed_text being Transcript#to_bed(only_exons).
        def to_bed(only_exons=true, &block)
          each_transcript do |t|
            block.call(t, t.to_bed(only_exons))
          end
        end #to_bed

        # Switches to lazy mode: #select queues filters instead of running them.
        def set_lazy
          @lazy=true
        end

        # Truthy while lazy mode is on.
        def is_lazy?
          @lazy
        end

        # Switches lazy mode off: #select runs immediately.
        def not_lazy
          @lazy = false
        end

        # Writes every transcript (after pending lazy filters) to +filename+,
        # defaulting to the path of the current source plus ".gtf".
        def save(filename=nil)
          fn = filename || "#{@fh.path}.gtf"
          File.open(fn, 'w') do |f|
            each_transcript do |transcript|
              f.write transcript
            end
          end
          # dump_idx("#{fn}.idx") #BUGGY this saves the old index in case the user called a select
        end #save

        # Number of transcripts yielded by #each_transcript.
        def count
          size = 0
          each_transcript do
            size+=1
          end
          size
        end #count

        # Builds the in-memory index: idx[:transcripts] holds, in file order,
        # the byte length of each transcript record (transcript line plus its
        # exon lines). idx[:exons] is created but left empty.
        def build_idx
          idx = Hash.new {|h,k| h[k]=[]}
          idx[:transcripts]
          idx[:exons]
          each_transcript do |t, _f_lno|
            idx[:transcripts] << t.byte_length
          end
          @idx = idx
        end #build_idx

        # Serialises the index with Marshal to +fn+ (default: source path plus
        # ".idx"), building it first if needed. Returns the file name.
        def dump_idx(fn=nil)
          fn||="#{source.path}.idx"

          build_idx unless defined?(@idx) && @idx
          # Marshal refuses hashes carrying a default proc, so dump a plain copy
          # and leave @idx itself untouched.
          plain = {}.merge(@idx)
          File.open(fn, "wb") do |f|
            Marshal.dump(plain, f)
          end
          fn
        end #dump_idx

        # Loads the index stored next to the source file, or builds and stores
        # it when no ".idx" file exists. Returns the index.
        #
        # NOTE(review): the index file is read with Marshal.load, which must
        # only be used on trusted data; no check is made that an existing
        # index still matches the source file.
        def load_idx
          idx_file = "#{source.path}.idx"
          if File.exist?(idx_file)
            @idx = File.open(idx_file, "rb") { |f| Marshal.load(f) }
          else
            build_idx
            dump_idx
          end
          @idx
        end # load_idx

        # The current index, or nil when none was built or loaded yet.
        def index
          @idx
        end

        # Returns the raw text of the n-th transcript record (counting from 1)
        # by seeking in the source with the indexed byte lengths. Returns nil
        # when +n+ is outside 1..number of indexed transcripts.
        def read_transcript(n=1)
          load_idx unless defined?(@idx) && @idx
          lengths = @idx[:transcripts]
          return nil if n < 1 || n > lengths.size
          # The record starts after all the records that precede it.
          source.seek(lengths.first(n-1).inject(0, :+))
          source.read(lengths[n-1])
        end

        # Returns the n-th transcript (counting from 1) as a Transcript, or
        # nil when +n+ is out of range. Also available as #[].
        def get_transcript(n=1)
          raw = read_transcript(n)
          return nil if raw.nil? || raw.empty?
          head, *rest = raw.split("\n")
          x=Bio::Ngs::Cufflinks::Transcript.new
          x.tra = head+"\n"
          x.exons = rest.map{|ei| ei+"\n"}
          x
        end

        alias :[] :get_transcript

      end #GtfParser

    end #Cufflinks
  end #Ngs
end #Bio
231
+
232
+ # class Array
233
+ # def to_ranges
234
+ # sorted=self.sort
235
+ # left = sorted.first
236
+ # ranges = sorted.compact.uniq.sort.map do |e|
237
+ # if sorted[sorted.index(e) +1] == e.succ
238
+ # right = e.succ
239
+ # nil # set the elements between the ranges to nil
240
+ # else
241
+ # range_left = left
242
+ # left=sorted[sorted.index(e) +1]
243
+ # range_left == e ? e : Range.new(range_left, e)
244
+ # end
245
+ # end
246
+ # ranges.compact
247
+ # end
248
+ # end
@@ -0,0 +1,154 @@
1
+
2
module Bio
  module Ngs
    module Cufflinks
      # One transcript record of a GTF file: the "transcript" line, parsed
      # into its fixed fields and attributes, plus the raw text of its exon
      # lines.
      #
      # Sequence names are rendered according to #chr_notation:
      # * :ensembl (default) removes a leading "chr"
      # * :ucsc adds a leading "chr" when the name does not already have one
      # The raw lines kept in memory are never modified.
      class Transcript
        Fields = %w(seqname source feature start stop score strand frame).freeze
        Attr_to_Float = %w(FPKM frac conf_lo conf_hi cov).freeze
        Attr_to_Integer = %w(exon_number).freeze
        ChrNotation = {ensembl:"", ucsc:"chr"}.freeze

        attr_accessor :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :chr_notation

        # Raw exon lines (each including its line terminator), in file order.
        attr_accessor :exons

        # Attributes of the transcript line, keyed by Symbol. Values listed in
        # Attr_to_Float / Attr_to_Integer are converted, the others stay Strings.
        attr_reader :attributes

        def initialize()
          @tra = nil
          @exons = []
          @attributes = {}
          @chr_notation = :ensembl #ensembl/ucsc
        end

        # The whole transcript line with its sequence name rendered in the
        # current notation. Returns nil / "" when no line has been set.
        def tra
          render_line(@tra)
        end

        # Stores +line+ as the transcript line and parses it: the first eight
        # whitespace separated fields fill the accessors (start and stop as
        # Integers), the remainder is read as `key "value";` attributes.
        # Attributes of a previously stored line are discarded.
        def tra=(line)
          @tra = line
          data=line.split
          @seqname = data[0]
          @source = data[1]
          @feature = data[2]
          @start = data[3].to_i
          @stop = data[4].to_i
          @score = data[5]
          @strand = data[6]
          @frame = data[7]
          @attributes = parse_attributes(data[8..-1])
        end

        # True when the transcript has more than one exon line.
        def multi_exons?
          exons.size > 1
        end

        # True when the transcript has exactly one exon line.
        def mono_exon?
          exons.size == 1
        end

        # Number of positions spanned by the transcript, both ends included.
        def size
          @stop-@start+1
        end

        # Transcript line followed by the exon lines, every sequence name
        # rendered in the current notation. Builds a new String on each call
        # and leaves the stored lines untouched.
        def to_s
          "#{tra}#{exons.map { |exon| render_line(exon) }.join}"
        end

        # Four tab separated columns per line: sequence name, start, stop and
        # "<gene_id>_<transcript_id>". One line per exon, preceded by a line
        # for the transcript itself unless +only_exons+ is true. Coordinates
        # are copied as they appear in the GTF lines.
        def to_bed(only_exons=true)
          rows = []
          unless only_exons
            rows << "#{convert_seqname(seqname)}\t#{start}\t#{stop}\t#{attributes[:gene_id]}_#{attributes[:transcript_id]}\n"
          end
          exons.each do |e|
            # Without quotes and semicolons the attribute values of gene_id
            # and transcript_id are the 10th and 12th whitespace fields.
            data = e.tr('";','').split
            rows << "#{convert_seqname(data[0])}\t#{data[3]}\t#{data[4]}\t#{data[9]}_#{data[11]}\n"
          end
          rows.join
        end

        # Empties the transcript line, the exon list and the attributes so
        # the instance can be reused for the next record.
        def clear
          @tra=""
          @attributes = {}
          @exons.clear
        end

        # Truthy when both gene_id and transcript_id look like CUFF.n / CUFF.n.m
        def brand_new_isoform?
          attributes[:gene_id]=~/CUFF\.\d+/ && attributes[:transcript_id]=~/CUFF\.\d+\.\d+/
        end

        # Truthy when gene_id looks like CUFF.n but transcript_id is not CUFF.n.m
        def new_isoform?
          attributes[:gene_id]=~/CUFF\.\d+/ && attributes[:transcript_id]!~/CUFF\.\d+\.\d+/
        end

        # True when neither gene_id nor transcript_id match the CUFF patterns.
        def annotated_isoform?
          attributes[:gene_id]!~/CUFF\.\d+/ && attributes[:transcript_id]!~/CUFF\.\d+\.\d+/
        end

        # Number of bytes of the stored raw lines (transcript line plus exon
        # lines), independent of the notation used for output.
        def byte_length
          exons.inject(@tra.to_s.bytesize) { |total, exon| total + exon.bytesize }
        end

        # Render sequence names with a leading "chr".
        def set_ucsc_notation
          @chr_notation = :ucsc
        end

        # Render sequence names without a leading "chr".
        def set_ensembl_notation
          @chr_notation = :ensembl
        end

        private

        # Parses the attribute tokens of a GTF line (everything after the
        # eighth field) into a Hash. Empty chunks are ignored; a missing
        # attribute column yields an empty Hash.
        def parse_attributes(tokens)
          return {} if tokens.nil?
          tokens.join(" ").split(';').each_with_object({}) do |attribute, parsed|
            key, value = attribute.tr('"','').split
            next if key.nil?
            parsed[key.to_sym] = if Attr_to_Float.include?(key)
                                   value.to_f
                                 elsif Attr_to_Integer.include?(key)
                                   value.to_i
                                 else
                                   value
                                 end
          end
        end

        # Returns +name+ rendered in the current notation (nil stays nil).
        def convert_seqname(name)
          return name if name.nil?
          case @chr_notation
          when :ensembl
            name.sub(/\Achr/, ChrNotation[:ensembl])
          when :ucsc
            name.start_with?(ChrNotation[:ucsc]) ? name : "#{ChrNotation[:ucsc]}#{name}"
          else
            name
          end
        end

        # Returns a copy of +line+ whose first field is rendered in the
        # current notation; nil and empty lines are returned as they are.
        def render_line(line)
          return line if line.nil? || line.empty?
          line.sub(/\A\S+/) { |name| convert_seqname(name) }
        end

      end #Transcript
    end #Cufflinks
  end #Ngs
end #Bio
@@ -0,0 +1,46 @@
1
module Bio
  module Ngs
    # File system helpers, all available as module methods (Bio::Ngs::FS.cat ...).
    module FS
      class << self
        # Writes the file +merged+ as the byte-for-byte concatenation of
        # +files+, in the given order. Also available as FS.merge.
        #
        # files::  an Array of file names, or a single file name
        # merged:: name of the file to create (overwritten if it exists)
        #
        # Returns the list of file names that were copied, or nil (creating
        # nothing) when +files+ is neither an Array nor a String.
        def cat(files, merged)
          sources = files.is_a?(String) ? [files] : files
          return nil unless sources.is_a?(Array)
          File.open(merged,'wb') do |fmerge|
            sources.each do |fname|
              # Stream the bytes: the inputs may be compressed, so reading
              # them "line by line" has no meaning.
              File.open(fname,'rb') { |input| IO.copy_stream(input, fmerge) }
            end
          end
          sources
        end #cat
        alias :merge :cat

        # Expands +everything+ into a flat list of absolute file names.
        #
        # A String is tried, in this order, as
        # * an existing file: returned as the only element
        # * a directory: its direct children that are files, limited to the
        #   names ending with +suffix+ when one is given
        # * a pattern containing "*": expanded with Dir.glob
        # * a list of the above separated by space, comma, colon or semicolon
        #
        # An Array is expanded element by element with the same rules and the
        # same +suffix+; elements that match nothing are left out. Expanded
        # names are returned in sorted order within each directory / pattern.
        #
        # Returns nil when +everything+ is a String matching none of the cases
        # above, or is neither a String nor an Array.
        def files(everything, suffix=nil)
          case everything
          when String
            if File.file? everything
              [File.expand_path(everything)]
            elsif File.directory? everything
              children = Dir.glob(File.join(everything, "*#{suffix}")).sort
              files(children.select{|item| File.file? item})
            elsif everything=~/\*/
              files(Dir.glob(everything).sort, suffix)
            elsif everything=~/[ ,:;]/
              files(everything.split(/[ ,:;]/), suffix)
            end
          when Array
            everything.map { |item| files(item, suffix) }.flatten.compact
          end
        end
      end #self
    end #FS
  end #Ngs
end #Bio