bio-ngs 0.4.6.alpha.01 → 0.4.6.alpha.02
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -2
- data/Gemfile.lock +21 -21
- data/README.rdoc +51 -4
- data/VERSION +1 -1
- data/bin/biongs +1 -0
- data/bio-ngs.gemspec +36 -8
- data/features/cufflinks_gtf_parser.feature +22 -0
- data/features/cufflinks_gtf_parser_indexing.feature +20 -0
- data/features/step_definitions/cufflinks_gtf.rb +30 -0
- data/features/step_definitions/cufflinks_gtf_parser_indexing.rb +53 -0
- data/features/support/env.rb +2 -0
- data/lib/bio-ngs.rb +19 -5
- data/lib/bio/appl/ngs/cufflinks.rb +447 -281
- data/lib/bio/appl/ngs/cufflinks/gtf/gtf.rb +23 -0
- data/lib/bio/appl/ngs/cufflinks/gtf/gtf_parser.rb +248 -0
- data/lib/bio/appl/ngs/cufflinks/gtf/transcript.rb +154 -0
- data/lib/bio/ngs/fs.rb +46 -0
- data/lib/bio/ngs/illumina/fastq.rb +176 -0
- data/lib/bio/ngs/illumina/illumina.rb +64 -0
- data/lib/bio/ngs/illumina/project.rb +81 -0
- data/lib/bio/ngs/illumina/sample.rb +85 -0
- data/lib/bio/ngs/task.rb +1 -1
- data/lib/bio/ngs/utils.rb +124 -112
- data/lib/meta.rb +162 -0
- data/lib/tasks/convert.thor +14 -14
- data/lib/tasks/filter.thor +158 -23
- data/lib/tasks/quality.thor +24 -4
- data/lib/tasks/rna.thor +26 -0
- data/lib/wrapper.rb +28 -0
- data/spec/bio/ngs/fs_spec.rb +70 -0
- data/spec/bio/ngs/illumina/fastq_spec.rb +52 -0
- data/spec/bio/ngs/illumina/illumina_spec.rb +21 -0
- data/spec/bio/ngs/illumina/project_spec.rb +0 -0
- data/spec/bio/ngs/illumina/sample_spec.rb +0 -0
- data/spec/bio/ngs/illumina/samples_spec.rb +0 -0
- data/spec/filter_spec.rb +25 -0
- data/spec/fixture/table_filter_list.txt +3 -0
- data/spec/fixture/table_filter_list_first_column.txt +2 -0
- data/spec/fixture/table_filter_source.tsv +44 -0
- data/spec/fixture/test-filtered-reference.fastq.gz +0 -0
- data/spec/fixture/test-merged-reference.fastq.gz +0 -0
- data/spec/fixture/test.fastq.gz +0 -0
- data/spec/meta_spec.rb +117 -0
- data/spec/spec_helper.rb +1 -1
- metadata +97 -69
@@ -0,0 +1,23 @@
|
|
1
|
+
module Bio
  module Ngs
    module Cufflinks
      # Handle on a GTF file. All iteration, filtering and indexing
      # behaviour comes from the GtfParser mixin; this class only owns
      # the underlying IO object.
      class Gtf
        #include MarkCall
        include GtfParser

        # Opens +file+ for reading (the path is made absolute first) and
        # keeps the resulting handle as the parser's source.
        def initialize(file)
          absolute = File.absolute_path(file)
          @fh = File.open(absolute)
        end

        # The IO object the parser reads from.
        def source
          @fh
        end

        # Replaces the IO object the parser reads from.
        def source=(src)
          @fh = src
        end
      end # Gtf
    end # Cufflinks
  end # Ngs
end # Bio
|
@@ -0,0 +1,248 @@
|
|
1
|
+
# TODO:
|
2
|
+
# * when select or first each transcript create an index. Be aware to return/create the right index for the requested filtering.
|
3
|
+
# issue: filtering is applied but the index is created and saved for the original source file.
|
4
|
+
|
5
|
+
|
6
|
+
module Bio
|
7
|
+
module Ngs
|
8
|
+
module Cufflinks
|
9
|
+
# TODO use a specific class for each block (transcript)
|
10
|
+
module GtfParser
|
11
|
+
attr_accessor :lazy
|
12
|
+
require 'tempfile'
|
13
|
+
# Yields every transcript of the source, one at a time.
#
# The block receives two arguments: the transcript and a line number.
# The line number is the 1-based number of the line that follows the
# transcript's last line (i.e. the line of the next transcript record).
#
# The SAME Transcript object is reused and cleared between iterations,
# so callers must copy whatever they need inside the block.
#
# When filters were queued by a lazy #select, they are applied first
# (a transcript is kept only if every filter returns a truthy value) and
# the iteration runs over the filtered result. The queued filters are
# consumed by this call.
#
# Lines that are neither "transcript" nor "exon" records are ignored.
def each_transcript(&block)
  if @blocks.nil? || @blocks.empty?
    @fh.rewind
    return if @fh.eof? # empty source: nothing to yield

    transcript = Transcript.new
    # The first line of the source is taken as the first transcript record.
    transcript.tra = @fh.readline
    @fh.each_line do |line|
      if line =~ /\ttranscript\t/
        # A new transcript starts: hand over the one collected so far.
        block.call(transcript, @fh.lineno)
        transcript.clear
        transcript.tra = line
      elsif line =~ /\texon\t/
        transcript.exons << line
      end
    end
    # The last transcript has no following record to trigger the yield.
    block.call(transcript, @fh.lineno + 1)
  else #lazy
    blocks_to_run = @blocks
    @blocks = []
    not_lazy
    begin
      result = select do |transcript|
        blocks_to_run.all? { |b| b.call(transcript) }
      end
    ensure
      # Restore lazy mode even when a filter raised.
      set_lazy
    end
    # select returns nil when nothing matched.
    result.each_transcript(&block) if result
  end #lazy or not?
end
|
41
|
+
|
42
|
+
# Filters transcripts with +block+.
#
# In lazy mode the block is only queued and +self+ is returned, so
# several filters can be chained. Otherwise every transcript for which
# the block returns a truthy value is written to a temporary file and a
# new Gtf reading that file is returned (nil when nothing matched).
def select(&block)
  unless is_lazy?
    # Find out how to concatenate multiple selections
    tmp = Tempfile.new("transcripts")
    each_transcript do |transcript|
      tmp.write transcript.to_s if block.call(transcript)
    end
    return(tmp.size == 0 ? nil : Gtf.new(tmp.path))
  end

  @blocks ||= []
  @blocks << block
  self
end #select
|
58
|
+
|
59
|
+
# Keeps transcripts that have more than one exon, span more than
# +length+ and whose :cov attribute is greater than +coverage+.
def multi_exon_with_lengh_and_coverage(length, coverage)
  select { |t| t.multi_exons? && (t.size > length) && (t.attributes[:cov] > coverage) }
end
|
64
|
+
|
65
|
+
# Keeps transcripts made of more than one exon.
def multi_exons
  # mark
  select { |t| t.multi_exons? } # transcript line and exon line
end
|
71
|
+
|
72
|
+
# Keeps transcripts made of exactly one exon.
def mono_exon
  # mark
  select { |t| t.mono_exon? } # transcript line and exon line
end
|
78
|
+
|
79
|
+
# Keeps transcripts whose size is strictly greater than +length+.
def length_gt(length)
  select { |t| t.size > length }
end
|
84
|
+
|
85
|
+
|
86
|
+
# Keeps transcripts for which Transcript#brand_new_isoform? is truthy.
def brand_new_isoforms
  select { |t| t.brand_new_isoform? }
end
|
91
|
+
|
92
|
+
# Keeps transcripts for which Transcript#new_isoform? is truthy.
def new_isoforms
  select { |t| t.new_isoform? }
end
|
97
|
+
|
98
|
+
# Keeps transcripts for which Transcript#annotated_isoform? is truthy.
def annotated_isoforms
  select { |t| t.annotated_isoform? }
end
|
103
|
+
|
104
|
+
# Keeps transcripts whose :cov attribute is strictly greater than +size+.
def coverage_gt(size)
  select { |t| t.attributes[:cov] > size }
end
|
109
|
+
|
110
|
+
# Converts transcripts.gtf to transcripts.gff3 through the GffRead wrapper.
#
# path:: directory expected to contain transcripts.gtf (default ".").
#
# Raises ArgumentError when transcripts.gtf is not found in +path+.
#
# NOTE(review): the existence check honours +path+, but GffRead is called
# with bare file names, so the conversion presumably only works when
# +path+ is the current directory — confirm against the GffRead wrapper.
def to_gff3(path=".")
  # File.exist? replaces File.exists?, which newer Rubies no longer provide.
  unless File.exist?(File.join(path, "transcripts.gtf"))
    raise ArgumentError, "transcripts.gtf doesn't exists in #{path}"
  end

  gffread = GffRead.new
  gffread.params = {output:"transcripts.gff3"}
  gffread.run :arguments=>["transcripts.gtf"], :separator=>''
end #to_gff3
|
119
|
+
|
120
|
+
# Yields each transcript together with its BED rendering.
#
# only_exons:: forwarded to Transcript#to_bed (default true).
# The block receives (transcript, bed_string).
def to_bed(only_exons=true, &block)
  each_transcript { |transcript| block.call(transcript, transcript.to_bed(only_exons)) }
end #to_bed
|
125
|
+
|
126
|
+
# Turns lazy mode on (sets @lazy to true).
def set_lazy
  @lazy = true
end
|
129
|
+
|
130
|
+
# Current value of the lazy flag (nil until it has been set).
def is_lazy?
  @lazy
end
|
133
|
+
|
134
|
+
# Turns lazy mode off (sets @lazy to false).
def not_lazy
  @lazy = false
end
|
137
|
+
|
138
|
+
# Writes every transcript to a file.
#
# filename:: destination path; when omitted, the source path with a
#            ".gtf" suffix appended is used.
def save(filename=nil)
  fn = filename || "#{@fh.path}.gtf"
  File.open(fn, 'w') do |out|
    each_transcript { |transcript| out.write transcript }
  end
  # dump_idx("#{fn}.idx") #BUGGY this saves the old index in case the user called a select
end #save
|
147
|
+
|
148
|
+
# Number of transcripts yielded by each_transcript.
def count
  total = 0
  each_transcript { total += 1 }
  total
end #count
|
155
|
+
|
156
|
+
# Builds the in-memory index and stores it in @idx.
#
# The index is a Hash whose missing keys default to a fresh Array.
# idx[:transcripts] receives, in iteration order, the byte_length of
# every transcript; idx[:exons] is created but left empty (exon
# indexing is not implemented).
def build_idx
  idx = Hash.new { |hash, key| hash[key] = [] }
  # Touch both keys so they exist even when no transcript is found.
  [:transcripts, :exons].each { |key| idx[key] }
  each_transcript do |t, f_lno|
    idx[:transcripts] << t.byte_length
  end
  @idx = idx
end #build_idx
|
171
|
+
|
172
|
+
# Serializes the index with Marshal and returns the file name used.
#
# fn:: destination path; defaults to the source path plus ".idx".
#
# The index is built first when it does not exist yet. The Hash default
# is saved under the :default_hash key and reset before dumping, because
# Marshal cannot dump a Hash that carries a default proc; assigning
# Hash#default= drops that proc. As a consequence the in-memory index
# keeps the :default_hash key after this call.
def dump_idx(fn=nil)
  fn ||= "#{source.path}.idx"

  build_idx unless defined?(@idx)
  @idx[:default_hash] = @idx.default
  @idx.default = nil
  # Marshal output is binary data: write it through a binary-mode handle.
  File.open(fn, "wb") { |f| Marshal.dump(@idx, f) }
  @idx.default = @idx[:default_hash]
  fn
end #dump_idx
|
184
|
+
|
185
|
+
# Loads the index stored next to the source ("<source path>.idx") into
# @idx, or builds and dumps a new one when that file does not exist.
#
# NOTE(review): Marshal.load must only be fed trusted data; the index
# file is assumed to have been produced by dump_idx.
def load_idx
  idx_path = "#{source.path}.idx"
  # File.exist? replaces File.exists?, which newer Rubies no longer provide.
  if File.exist?(idx_path)
    # Block form closes the handle; "rb" because Marshal data is binary.
    @idx = File.open(idx_path, "rb") { |f| Marshal.load(f) }
    @idx.default = @idx[:default_hash]
  else
    build_idx
    dump_idx
  end
end # load_idx
|
194
|
+
|
195
|
+
# The current index Hash (nil until it has been built or loaded).
def index
  @idx
end
|
198
|
+
|
199
|
+
# start from 1
|
200
|
+
# Reads the raw text of the n-th transcript (numbering starts at 1).
#
# The index is loaded first when missing. The start offset is the sum of
# the lengths recorded for the preceding transcripts; the amount read is
# the length recorded for the requested one.
def read_transcript(n=1)
  load_idx unless defined?(@idx)
  lengths = @idx[:transcripts]
  # An empty slice (n == 1) sums to 0, i.e. the beginning of the source.
  offset = lengths[0...(n - 1)].sum
  source.seek(offset)
  source.read(lengths[n - 1])
end
|
213
|
+
|
214
|
+
# Builds a Transcript object for the n-th transcript (numbering starts
# at 1). The first line of the raw text becomes the transcript record,
# the remaining lines its exons; each line gets its "\n" back.
def get_transcript(n=1)
  head, *tail = read_transcript(n).split("\n")
  transcript = Bio::Ngs::Cufflinks::Transcript.new
  transcript.tra = head + "\n"
  transcript.exons = tail.map { |exon_line| exon_line + "\n" }
  transcript
end
|
223
|
+
|
224
|
+
alias :[] :get_transcript
|
225
|
+
|
226
|
+
end #GtfParser
|
227
|
+
|
228
|
+
end #Cufflinks
|
229
|
+
end #Ngs
|
230
|
+
end #Bio
|
231
|
+
|
232
|
+
# class Array
|
233
|
+
# def to_ranges
|
234
|
+
# sorted=self.sort
|
235
|
+
# left = sorted.first
|
236
|
+
# ranges = sorted.compact.uniq.sort.map do |e|
|
237
|
+
# if sorted[sorted.index(e) +1] == e.succ
|
238
|
+
# right = e.succ
|
239
|
+
# nil # set the elements between the ranges to nil
|
240
|
+
# else
|
241
|
+
# range_left = left
|
242
|
+
# left=sorted[sorted.index(e) +1]
|
243
|
+
# range_left == e ? e : Range.new(range_left, e)
|
244
|
+
# end
|
245
|
+
# end
|
246
|
+
# ranges.compact
|
247
|
+
# end
|
248
|
+
# end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
|
2
|
+
module Bio
  module Ngs
    module Cufflinks
      # A transcript record of a GTF file together with its exon lines.
      #
      # The raw transcript line is parsed by #tra= into the positional
      # fields (seqname, source, ...) and the attribute Hash. Exons are
      # kept as raw lines. +chr_notation+ (:ensembl or :ucsc) controls how
      # sequence names are rendered on output: :ensembl strips a leading
      # "chr", :ucsc makes sure the name starts with "chr".
      class Transcript
        Fields = %w(seqname source feature start stop score strand frame)
        Attr_to_Float = %w(FPKM frac conf_lo conf_hi cov)
        Attr_to_Integer = %w(exon_number)
        ChrNotation = {ensembl:"", ucsc:"chr"}

        attr_accessor :seqname, :source, :feature, :start, :stop, :score, :strand, :frame, :chr_notation
        # Raw exon lines (each one expected to end with "\n").
        attr_accessor :exons
        # Attributes parsed from the transcript line, keyed by Symbol.
        attr_reader :attributes

        def initialize()
          @tra = nil
          @exons = []
          @attributes = {}
          @chr_notation = :ensembl #ensembl/ucsc
        end

        # The whole transcript line, with its sequence name rendered in the
        # current notation. Returns a new String (nil when no line was set);
        # the stored raw line is never modified.
        def tra
          @tra && convert_line(@tra)
        end

        # Stores the raw transcript +line+ and parses it.
        #
        # The first eight whitespace-separated columns fill the positional
        # fields (start and stop as Integers). The remaining columns are
        # read as ';'-separated "key value" pairs; values of the keys listed
        # in Attr_to_Float / Attr_to_Integer are converted accordingly.
        def tra=(line)
          @tra = line
          data = line.split
          @seqname = data[0]
          @source = data[1]
          @feature = data[2]
          @start = data[3].to_i
          @stop = data[4].to_i
          @score = data[5]
          @strand = data[6]
          @frame = data[7]
          (data[8..-1] || []).join(" ").split(';').each do |attribute|
            key, value = attribute.tr('"', '').split
            next if key.nil? # empty segment, e.g. ";;"
            @attributes[key.to_sym] = if Attr_to_Float.include? key
              value.to_f
            elsif Attr_to_Integer.include? key
              value.to_i
            else
              value
            end
          end
        end

        # True when the transcript has more than one exon line.
        def multi_exons?
          exons.size > 1
        end

        # True when the transcript has exactly one exon line.
        def mono_exon?
          exons.size == 1
        end

        # Genomic span of the transcript (stop - start + 1).
        def size
          @stop-@start+1
        end

        # Transcript line followed by all exon lines, sequence names in the
        # current notation. Builds a new String on every call; the stored
        # lines are left untouched, so repeated calls give the same result.
        def to_s
          "#{tra}#{exons.map { |exon| convert_line(exon) }.join}"
        end

        # BED-like rendering: one "seqname\tstart\tstop\tname\n" row per
        # exon, preceded by a row for the transcript itself unless
        # +only_exons+ is true. Exon names are built from columns 9 and 11
        # of the exon line once quotes and semicolons are removed
        # (presumably the gene_id and transcript_id values — verify against
        # the input layout).
        def to_bed(only_exons=true)
          rows = []
          unless only_exons
            rows << "#{convert_seqname(seqname)}\t#{start}\t#{stop}\t#{attributes[:gene_id]}_#{attributes[:transcript_id]}\n"
          end
          exons.each do |e|
            data = e.tr('";','').split
            rows << "#{convert_seqname(data[0])}\t#{data[3]}\t#{data[4]}\t#{data[9]}_#{data[11]}\n"
          end
          rows.join
        end

        # Empties the object so it can be reused for another transcript:
        # raw line, exon list and parsed attributes are all reset. The exon
        # Array is cleared in place.
        def clear
          @tra=""
          @exons.clear
          @attributes = {}
        end

        # Truthy when both gene_id and transcript_id carry CUFF identifiers.
        def brand_new_isoform?
          attributes[:gene_id]=~/CUFF\.\d+/ && attributes[:transcript_id]=~/CUFF\.\d+\.\d+/
        end

        # Truthy when gene_id is a CUFF identifier but transcript_id is not.
        def new_isoform?
          attributes[:gene_id]=~/CUFF\.\d+/ && attributes[:transcript_id]!~/CUFF\.\d+\.\d+/
        end

        # Truthy when neither gene_id nor transcript_id is a CUFF identifier.
        def annotated_isoform?
          attributes[:gene_id]!~/CUFF\.\d+/ && attributes[:transcript_id]!~/CUFF\.\d+\.\d+/
        end

        # Number of bytes taken by the RAW transcript and exon lines, i.e.
        # independent of the output notation.
        def byte_length
          exons.inject(@tra.to_s.bytesize) { |total, exon| total + exon.bytesize }
        end

        # Output sequence names with a leading "chr".
        def set_ucsc_notation
          @chr_notation = :ucsc
        end

        # Output sequence names without a leading "chr".
        def set_ensembl_notation
          @chr_notation = :ensembl
        end

        private

        # Renders a sequence name in the current notation.
        def convert_seqname(name)
          name = name.to_s
          case @chr_notation
          when :ensembl
            name.sub(/\Achr/, ChrNotation[:ensembl])
          when :ucsc
            name.start_with?(ChrNotation[:ucsc]) ? name : "#{ChrNotation[:ucsc]}#{name}"
          else
            name
          end
        end

        # Copy of a GTF line whose first column is rendered in the current notation.
        def convert_line(line)
          line.sub(/\A\S+/) { |name| convert_seqname(name) }
        end

      end #Transcript
    end #Cufflinks
  end #Ngs
end #Bio
|
data/lib/bio/ngs/fs.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
module Bio
  module Ngs
    # File-system helpers, callable directly on the module
    # (Bio::Ngs::FS.cat, Bio::Ngs::FS.files, ...).
    module FS
      class << self
        # Writes the file +merged+ as the byte-wise concatenation of
        # +files+ (an Array of file names, e.g. several fastq.gz).
        #
        # Returns +files+. Does nothing and returns nil when +files+ is
        # not an Array.
        def cat(files, merged)
          return unless files.is_a? Array

          File.open(merged, 'wb') do |fmerge|
            files.each do |fname|
              # Raw byte copy: the inputs are binary, so reading them
              # line by line has no meaning.
              File.open(fname, 'rb') { |file| IO.copy_stream(file, fmerge) }
            end #each
          end #write
        end #cat
        alias :merge :cat

        # Resolves +everything+ to a flat Array of absolute file names.
        #
        # everything:: a file name, a directory, a glob pattern (contains
        #              "*"), a list of those separated by space, comma,
        #              colon or semicolon, or an Array of any of them.
        # suffix::     when given, only directory entries ending with it
        #              are listed; it is also applied to directories found
        #              inside Arrays, separated lists and glob results.
        #
        # A String that matches none of the cases yields nil (which shows
        # up as a nil element when it was part of an Array or list).
        def files(everything, suffix=nil)
          case everything
          when String
            if File.file? everything
              [File.expand_path(everything)]
            elsif File.directory? everything
              pattern = File.join(everything, suffix.nil? ? "*" : "*"+suffix)
              files(Dir.glob(pattern).select{|item| File.file? item}).flatten
            elsif everything=~/\*/
              files(Dir.glob(everything), suffix).flatten
            elsif everything=~/[ ,:;]/
              files(everything.split(/[ ,:;]/), suffix)
            end
          when Array
            everything.map { |item| files(item, suffix) }.flatten
          end
        end
      end #self
    end #FS
  end #Ngs
end #Bio
|