bio-ngs 0.3.2.alpha.01
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +39 -0
- data/Gemfile.lock +81 -0
- data/LICENSE.txt +28 -0
- data/README.rdoc +240 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/biongs +35 -0
- data/bio-ngs.gemspec +215 -0
- data/ext/mkrf_conf.rb +87 -0
- data/lib/bio-ngs.rb +54 -0
- data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
- data/lib/bio/appl/ngs/blast.rb +36 -0
- data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
- data/lib/bio/appl/ngs/cufflinks.rb +489 -0
- data/lib/bio/appl/ngs/fastx.rb +170 -0
- data/lib/bio/appl/ngs/samtools.rb +118 -0
- data/lib/bio/appl/ngs/sff_extract.rb +23 -0
- data/lib/bio/appl/ngs/tophat.rb +158 -0
- data/lib/bio/ngs/converter.rb +100 -0
- data/lib/bio/ngs/core_ext.rb +12 -0
- data/lib/bio/ngs/db.rb +66 -0
- data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
- data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
- data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
- data/lib/bio/ngs/db/models.rb +1 -0
- data/lib/bio/ngs/db/models/homology.rb +8 -0
- data/lib/bio/ngs/db/models/ontology.rb +16 -0
- data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
- data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
- data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
- data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
- data/lib/bio/ngs/ext/versions.yaml +73 -0
- data/lib/bio/ngs/graphics.rb +189 -0
- data/lib/bio/ngs/homology.rb +102 -0
- data/lib/bio/ngs/ontology.rb +103 -0
- data/lib/bio/ngs/quality.rb +64 -0
- data/lib/bio/ngs/record.rb +50 -0
- data/lib/bio/ngs/task.rb +46 -0
- data/lib/bio/ngs/utils.rb +176 -0
- data/lib/development_tasks.rb +34 -0
- data/lib/enumerable.rb +37 -0
- data/lib/tasks/bwa.thor +126 -0
- data/lib/tasks/convert.thor +454 -0
- data/lib/tasks/history.thor +51 -0
- data/lib/tasks/homology.thor +121 -0
- data/lib/tasks/ontology.thor +93 -0
- data/lib/tasks/project.thor +51 -0
- data/lib/tasks/quality.thor +142 -0
- data/lib/tasks/rna.thor +126 -0
- data/lib/tasks/sff_extract.thor +9 -0
- data/lib/templates/README.tt +43 -0
- data/lib/templates/db.tt +6 -0
- data/lib/wrapper.rb +225 -0
- data/spec/converter_qseq_spec.rb +56 -0
- data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
- data/spec/quality_spec.rb +40 -0
- data/spec/sff_extract_spec.rb +98 -0
- data/spec/spec_helper.rb +55 -0
- data/spec/tophat_spec.rb +99 -0
- data/spec/utils_spec.rb +22 -0
- data/test/conf/test_db.yml +4 -0
- data/test/data/blastoutput.xml +69 -0
- data/test/data/gene-GO.json +1 -0
- data/test/data/goa_uniprot +27 -0
- data/test/data/goslim_goa.obo +1763 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-ngs.rb +17 -0
- data/test/test_db.rb +21 -0
- data/test/test_homology.rb +102 -0
- data/test/test_ngs.rb +21 -0
- data/test/test_ontology.rb +74 -0
- data/test/test_utils.rb +29 -0
- metadata +460 -0
@@ -0,0 +1,1505 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
'''This software extracts the seq, qual and ancillary information from an sff
|
3
|
+
file, like the ones used by the 454 sequencer.
|
4
|
+
|
5
|
+
Optionally, it can also split paired-end reads if given the linker sequence.
|
6
|
+
The splitting is done with maximum match, i.e., every occurence of the linker
|
7
|
+
sequence will be removed, even if occuring multiple times.'''
|
8
|
+
|
9
|
+
#copyright Jose Blanca and Bastien Chevreux
|
10
|
+
#COMAV institute, Universidad Politecnica de Valencia (UPV)
|
11
|
+
#Valencia, Spain
|
12
|
+
|
13
|
+
# additions to handle paired end reads by Bastien Chevreux
|
14
|
+
# bugfixes for linker specific lengths: Lionel Guy
|
15
|
+
|
16
|
+
#This program is free software: you can redistribute it and/or modify
|
17
|
+
#it under the terms of the GNU General Public License as published by
|
18
|
+
#the Free Software Foundation, either version 3 of the License, or
|
19
|
+
#(at your option) any later version.
|
20
|
+
#This program is distributed in the hope that it will be useful,
|
21
|
+
#but WITHOUT ANY WARRANTY; without even the implied warranty of
|
22
|
+
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
23
|
+
#GNU General Public License for more details.
|
24
|
+
#You should have received a copy of the GNU General Public License
|
25
|
+
#along with this program. If not, see <http://www.gnu.org/licenses/>.
|
26
|
+
|
27
|
+
__author__ = 'Jose Blanca and Bastien Chevreux'
|
28
|
+
__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
|
29
|
+
__license__ = 'GPLv3 or later'
|
30
|
+
__version__ = '0.2.8'
|
31
|
+
__email__ = 'jblanca@btc.upv.es'
|
32
|
+
__status__ = 'beta'
|
33
|
+
|
34
|
+
import struct
|
35
|
+
import sys
|
36
|
+
import os
|
37
|
+
import subprocess
|
38
|
+
import tempfile
|
39
|
+
|
40
|
+
|
41
|
+
fake_sff_name = 'fake_sff_name'
|
42
|
+
|
43
|
+
|
44
|
+
# readname as key: lines with matches from SSAHA, one best match
|
45
|
+
ssahapematches = {}
|
46
|
+
# linker readname as key: length of linker sequence
|
47
|
+
linkerlengths = {}
|
48
|
+
|
49
|
+
# set to true if something really fishy is going on with the sequences
|
50
|
+
stern_warning = True
|
51
|
+
|
52
|
+
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
|
53
|
+
byte_padding=None):
|
54
|
+
'''It reads a chunk of a binary file.
|
55
|
+
|
56
|
+
You have to provide the struct, a file object, the offset (where to start
|
57
|
+
reading).
|
58
|
+
Also you can provide an optional dict that will be populated with the
|
59
|
+
extracted data.
|
60
|
+
If a byte_padding is given the number of bytes read will be a multiple of
|
61
|
+
that number, adding the required pad at the end.
|
62
|
+
It returns the number of bytes reads and the data dict.
|
63
|
+
'''
|
64
|
+
if data is None:
|
65
|
+
data = {}
|
66
|
+
|
67
|
+
#we read each item
|
68
|
+
bytes_read = 0
|
69
|
+
for item in struct_def:
|
70
|
+
#we go to the place and read
|
71
|
+
fileh.seek(offset + bytes_read)
|
72
|
+
n_bytes = struct.calcsize(item[1])
|
73
|
+
buffer = fileh.read(n_bytes)
|
74
|
+
read = struct.unpack('>' + item[1], buffer)
|
75
|
+
if len(read) == 1:
|
76
|
+
read = read[0]
|
77
|
+
data[item[0]] = read
|
78
|
+
bytes_read += n_bytes
|
79
|
+
|
80
|
+
#if there is byte_padding the bytes_to_read should be a multiple of the
|
81
|
+
#byte_padding
|
82
|
+
if byte_padding is not None:
|
83
|
+
pad = byte_padding
|
84
|
+
bytes_read = ((bytes_read + pad - 1) // pad) * pad
|
85
|
+
|
86
|
+
return (bytes_read, data)
|
87
|
+
|
88
|
+
|
89
|
+
def check_magic(magic):
|
90
|
+
'''It checks that the magic number of the file matches the sff magic.'''
|
91
|
+
if magic != 779314790:
|
92
|
+
raise RuntimeError('This file does not seems to be an sff file.')
|
93
|
+
|
94
|
+
def check_version(version):
|
95
|
+
'''It checks that the version is supported, otherwise it raises an error.'''
|
96
|
+
supported = ('\x00', '\x00', '\x00', '\x01')
|
97
|
+
i = 0
|
98
|
+
for item in version:
|
99
|
+
if version[i] != supported[i]:
|
100
|
+
raise RuntimeError('SFF version not supported. Please contact the author of the software.')
|
101
|
+
i += 1
|
102
|
+
|
103
|
+
def read_header(fileh):
|
104
|
+
'''It reads the header from the sff file and returns a dict with the
|
105
|
+
information'''
|
106
|
+
#first we read the first part of the header
|
107
|
+
head_struct = [
|
108
|
+
('magic_number', 'I'),
|
109
|
+
('version', 'cccc'),
|
110
|
+
('index_offset', 'Q'),
|
111
|
+
('index_length', 'I'),
|
112
|
+
('number_of_reads', 'I'),
|
113
|
+
('header_length', 'H'),
|
114
|
+
('key_length', 'H'),
|
115
|
+
('number_of_flows_per_read', 'H'),
|
116
|
+
('flowgram_format_code', 'B'),
|
117
|
+
]
|
118
|
+
data = {}
|
119
|
+
first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
|
120
|
+
offset=0, data=data)
|
121
|
+
check_magic(data['magic_number'])
|
122
|
+
check_version(data['version'])
|
123
|
+
#now that we know the number_of_flows_per_read and the key_length
|
124
|
+
#we can read the second part of the header
|
125
|
+
struct2 = [
|
126
|
+
('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
|
127
|
+
('key_sequence', str(data['key_length']) + 'c')
|
128
|
+
]
|
129
|
+
read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
|
130
|
+
data=data)
|
131
|
+
return data
|
132
|
+
|
133
|
+
|
134
|
+
def read_sequence(header, fileh, fposition):
|
135
|
+
'''It reads one read from the sff file located at the fposition and
|
136
|
+
returns a dict with the information.'''
|
137
|
+
header_length = header['header_length']
|
138
|
+
index_offset = header['index_offset']
|
139
|
+
index_length = header['index_length']
|
140
|
+
|
141
|
+
#the sequence struct
|
142
|
+
read_header_1 = [
|
143
|
+
('read_header_length', 'H'),
|
144
|
+
('name_length', 'H'),
|
145
|
+
('number_of_bases', 'I'),
|
146
|
+
('clip_qual_left', 'H'),
|
147
|
+
('clip_qual_right', 'H'),
|
148
|
+
('clip_adapter_left', 'H'),
|
149
|
+
('clip_adapter_right', 'H'),
|
150
|
+
]
|
151
|
+
def read_header_2(name_length):
|
152
|
+
'''It returns the struct definition for the second part of the header'''
|
153
|
+
return [('name', str(name_length) +'c')]
|
154
|
+
def read_data(number_of_bases):
|
155
|
+
'''It returns the struct definition for the read data section.'''
|
156
|
+
#size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
|
157
|
+
if header['flowgram_format_code'] == 1:
|
158
|
+
flow_type = 'H'
|
159
|
+
else:
|
160
|
+
raise Error('file version not supported')
|
161
|
+
number_of_bases = str(number_of_bases)
|
162
|
+
return [
|
163
|
+
('flowgram_values', str(header['number_of_flows_per_read']) +
|
164
|
+
flow_type),
|
165
|
+
('flow_index_per_base', number_of_bases + 'B'),
|
166
|
+
('bases', number_of_bases + 'c'),
|
167
|
+
('quality_scores', number_of_bases + 'B'),
|
168
|
+
]
|
169
|
+
|
170
|
+
data = {}
|
171
|
+
#we read the first part of the header
|
172
|
+
bytes_read, data = read_bin_fragment(struct_def=read_header_1,
|
173
|
+
fileh=fileh, offset=fposition, data=data)
|
174
|
+
|
175
|
+
read_bin_fragment(struct_def=read_header_2(data['name_length']),
|
176
|
+
fileh=fileh, offset=fposition + bytes_read, data=data)
|
177
|
+
#we join the letters of the name
|
178
|
+
data['name'] = ''.join(data['name'])
|
179
|
+
offset = data['read_header_length']
|
180
|
+
#we read the sequence and the quality
|
181
|
+
read_data_st = read_data(data['number_of_bases'])
|
182
|
+
bytes_read, data = read_bin_fragment(struct_def=read_data_st,
|
183
|
+
fileh=fileh, offset=fposition + offset,
|
184
|
+
data=data, byte_padding=8)
|
185
|
+
#we join the bases
|
186
|
+
data['bases'] = ''.join(data['bases'])
|
187
|
+
|
188
|
+
#print data
|
189
|
+
#print "pre cqr: ", data['clip_qual_right']
|
190
|
+
#print "pre car: ", data['clip_adapter_right']
|
191
|
+
#print "pre cql: ", data['clip_qual_left']
|
192
|
+
#print "pre cal: ", data['clip_adapter_left']
|
193
|
+
|
194
|
+
# correct for the case the right clip is <= than the left clip
|
195
|
+
# in this case, left clip is 0 are set to 0 (right clip == 0 means
|
196
|
+
# "whole sequence")
|
197
|
+
if data['clip_qual_right'] <= data['clip_qual_left'] :
|
198
|
+
data['clip_qual_right'] = 0
|
199
|
+
data['clip_qual_left'] = 0
|
200
|
+
if data['clip_adapter_right'] <= data['clip_adapter_left'] :
|
201
|
+
data['clip_adapter_right'] = 0
|
202
|
+
data['clip_adapter_left'] = 0
|
203
|
+
|
204
|
+
#the clipping section follows the NCBI's guidelines Trace Archive RFC
|
205
|
+
#http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
|
206
|
+
#if there's no adapter clip: qual -> vector
|
207
|
+
#else: qual-> qual
|
208
|
+
# adapter -> vector
|
209
|
+
|
210
|
+
if not data['clip_adapter_left']:
|
211
|
+
data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
|
212
|
+
if not data['clip_adapter_right']:
|
213
|
+
data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']
|
214
|
+
|
215
|
+
# see whether we have to override the minimum left clips
|
216
|
+
if config['min_leftclip'] > 0:
|
217
|
+
if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
|
218
|
+
data['clip_adapter_left'] = config['min_leftclip']
|
219
|
+
if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
|
220
|
+
data['clip_qual_left'] = config['min_leftclip']
|
221
|
+
|
222
|
+
|
223
|
+
#print "post cqr: ", data['clip_qual_right']
|
224
|
+
#print "post car: ", data['clip_adapter_right']
|
225
|
+
#print "post cql: ", data['clip_qual_left']
|
226
|
+
#print "post cal: ", data['clip_adapter_left']
|
227
|
+
|
228
|
+
|
229
|
+
# for handling the -c (clip) option gently, we already clip here
|
230
|
+
# and set all clip points to the sequence end points
|
231
|
+
if config['clip']:
|
232
|
+
data['bases'], data['quality_scores'] = clip_read(data)
|
233
|
+
|
234
|
+
data['number_of_bases']=len(data['bases'])
|
235
|
+
data['clip_qual_right'] = data['number_of_bases']
|
236
|
+
data['clip_adapter_right'] = data['number_of_bases']
|
237
|
+
data['clip_qual_left'] = 0
|
238
|
+
data['clip_adapter_left'] = 0
|
239
|
+
|
240
|
+
return data['read_header_length'] + bytes_read, data
|
241
|
+
|
242
|
+
|
243
|
+
def sequences(fileh, header):
|
244
|
+
'''It returns a generator with the data for each read.'''
|
245
|
+
#now we can read all the sequences
|
246
|
+
fposition = header['header_length'] #position in the file
|
247
|
+
reads_read = 0
|
248
|
+
while True:
|
249
|
+
if fposition == header['index_offset']:
|
250
|
+
#we have to skip the index section
|
251
|
+
fposition += index_length
|
252
|
+
continue
|
253
|
+
else:
|
254
|
+
bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
|
255
|
+
fposition=fposition)
|
256
|
+
yield seq_data
|
257
|
+
fposition += bytes_read
|
258
|
+
reads_read += 1
|
259
|
+
if reads_read >= header['number_of_reads']:
|
260
|
+
break
|
261
|
+
|
262
|
+
|
263
|
+
def remove_last_xmltag_in_file(fname, tag=None):
|
264
|
+
'''Given an xml file name and a tag, it removes the last tag of the
|
265
|
+
file if it matches the given tag. Tag removal is performed via file
|
266
|
+
truncation.
|
267
|
+
|
268
|
+
It the given tag is not the last in the file, a RunTimeError will be
|
269
|
+
raised.
|
270
|
+
|
271
|
+
The resulting xml file will be not xml valid. This function is a hack
|
272
|
+
that allows to append records to xml files in a quick and dirty way.
|
273
|
+
'''
|
274
|
+
|
275
|
+
fh = open(fname, 'r+')
|
276
|
+
#we have to read from the end to the start of the file and keep the
|
277
|
+
#string enclosed by </ >
|
278
|
+
i = -1
|
279
|
+
last_tag = [] #the chars that form the last tag
|
280
|
+
start_offset = None #in which byte does the last tag starts?
|
281
|
+
end_offset = None #in which byte does the last tag ends?
|
282
|
+
while True:
|
283
|
+
fh.seek(i, 2)
|
284
|
+
char = fh.read(1)
|
285
|
+
if not char.isspace():
|
286
|
+
last_tag.append(char)
|
287
|
+
if char == '>':
|
288
|
+
end_offset = i
|
289
|
+
if char == '<':
|
290
|
+
start_offset = i
|
291
|
+
break
|
292
|
+
i -= 1
|
293
|
+
|
294
|
+
#we have read the last tag backwards
|
295
|
+
last_tag = ''.join(last_tag[::-1])
|
296
|
+
#we remove the </ and >
|
297
|
+
last_tag = last_tag.rstrip('>').lstrip('</')
|
298
|
+
|
299
|
+
#we check that we're removing the asked tag
|
300
|
+
if tag is not None and tag != last_tag:
|
301
|
+
raise RuntimeError("The given xml tag wasn't the last one in the file")
|
302
|
+
|
303
|
+
# while we are at it: also remove all white spaces in that line :-)
|
304
|
+
i -= 1
|
305
|
+
while True:
|
306
|
+
fh.seek(i, 2)
|
307
|
+
char = fh.read(1)
|
308
|
+
if not char == ' ' and not char == '\t':
|
309
|
+
break;
|
310
|
+
if fh.tell() == 1:
|
311
|
+
break;
|
312
|
+
i -= 1
|
313
|
+
|
314
|
+
fh.truncate();
|
315
|
+
|
316
|
+
fh.close()
|
317
|
+
return last_tag
|
318
|
+
|
319
|
+
|
320
|
+
def create_basic_xml_info(readname, fname):
|
321
|
+
'''Formats a number of read specific infos into XML format.
|
322
|
+
Currently formated: name and the tags set from command line
|
323
|
+
'''
|
324
|
+
to_print = [' <trace>\n']
|
325
|
+
to_print.append(' <trace_name>')
|
326
|
+
to_print.append(readname)
|
327
|
+
to_print.append('</trace_name>\n')
|
328
|
+
|
329
|
+
#extra information
|
330
|
+
#do we have extra info for this file?
|
331
|
+
info = None
|
332
|
+
if config['xml_info']:
|
333
|
+
#with this name?
|
334
|
+
if fname in config['xml_info']:
|
335
|
+
info = config['xml_info'][fname]
|
336
|
+
else:
|
337
|
+
#with no name?
|
338
|
+
try:
|
339
|
+
info = config['xml_info'][fake_sff_name]
|
340
|
+
except KeyError:
|
341
|
+
pass
|
342
|
+
#we print the info that we have
|
343
|
+
if info:
|
344
|
+
for key in info:
|
345
|
+
to_print.append(' <' + key + '>' + info[key] + \
|
346
|
+
'</' + key +'>\n')
|
347
|
+
|
348
|
+
return ''.join(to_print)
|
349
|
+
|
350
|
+
|
351
|
+
def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
|
352
|
+
'''Takes the clip values of the read and formats them into XML
|
353
|
+
Corrects "wrong" values that might have resulted through
|
354
|
+
simplified calculations earlier in the process of conversion
|
355
|
+
(especially during splitting of paired-end reads)
|
356
|
+
'''
|
357
|
+
|
358
|
+
to_print = [""]
|
359
|
+
|
360
|
+
# if right borders are >= to read length, they don't need
|
361
|
+
# to be printed
|
362
|
+
if adapr >= readlen:
|
363
|
+
adapr = 0
|
364
|
+
if qualr >= readlen:
|
365
|
+
qualr = 0
|
366
|
+
|
367
|
+
# BaCh
|
368
|
+
# when called via split_paired_end(), some values may be < 0
|
369
|
+
# (when clip values were 0 previously)
|
370
|
+
# instead of putting tons of if clauses for different calculations there,
|
371
|
+
# I centralise corrective measure here
|
372
|
+
# set all values <0 to 0
|
373
|
+
|
374
|
+
if adapr < 0:
|
375
|
+
adapr = 0
|
376
|
+
if qualr <0:
|
377
|
+
qualr = 0
|
378
|
+
if adapl < 0:
|
379
|
+
adapl = 0
|
380
|
+
if quall <0:
|
381
|
+
quall = 0
|
382
|
+
|
383
|
+
if quall:
|
384
|
+
to_print.append(' <clip_quality_left>')
|
385
|
+
to_print.append(str(quall))
|
386
|
+
to_print.append('</clip_quality_left>\n')
|
387
|
+
if qualr:
|
388
|
+
to_print.append(' <clip_quality_right>')
|
389
|
+
to_print.append(str(qualr))
|
390
|
+
to_print.append('</clip_quality_right>\n')
|
391
|
+
if adapl:
|
392
|
+
to_print.append(' <clip_vector_left>')
|
393
|
+
to_print.append(str(adapl))
|
394
|
+
to_print.append('</clip_vector_left>\n')
|
395
|
+
if adapr:
|
396
|
+
to_print.append(' <clip_vector_right>')
|
397
|
+
to_print.append(str(adapr))
|
398
|
+
to_print.append('</clip_vector_right>\n')
|
399
|
+
return ''.join(to_print)
|
400
|
+
|
401
|
+
|
402
|
+
def create_xml_for_unpaired_read(data, fname):
|
403
|
+
'''Given the data for one read it returns an str with the xml ancillary
|
404
|
+
data.'''
|
405
|
+
to_print = [create_basic_xml_info(data['name'],fname)]
|
406
|
+
#clippings in the XML only if we do not hard clip
|
407
|
+
if not config['clip']:
|
408
|
+
to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right']));
|
409
|
+
to_print.append(' </trace>\n')
|
410
|
+
return ''.join(to_print)
|
411
|
+
|
412
|
+
|
413
|
+
def format_as_fasta(name,seq,qual):
|
414
|
+
name_line = ''.join(('>', name,'\n'))
|
415
|
+
seqstring = ''.join((name_line, seq, '\n'))
|
416
|
+
qual_line = ' '.join([str(q) for q in qual])
|
417
|
+
qualstring = ''.join((name_line, qual_line, '\n'))
|
418
|
+
return seqstring, qualstring
|
419
|
+
|
420
|
+
def format_as_fastq(name,seq,qual):
|
421
|
+
qual_line = ''.join([chr(q+33) for q in qual])
|
422
|
+
#seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n'))
|
423
|
+
seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n'))
|
424
|
+
return seqstring
|
425
|
+
|
426
|
+
|
427
|
+
def get_read_data(data):
|
428
|
+
'''Given the data for one read it returns 2 strs with the fasta seq
|
429
|
+
and fasta qual.'''
|
430
|
+
#seq and qual
|
431
|
+
if config['mix_case']:
|
432
|
+
seq = sequence_case(data)
|
433
|
+
qual = data['quality_scores']
|
434
|
+
else :
|
435
|
+
seq = data['bases']
|
436
|
+
qual = data['quality_scores']
|
437
|
+
|
438
|
+
return seq, qual
|
439
|
+
|
440
|
+
def extract_read_info(data, fname):
|
441
|
+
'''Given the data for one read it returns 3 strs with the fasta seq, fasta
|
442
|
+
qual and xml ancillary data.'''
|
443
|
+
|
444
|
+
seq,qual = get_read_data(data)
|
445
|
+
seqstring, qualstring = format_as_fasta(data['name'],seq,qual)
|
446
|
+
|
447
|
+
#name_line = ''.join(('>', data['name'],'\n'))
|
448
|
+
#seq = ''.join((name_line, seq, '\n'))
|
449
|
+
#qual_line = ' '.join([str(q) for q in qual])
|
450
|
+
#qual = ''.join((name_line, qual_line, '\n'))
|
451
|
+
|
452
|
+
xmlstring = create_xml_for_unpaired_read(data, fname)
|
453
|
+
|
454
|
+
return seqstring, qualstring, xmlstring
|
455
|
+
|
456
|
+
def write_sequence(name,seq,qual,seq_fh,qual_fh):
|
457
|
+
'''Write sequence and quality FASTA and FASTA qual filehandles
|
458
|
+
(or into FASTQ and XML)
|
459
|
+
if sequence length is 0, don't write'''
|
460
|
+
|
461
|
+
if len(seq) == 0 : return
|
462
|
+
|
463
|
+
if qual_fh is None:
|
464
|
+
seq_fh.write(format_as_fastq(name,seq,qual))
|
465
|
+
else:
|
466
|
+
seqstring, qualstring = format_as_fasta(name,seq,qual)
|
467
|
+
seq_fh.write(seqstring)
|
468
|
+
qual_fh.write(qualstring)
|
469
|
+
return
|
470
|
+
|
471
|
+
def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
|
472
|
+
'''Writes an unpaired read into FASTA, FASTA qual and XML filehandles
|
473
|
+
(or into FASTQ and XML)
|
474
|
+
if sequence length is 0, don't write'''
|
475
|
+
|
476
|
+
seq,qual = get_read_data(data)
|
477
|
+
if len(seq) == 0 : return
|
478
|
+
|
479
|
+
write_sequence(data['name'],seq,qual,seq_fh,qual_fh)
|
480
|
+
|
481
|
+
anci = create_xml_for_unpaired_read(data, sff_fh.name)
|
482
|
+
if anci is not None:
|
483
|
+
xml_fh.write(anci)
|
484
|
+
return
|
485
|
+
|
486
|
+
|
487
|
+
def reverse_complement(seq):
|
488
|
+
'''Returns the reverse complement of a DNA sequence as string'''
|
489
|
+
|
490
|
+
compdict = {
|
491
|
+
'a': 't',
|
492
|
+
'c': 'g',
|
493
|
+
'g': 'c',
|
494
|
+
't': 'a',
|
495
|
+
'u': 't',
|
496
|
+
'm': 'k',
|
497
|
+
'r': 'y',
|
498
|
+
'w': 'w',
|
499
|
+
's': 's',
|
500
|
+
'y': 'r',
|
501
|
+
'k': 'm',
|
502
|
+
'v': 'b',
|
503
|
+
'h': 'd',
|
504
|
+
'd': 'h',
|
505
|
+
'b': 'v',
|
506
|
+
'x': 'x',
|
507
|
+
'n': 'n',
|
508
|
+
'A': 'T',
|
509
|
+
'C': 'G',
|
510
|
+
'G': 'C',
|
511
|
+
'T': 'A',
|
512
|
+
'U': 'T',
|
513
|
+
'M': 'K',
|
514
|
+
'R': 'Y',
|
515
|
+
'W': 'W',
|
516
|
+
'S': 'S',
|
517
|
+
'Y': 'R',
|
518
|
+
'K': 'M',
|
519
|
+
'V': 'B',
|
520
|
+
'H': 'D',
|
521
|
+
'D': 'H',
|
522
|
+
'B': 'V',
|
523
|
+
'X': 'X',
|
524
|
+
'N': 'N',
|
525
|
+
'*': '*'
|
526
|
+
}
|
527
|
+
|
528
|
+
complseq = ''.join([compdict[base] for base in seq])
|
529
|
+
# python hack to reverse a list/string/etc
|
530
|
+
complseq = complseq[::-1]
|
531
|
+
return complseq
|
532
|
+
|
533
|
+
|
534
|
+
def mask_sequence(seq, maskchar, fpos, tpos):
|
535
|
+
'''Given a sequence, mask it with maskchar starting at fpos (including) and
|
536
|
+
ending at tpos (excluding)
|
537
|
+
'''
|
538
|
+
|
539
|
+
if len(maskchar) > 1:
|
540
|
+
raise RuntimeError("Internal error: more than one character given to mask_sequence")
|
541
|
+
if fpos<0:
|
542
|
+
fpos = 0
|
543
|
+
if tpos > len(seq):
|
544
|
+
tpos = len(seq)
|
545
|
+
|
546
|
+
newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:]))
|
547
|
+
|
548
|
+
return newseq
|
549
|
+
|
550
|
+
|
551
|
+
def fragment_sequences(sequence, qualities, splitchar):
|
552
|
+
'''Works like split() on strings, except it does this on a sequence
|
553
|
+
and the corresponding list with quality values.
|
554
|
+
Returns a tuple for each fragment, each sublist has the fragment
|
555
|
+
sequence as first and the fragment qualities as second elemnt'''
|
556
|
+
|
557
|
+
# this is slow (due to zip and list appends... use an iterator over
|
558
|
+
# the sequence find find variations and splices on seq and qual
|
559
|
+
|
560
|
+
if len(sequence) != len(qualities):
|
561
|
+
print sequence, qualities
|
562
|
+
raise RuntimeError("Internal error: length of sequence and qualities don't match???")
|
563
|
+
|
564
|
+
retlist = ([])
|
565
|
+
if len(sequence) == 0:
|
566
|
+
return retlist
|
567
|
+
|
568
|
+
actseq = ([])
|
569
|
+
actqual = ([])
|
570
|
+
if sequence[0] != splitchar:
|
571
|
+
inseq = True
|
572
|
+
else:
|
573
|
+
inseq = False
|
574
|
+
for char,qual in zip(sequence,qualities):
|
575
|
+
if inseq:
|
576
|
+
if char != splitchar:
|
577
|
+
actseq.append(char)
|
578
|
+
actqual.append(qual)
|
579
|
+
else:
|
580
|
+
retlist.append((''.join(actseq), actqual))
|
581
|
+
actseq = ([])
|
582
|
+
actqual = ([])
|
583
|
+
inseq = False
|
584
|
+
else:
|
585
|
+
if char != splitchar:
|
586
|
+
inseq = True
|
587
|
+
actseq.append(char)
|
588
|
+
actqual.append(qual)
|
589
|
+
|
590
|
+
if inseq and len(actseq):
|
591
|
+
retlist.append((''.join(actseq), actqual))
|
592
|
+
|
593
|
+
return retlist
|
594
|
+
|
595
|
+
|
596
|
+
def calc_subseq_boundaries(maskedseq, maskchar):
|
597
|
+
'''E.g.:
|
598
|
+
........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
|
599
|
+
to
|
600
|
+
(0,8),(8,16),(16,26),(26,47),(47,56)
|
601
|
+
'''
|
602
|
+
|
603
|
+
blist = ([])
|
604
|
+
if len(maskedseq) == 0:
|
605
|
+
return blist
|
606
|
+
|
607
|
+
inmask = True
|
608
|
+
if maskedseq[0] != maskchar:
|
609
|
+
inmask = False
|
610
|
+
|
611
|
+
start = 0
|
612
|
+
for spos in range(len(maskedseq)):
|
613
|
+
if inmask and maskedseq[spos] != maskchar:
|
614
|
+
blist.append(([start,spos]))
|
615
|
+
start = spos
|
616
|
+
inmask = False
|
617
|
+
elif not inmask and maskedseq[spos] == maskchar:
|
618
|
+
blist.append(([start,spos]))
|
619
|
+
start = spos
|
620
|
+
inmask = True
|
621
|
+
|
622
|
+
blist.append(([start,spos+1]))
|
623
|
+
|
624
|
+
return blist
|
625
|
+
|
626
|
+
|
627
|
+
def correct_for_smallhits(maskedseq, maskchar, linkername):
|
628
|
+
'''If partial hits were found, take preventive measure: grow
|
629
|
+
the masked areas by 20 bases in each direction
|
630
|
+
Returns either unchanged "maskedseq" or a new sequence
|
631
|
+
with some more characters masked.
|
632
|
+
'''
|
633
|
+
global linkerlengths
|
634
|
+
|
635
|
+
CEBUG = 0
|
636
|
+
|
637
|
+
if CEBUG : print "correct_for_smallhits"
|
638
|
+
if CEBUG : print "Masked seq\n", maskedseq
|
639
|
+
if CEBUG : print "Linkername: ", linkername
|
640
|
+
|
641
|
+
if len(maskedseq) == 0:
|
642
|
+
return maskedseq
|
643
|
+
|
644
|
+
growl=40
|
645
|
+
growl2=growl/2
|
646
|
+
|
647
|
+
boundaries = calc_subseq_boundaries(maskedseq,maskchar)
|
648
|
+
if CEBUG : print "Boundaries: ", boundaries
|
649
|
+
|
650
|
+
foundpartial = False
|
651
|
+
for bounds in boundaries:
|
652
|
+
if CEBUG : print "\tbounds: ", bounds
|
653
|
+
left, right = bounds
|
654
|
+
if left != 0 and right != len(maskedseq):
|
655
|
+
if maskedseq[left] == maskchar:
|
656
|
+
# allow 10% discrepancy
|
657
|
+
# -linkerlengths[linkername]/10
|
658
|
+
# that's a kind of safety net if there are slight sequencing
|
659
|
+
# errors in the linker itself
|
660
|
+
if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
|
661
|
+
if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
|
662
|
+
foundpartial = True
|
663
|
+
|
664
|
+
if not foundpartial:
|
665
|
+
return maskedseq
|
666
|
+
|
667
|
+
# grow
|
668
|
+
newseq = ""
|
669
|
+
for bounds in boundaries:
|
670
|
+
if CEBUG : print "Bounds: ", bounds
|
671
|
+
left, right = bounds
|
672
|
+
if maskedseq[left] == maskchar:
|
673
|
+
newseq += maskedseq[left:right]
|
674
|
+
else:
|
675
|
+
clearstart = 0
|
676
|
+
if left > 0 :
|
677
|
+
clearstart = left+growl2
|
678
|
+
clearstop = len(maskedseq)
|
679
|
+
if right < len(maskedseq):
|
680
|
+
clearstop = right-growl2
|
681
|
+
|
682
|
+
if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop
|
683
|
+
|
684
|
+
if clearstop <= clearstart:
|
685
|
+
newseq += maskchar * (right-left)
|
686
|
+
else:
|
687
|
+
if clearstart != left:
|
688
|
+
newseq += maskchar * growl2
|
689
|
+
newseq += maskedseq[clearstart:clearstop]
|
690
|
+
if clearstop != right:
|
691
|
+
newseq += maskchar * growl2
|
692
|
+
|
693
|
+
#print "newseq\n",newseq
|
694
|
+
|
695
|
+
return newseq
|
696
|
+
|
697
|
+
|
698
|
+
def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
|
699
|
+
'''Splits a paired end read and writes sequences into FASTA, FASTA qual
|
700
|
+
and XML traceinfo file. Returns the number of sequences created.
|
701
|
+
|
702
|
+
As the linker sequence may be anywhere in the read, including the ends
|
703
|
+
and overlapping with bad quality sequence, we need to perform some
|
704
|
+
computing and eventually set new clip points.
|
705
|
+
|
706
|
+
If the resulting split yields only one sequence (because linker
|
707
|
+
was not present or overlapping with left or right clip), only one
|
708
|
+
sequence will be written with ".fn" appended to the name.
|
709
|
+
|
710
|
+
If the read can be split, two reads will be written. The side left of
|
711
|
+
the linker will be named ".r" and will be written in reverse complement
|
712
|
+
into the file to conform with what approximately all assemblers expect
|
713
|
+
when reading paired-end data: reads in forward direction in file. The side
|
714
|
+
right of the linker will be named ".f"
|
715
|
+
|
716
|
+
If SSAHA found partial linker (linker sequences < length of linker),
|
717
|
+
the sequences will get a "_pl" furthermore be cut back thoroughly.
|
718
|
+
|
719
|
+
If SSAHA found multiple occurences of the linker, the names will get an
|
720
|
+
additional "_mlc" within the name to show that there was "multiple
|
721
|
+
linker contamination".
|
722
|
+
|
723
|
+
For multiple or partial linker, the "good" parts of the reads are
|
724
|
+
stored with a ".part<number>" name, additionally they will not get
|
725
|
+
template information in the XML
|
726
|
+
'''
|
727
|
+
|
728
|
+
global ssahapematches
|
729
|
+
|
730
|
+
CEBUG = 0
|
731
|
+
|
732
|
+
maskchar = "#"
|
733
|
+
|
734
|
+
if CEBUG : print "Need to split: " + data['name']
|
735
|
+
|
736
|
+
numseqs = 0;
|
737
|
+
readname = data['name']
|
738
|
+
readlen = data['number_of_bases']
|
739
|
+
|
740
|
+
leftclip, rightclip = return_merged_clips(data)
|
741
|
+
seq, qual = get_read_data(data)
|
742
|
+
|
743
|
+
if CEBUG : print "Original read:\n",seq
|
744
|
+
|
745
|
+
maskedseq = seq
|
746
|
+
if leftclip > 0:
|
747
|
+
maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
|
748
|
+
if rightclip < len(maskedseq):
|
749
|
+
maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))
|
750
|
+
|
751
|
+
leftclip, rightclip = return_merged_clips(data)
|
752
|
+
readlen = data['number_of_bases']
|
753
|
+
|
754
|
+
if CEBUG : print "Readname:", readname
|
755
|
+
if CEBUG : print "Readlen:", readlen
|
756
|
+
if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
|
757
|
+
if CEBUG : print "matches:", ssahapematches[data['name']]
|
758
|
+
|
759
|
+
for match in ssahapematches[data['name']]:
|
760
|
+
score = int(match[0])
|
761
|
+
linkername = match[2]
|
762
|
+
leftreadhit = int(match[3])
|
763
|
+
rightreadhit = int(match[4])
|
764
|
+
#leftlinkerhit = int(match[5])
|
765
|
+
#rightlinkerhit = int(match[6])
|
766
|
+
#direction = match[7]
|
767
|
+
#hitlen = int(match[8])
|
768
|
+
#hitidentity = float(match[9])
|
769
|
+
|
770
|
+
if CEBUG : print match
|
771
|
+
if CEBUG : print "Match with score:", score
|
772
|
+
if CEBUG : print "Read before:\n", maskedseq
|
773
|
+
maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
|
774
|
+
if CEBUG : print "Masked seq:\n", maskedseq
|
775
|
+
|
776
|
+
correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)
|
777
|
+
|
778
|
+
if len(maskedseq) != len(correctedseq):
|
779
|
+
raise RuntimeError("Internal error: maskedseq != correctedseq")
|
780
|
+
|
781
|
+
partialhits = False
|
782
|
+
if correctedseq != maskedseq:
|
783
|
+
if CEBUG : print "Partial hits in", readname
|
784
|
+
if CEBUG : print "Original seq:\n", seq
|
785
|
+
if CEBUG : print "Masked seq:\n", maskedseq
|
786
|
+
if CEBUG : print "Corrected seq\n", correctedseq
|
787
|
+
partialhits = True
|
788
|
+
readname += "_pl"
|
789
|
+
maskedseq = correctedseq
|
790
|
+
|
791
|
+
fragments = fragment_sequences(maskedseq, qual, maskchar)
|
792
|
+
|
793
|
+
if CEBUG : print "Fragments (", len(fragments), "): ", fragments
|
794
|
+
|
795
|
+
mlcflag = False
|
796
|
+
#if len(ssahapematches[data['name']]) > 1:
|
797
|
+
# #print "Multi linker contamination"
|
798
|
+
# mlcflag = True
|
799
|
+
# readname += "_mlc"
|
800
|
+
|
801
|
+
if len(fragments) > 2:
|
802
|
+
if CEBUG : print "Multi linker contamination"
|
803
|
+
mlcflag = True
|
804
|
+
readname += "_mlc"
|
805
|
+
|
806
|
+
|
807
|
+
#print fragments
|
808
|
+
if mlcflag or partialhits:
|
809
|
+
fragcounter = 1
|
810
|
+
readname += ".part"
|
811
|
+
for frag in fragments:
|
812
|
+
actseq = frag[0]
|
813
|
+
if len(actseq) >= 20:
|
814
|
+
actqual = frag[1]
|
815
|
+
oname = readname + str(fragcounter)
|
816
|
+
#seq_fh.write(">"+oname+"\n")
|
817
|
+
#seq_fh.write(actseq+"\n")
|
818
|
+
#qual_fh.write(">"+oname+"\n")
|
819
|
+
#qual_fh.write(' '.join((str(q) for q in actqual)))
|
820
|
+
#qual_fh.write("\n")
|
821
|
+
write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
|
822
|
+
to_print = [create_basic_xml_info(oname,sff_fh.name)]
|
823
|
+
# No clipping in XML ... the multiple and partial fragments
|
824
|
+
# are clipped "hard"
|
825
|
+
# No template ID and trace_end: we don't know the
|
826
|
+
# orientation of the frahments. Even if it were
|
827
|
+
# only two, the fact we had multiple linkers
|
828
|
+
# says something went wrong, so simply do not
|
829
|
+
# write any paired-end information for all these fragments
|
830
|
+
to_print.append(' </trace>\n')
|
831
|
+
xml_fh.write(''.join(to_print))
|
832
|
+
numseqs += 1
|
833
|
+
fragcounter += 1
|
834
|
+
else:
|
835
|
+
if len(fragments) >2:
|
836
|
+
raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
|
837
|
+
# nothing will happen for 0 fragments
|
838
|
+
if len(fragments) == 1:
|
839
|
+
#print "Tada1"
|
840
|
+
boundaries = calc_subseq_boundaries(maskedseq,maskchar)
|
841
|
+
if len(boundaries) < 1 or len(boundaries) >3:
|
842
|
+
raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
|
843
|
+
if len(boundaries) == 3:
|
844
|
+
# case: mask char on both sides of sequence
|
845
|
+
#print "bounds3"
|
846
|
+
data['clip_adapter_left']=1+boundaries[0][1]
|
847
|
+
data['clip_adapter_right']=boundaries[2][0]
|
848
|
+
elif len(boundaries) == 2:
|
849
|
+
# case: mask char left or right of sequence
|
850
|
+
#print "bounds2",
|
851
|
+
if maskedseq[0] == maskchar :
|
852
|
+
# case: mask char left
|
853
|
+
#print "left"
|
854
|
+
data['clip_adapter_left']=1+boundaries[0][1]
|
855
|
+
else:
|
856
|
+
# case: mask char right
|
857
|
+
#print "right"
|
858
|
+
data['clip_adapter_right']=boundaries[1][0]
|
859
|
+
data['name'] = data['name'] + ".fn"
|
860
|
+
write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
|
861
|
+
numseqs = 1
|
862
|
+
elif len(fragments) == 2:
|
863
|
+
#print "Tada2"
|
864
|
+
oname = readname + ".r"
|
865
|
+
seq, qual = get_read_data(data)
|
866
|
+
|
867
|
+
startsearch = False
|
868
|
+
for spos in range(len(maskedseq)):
|
869
|
+
if maskedseq[spos] != maskchar:
|
870
|
+
startsearch = True;
|
871
|
+
else:
|
872
|
+
if startsearch:
|
873
|
+
break
|
874
|
+
|
875
|
+
#print "\nspos: ", spos
|
876
|
+
lseq=seq[:spos]
|
877
|
+
#print "lseq:", lseq
|
878
|
+
actseq = reverse_complement(lseq)
|
879
|
+
lreadlen = len(actseq)
|
880
|
+
lqual = qual[:spos];
|
881
|
+
# python hack to reverse a list/string/etc
|
882
|
+
lqual = lqual[::-1];
|
883
|
+
|
884
|
+
#seq_fh.write(">"+oname+"\n")
|
885
|
+
#seq_fh.write(actseq+"\n")
|
886
|
+
#qual_fh.write(">"+oname+"\n")
|
887
|
+
#qual_fh.write(' '.join((str(q) for q in lqual)))
|
888
|
+
#qual_fh.write("\n")
|
889
|
+
|
890
|
+
write_sequence(oname,actseq,lqual,seq_fh,qual_fh)
|
891
|
+
|
892
|
+
to_print = [create_basic_xml_info(oname,sff_fh.name)]
|
893
|
+
to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
|
894
|
+
to_print.append(' <template_id>')
|
895
|
+
to_print.append(readname)
|
896
|
+
to_print.append('</template_id>\n')
|
897
|
+
to_print.append(' <trace_end>r</trace_end>\n')
|
898
|
+
to_print.append(' </trace>\n')
|
899
|
+
xml_fh.write(''.join(to_print))
|
900
|
+
|
901
|
+
oname = readname + ".f"
|
902
|
+
startsearch = False
|
903
|
+
for spos in range(len(maskedseq)-1,-1,-1):
|
904
|
+
if maskedseq[spos] != maskchar:
|
905
|
+
startsearch = True;
|
906
|
+
else:
|
907
|
+
if startsearch:
|
908
|
+
break
|
909
|
+
|
910
|
+
actseq = seq[spos+1:]
|
911
|
+
actqual = qual[spos+1:];
|
912
|
+
|
913
|
+
#print "\nspos: ", spos
|
914
|
+
#print "rseq:", actseq
|
915
|
+
|
916
|
+
#seq_fh.write(">"+oname+"\n")
|
917
|
+
#seq_fh.write(actseq+"\n")
|
918
|
+
#qual_fh.write(">"+oname+"\n")
|
919
|
+
#qual_fh.write(' '.join((str(q) for q in actqual)))
|
920
|
+
#qual_fh.write("\n")
|
921
|
+
write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
|
922
|
+
|
923
|
+
rreadlen = len(actseq)
|
924
|
+
to_print = [create_basic_xml_info(oname,sff_fh.name)]
|
925
|
+
to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
|
926
|
+
to_print.append(' <template_id>')
|
927
|
+
to_print.append(readname)
|
928
|
+
to_print.append('</template_id>\n')
|
929
|
+
to_print.append(' <trace_end>f</trace_end>\n')
|
930
|
+
to_print.append(' </trace>\n')
|
931
|
+
xml_fh.write(''.join(to_print))
|
932
|
+
numseqs = 2
|
933
|
+
|
934
|
+
return numseqs
|
935
|
+
|
936
|
+
|
937
|
+
|
938
|
+
def extract_reads_from_sff(config, sff_files):
|
939
|
+
'''Given the configuration and the list of sff_files it writes the seqs,
|
940
|
+
qualities and ancillary data into the output file(s).
|
941
|
+
|
942
|
+
If file for paired-end linker was given, first extracts all sequences
|
943
|
+
of an SFF and searches these against the linker(s) with SSAHA2 to
|
944
|
+
create needed information to split reads.
|
945
|
+
'''
|
946
|
+
|
947
|
+
global ssahapematches
|
948
|
+
|
949
|
+
|
950
|
+
if len(sff_files) == 0 :
|
951
|
+
raise RuntimeError("No SFF file given?")
|
952
|
+
|
953
|
+
#we go through all input files
|
954
|
+
for sff_file in sff_files:
|
955
|
+
if not os.path.getsize(sff_file):
|
956
|
+
raise RuntimeError('Empty file? : ' + sff_file)
|
957
|
+
fh = open(sff_file, 'r')
|
958
|
+
fh.close()
|
959
|
+
|
960
|
+
openmode = 'w'
|
961
|
+
if config['append']:
|
962
|
+
openmode = 'a'
|
963
|
+
|
964
|
+
seq_fh = open(config['seq_fname'], openmode)
|
965
|
+
xml_fh = open(config['xml_fname'], openmode)
|
966
|
+
if config['want_fastq']:
|
967
|
+
qual_fh = None
|
968
|
+
try:
|
969
|
+
os.remove(config['qual_fname'])
|
970
|
+
except :
|
971
|
+
python_formattingwithoutbracesisdumb_dummy = 1
|
972
|
+
else:
|
973
|
+
qual_fh = open(config['qual_fname'], openmode)
|
974
|
+
|
975
|
+
if not config['append']:
|
976
|
+
xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
|
977
|
+
else:
|
978
|
+
remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")
|
979
|
+
|
980
|
+
#we go through all input files
|
981
|
+
for sff_file in sff_files:
|
982
|
+
print "Working on '" + sff_file + "':"
|
983
|
+
ssahapematches.clear()
|
984
|
+
|
985
|
+
seqcheckstore = ([])
|
986
|
+
|
987
|
+
debug = 0
|
988
|
+
|
989
|
+
if not debug and config['pelinker_fname']:
|
990
|
+
print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
|
991
|
+
sys.stdout.flush()
|
992
|
+
|
993
|
+
if 0 :
|
994
|
+
# for debugging
|
995
|
+
pid = os.getpid()
|
996
|
+
tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
|
997
|
+
tmpfasta_fh = open(tmpfasta_fname, 'w')
|
998
|
+
else:
|
999
|
+
tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
|
1000
|
+
suffix = '.fasta')
|
1001
|
+
|
1002
|
+
sff_fh = open(sff_file, 'rb')
|
1003
|
+
header_data = read_header(fileh=sff_fh)
|
1004
|
+
for seq_data in sequences(fileh=sff_fh, header=header_data):
|
1005
|
+
seq,qual = get_read_data(seq_data)
|
1006
|
+
seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
|
1007
|
+
tmpfasta_fh.write(seqstring)
|
1008
|
+
#seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
|
1009
|
+
#tmpfasta_fh.write(seq)
|
1010
|
+
print "done."
|
1011
|
+
tmpfasta_fh.seek(0)
|
1012
|
+
|
1013
|
+
if 0 :
|
1014
|
+
# for debugging
|
1015
|
+
tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
|
1016
|
+
tmpssaha_fh = open(tmpssaha_fname, 'w+')
|
1017
|
+
else:
|
1018
|
+
tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
|
1019
|
+
suffix = '.ssaha2')
|
1020
|
+
|
1021
|
+
launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
|
1022
|
+
tmpfasta_fh.close()
|
1023
|
+
|
1024
|
+
tmpssaha_fh.seek(0)
|
1025
|
+
read_ssaha_data(tmpssaha_fh)
|
1026
|
+
tmpssaha_fh.close()
|
1027
|
+
|
1028
|
+
if debug:
|
1029
|
+
tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
|
1030
|
+
read_ssaha_data(tmpssaha_fh)
|
1031
|
+
|
1032
|
+
print "Converting '" + sff_file + "' ... ",
|
1033
|
+
sys.stdout.flush()
|
1034
|
+
sff_fh = open(sff_file, 'rb')
|
1035
|
+
#read_header(infile)
|
1036
|
+
header_data = read_header(fileh=sff_fh)
|
1037
|
+
|
1038
|
+
#now convert all reads
|
1039
|
+
nseqs_sff = 0
|
1040
|
+
nseqs_out = 0
|
1041
|
+
for seq_data in sequences(fileh=sff_fh, header=header_data):
|
1042
|
+
nseqs_sff += 1
|
1043
|
+
|
1044
|
+
seq, qual = clip_read(seq_data)
|
1045
|
+
seqcheckstore.append(seq[0:50])
|
1046
|
+
|
1047
|
+
#if nseqs_sff >1000:
|
1048
|
+
# check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
|
1049
|
+
# sys.exit()
|
1050
|
+
|
1051
|
+
if ssahapematches.has_key(seq_data['name']):
|
1052
|
+
#print "Paired end:",seq_data['name']
|
1053
|
+
nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
|
1054
|
+
else:
|
1055
|
+
#print "Normal:",seq_data['name']
|
1056
|
+
if config['pelinker_fname']:
|
1057
|
+
seq_data['name'] = seq_data['name'] + ".fn"
|
1058
|
+
write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
|
1059
|
+
nseqs_out += 1
|
1060
|
+
print "done."
|
1061
|
+
print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
|
1062
|
+
sff_fh.close()
|
1063
|
+
|
1064
|
+
check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
|
1065
|
+
seqcheckstore = ([])
|
1066
|
+
|
1067
|
+
xml_fh.write('</trace_volume>\n')
|
1068
|
+
|
1069
|
+
xml_fh.close()
|
1070
|
+
seq_fh.close()
|
1071
|
+
if qual_fh is not None:
|
1072
|
+
qual_fh.close()
|
1073
|
+
|
1074
|
+
return
|
1075
|
+
|
1076
|
+
def check_for_dubious_startseq(seqcheckstore, sffname,seqdata):
|
1077
|
+
|
1078
|
+
global stern_warning
|
1079
|
+
|
1080
|
+
foundproblem = ""
|
1081
|
+
for checklen in range(1,len(seqcheckstore[0])):
|
1082
|
+
foundinloop = False
|
1083
|
+
seqdict = {}
|
1084
|
+
for seq in seqcheckstore:
|
1085
|
+
shortseq = seq[0:checklen]
|
1086
|
+
if shortseq in seqdict:
|
1087
|
+
seqdict[shortseq] += 1
|
1088
|
+
else:
|
1089
|
+
seqdict[shortseq] = 1
|
1090
|
+
|
1091
|
+
for shortseq, count in seqdict.items():
|
1092
|
+
if float(count)/len(seqcheckstore) >= 0.5:
|
1093
|
+
foundinloop = True
|
1094
|
+
stern_warning
|
1095
|
+
foundproblem = "\n"+"*" * 80
|
1096
|
+
foundproblem += "\nWARNING: "
|
1097
|
+
foundproblem += "weird sequences in file " + sffname + "\n\n"
|
1098
|
+
foundproblem += "After applying left clips, " + str(count) + " sequences (="
|
1099
|
+
foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore))
|
1100
|
+
foundproblem += "%) start with these bases:\n" + shortseq
|
1101
|
+
foundproblem += "\n\nThis does not look sane.\n\n"
|
1102
|
+
foundproblem += "Countermeasures you *probably* must take:\n"
|
1103
|
+
foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n"
|
1104
|
+
foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n"
|
1105
|
+
left,right = return_merged_clips(seqdata)
|
1106
|
+
foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n"
|
1107
|
+
foundproblem += "*" * 80 + "\n"
|
1108
|
+
if not foundinloop :
|
1109
|
+
break
|
1110
|
+
if len(foundproblem):
|
1111
|
+
print foundproblem
|
1112
|
+
|
1113
|
+
|
1114
|
+
def parse_extra_info(info):
|
1115
|
+
'''It parses the information that will go in the xml file.
|
1116
|
+
|
1117
|
+
There are two formats accepted for the extra information:
|
1118
|
+
key1:value1, key2:value2
|
1119
|
+
or:
|
1120
|
+
file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
|
1121
|
+
'''
|
1122
|
+
if not info:
|
1123
|
+
return info
|
1124
|
+
finfos = info.split(';') #information for each file
|
1125
|
+
data_for_files = {}
|
1126
|
+
for finfo in finfos:
|
1127
|
+
#we split the file name from the rest
|
1128
|
+
items = finfo.split('{')
|
1129
|
+
if len(items) == 1:
|
1130
|
+
fname = fake_sff_name
|
1131
|
+
info = items[0]
|
1132
|
+
else:
|
1133
|
+
fname = items[0]
|
1134
|
+
info = items[1]
|
1135
|
+
#now we get each key,value pair in the info
|
1136
|
+
info = info.replace('}', '')
|
1137
|
+
data = {}
|
1138
|
+
for item in info.split(','):
|
1139
|
+
key, value = item.strip().split(':')
|
1140
|
+
key = key.strip()
|
1141
|
+
value = value.strip()
|
1142
|
+
data[key] = value
|
1143
|
+
data_for_files[fname] = data
|
1144
|
+
return data_for_files
|
1145
|
+
|
1146
|
+
|
1147
|
+
def return_merged_clips(data):
|
1148
|
+
'''It returns the left and right positions to clip.'''
|
1149
|
+
def max(a, b):
|
1150
|
+
'''It returns the max of the two given numbers.
|
1151
|
+
|
1152
|
+
It won't take into account the zero values.
|
1153
|
+
'''
|
1154
|
+
if not a and not b:
|
1155
|
+
return None
|
1156
|
+
if not a:
|
1157
|
+
return b
|
1158
|
+
if not b:
|
1159
|
+
return a
|
1160
|
+
if a >= b:
|
1161
|
+
return a
|
1162
|
+
else:
|
1163
|
+
return b
|
1164
|
+
def min(a, b):
|
1165
|
+
'''It returns the min of the two given numbers.
|
1166
|
+
|
1167
|
+
It won't take into account the zero values.
|
1168
|
+
'''
|
1169
|
+
if not a and not b:
|
1170
|
+
return None
|
1171
|
+
if not a:
|
1172
|
+
return b
|
1173
|
+
if not b:
|
1174
|
+
return a
|
1175
|
+
if a <= b:
|
1176
|
+
return a
|
1177
|
+
else:
|
1178
|
+
return b
|
1179
|
+
left = max(data['clip_adapter_left'], data['clip_qual_left'])
|
1180
|
+
right = min(data['clip_adapter_right'], data['clip_qual_right'])
|
1181
|
+
#maybe both clips where zero
|
1182
|
+
if left is None:
|
1183
|
+
left = 1
|
1184
|
+
if right is None:
|
1185
|
+
right = data['number_of_bases']
|
1186
|
+
return left, right
|
1187
|
+
|
1188
|
+
def sequence_case(data):
|
1189
|
+
'''Given the data for one read it returns the seq with mixed case.
|
1190
|
+
|
1191
|
+
The regions to be clipped will be lower case and the rest upper case.
|
1192
|
+
'''
|
1193
|
+
left, right = return_merged_clips(data)
|
1194
|
+
seq = data['bases']
|
1195
|
+
new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower()))
|
1196
|
+
return new_seq
|
1197
|
+
|
1198
|
+
def clip_read(data):
|
1199
|
+
'''Given the data for one read it returns clipped seq and qual.'''
|
1200
|
+
|
1201
|
+
qual = data['quality_scores']
|
1202
|
+
left, right = return_merged_clips(data)
|
1203
|
+
seq = data['bases']
|
1204
|
+
qual = data['quality_scores']
|
1205
|
+
new_seq = seq[left-1:right]
|
1206
|
+
new_qual = qual[left-1:right]
|
1207
|
+
|
1208
|
+
return new_seq, new_qual
|
1209
|
+
|
1210
|
+
|
1211
|
+
|
1212
|
+
def tests_for_ssaha(linker_fname):
|
1213
|
+
'''Tests whether SSAHA2 can be successfully called.'''
|
1214
|
+
|
1215
|
+
try:
|
1216
|
+
print "Testing whether SSAHA2 is installed and can be launched ... ",
|
1217
|
+
sys.stdout.flush()
|
1218
|
+
fh = open('/dev/null', 'w')
|
1219
|
+
retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
|
1220
|
+
fh.close()
|
1221
|
+
print "ok."
|
1222
|
+
except :
|
1223
|
+
print "nope? Uh oh ...\n\n"
|
1224
|
+
raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
|
1225
|
+
|
1226
|
+
|
1227
|
+
def load_linker_sequences(linker_fname):
|
1228
|
+
'''Loads all linker sequences into memory, storing only the length
|
1229
|
+
of each linker.'''
|
1230
|
+
|
1231
|
+
global linkerlengths
|
1232
|
+
|
1233
|
+
if not os.path.getsize(linker_fname):
|
1234
|
+
raise RuntimeError("File empty? '" + linker_fname + "'")
|
1235
|
+
fh = open(linker_fname, 'r')
|
1236
|
+
linkerseqs = read_fasta(fh)
|
1237
|
+
if len(linkerseqs) == 0:
|
1238
|
+
raise RuntimeError(linker_fname + ": no sequence found?")
|
1239
|
+
for i in linkerseqs:
|
1240
|
+
if linkerlengths.has_key(i.name):
|
1241
|
+
raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
|
1242
|
+
linkerlengths[i.name] = len(i.sequence)
|
1243
|
+
fh.close()
|
1244
|
+
|
1245
|
+
|
1246
|
+
def launch_ssaha(linker_fname, query_fname, output_fh):
|
1247
|
+
'''Launches SSAHA2 on the linker and query file, string SSAHA2 output
|
1248
|
+
into the output filehandle'''
|
1249
|
+
|
1250
|
+
try:
|
1251
|
+
print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
|
1252
|
+
sys.stdout.flush()
|
1253
|
+
retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
|
1254
|
+
if retcode:
|
1255
|
+
raise RuntimeError('Ups.')
|
1256
|
+
else:
|
1257
|
+
print "ok."
|
1258
|
+
except:
|
1259
|
+
print "\n"
|
1260
|
+
raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
|
1261
|
+
|
1262
|
+
def read_ssaha_data(ssahadata_fh):
|
1263
|
+
'''Given file handle, reads file generated with SSAHA2 (with default
|
1264
|
+
output format) and stores all matches as list ssahapematches
|
1265
|
+
(ssaha paired-end matches) dictionary'''
|
1266
|
+
|
1267
|
+
global ssahapematches
|
1268
|
+
|
1269
|
+
print "Parsing SSAHA2 result file ... ",
|
1270
|
+
sys.stdout.flush()
|
1271
|
+
|
1272
|
+
for line in ssahadata_fh:
|
1273
|
+
if line.startswith('ALIGNMENT'):
|
1274
|
+
ml = line.split()
|
1275
|
+
if len(ml) != 12 :
|
1276
|
+
print "\n", line,
|
1277
|
+
raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
|
1278
|
+
if not ssahapematches.has_key(ml[2]) :
|
1279
|
+
ssahapematches[ml[2]] = ([])
|
1280
|
+
if ml[8] == 'F':
|
1281
|
+
#print line,
|
1282
|
+
|
1283
|
+
# store everything except the first element (output
|
1284
|
+
# format name (ALIGNMENT)) and the last element
|
1285
|
+
# (length)
|
1286
|
+
ssahapematches[ml[2]].append(ml[1:-1])
|
1287
|
+
else:
|
1288
|
+
#print ml
|
1289
|
+
ml[4],ml[5] = ml[5],ml[4]
|
1290
|
+
#print ml
|
1291
|
+
ssahapematches[ml[2]].append(ml[1:-1])
|
1292
|
+
|
1293
|
+
print "done."
|
1294
|
+
|
1295
|
+
|
1296
|
+
##########################################################################
|
1297
|
+
#
|
1298
|
+
# BaCh: This block was shamelessly copied from
|
1299
|
+
# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
|
1300
|
+
# and then subsequently modified to read fasta correctly
|
1301
|
+
# It's still not fool proof, but should be good enough
|
1302
|
+
#
|
1303
|
+
##########################################################################
|
1304
|
+
|
1305
|
+
class Fasta:
|
1306
|
+
def __init__(self, name, sequence):
|
1307
|
+
self.name = name
|
1308
|
+
self.sequence = sequence
|
1309
|
+
|
1310
|
+
def read_fasta(file):
|
1311
|
+
items = []
|
1312
|
+
aninstance = Fasta('', '')
|
1313
|
+
linenum = 0
|
1314
|
+
for line in file:
|
1315
|
+
linenum += 1
|
1316
|
+
if line.startswith(">"):
|
1317
|
+
if len(aninstance.sequence):
|
1318
|
+
items.append(aninstance)
|
1319
|
+
aninstance = Fasta('', '')
|
1320
|
+
# name == all characters until the first whitespace
|
1321
|
+
# (split()[0]) but without the starting ">" ([1:])
|
1322
|
+
aninstance.name = line.split()[0][1:]
|
1323
|
+
aninstance.sequence = ''
|
1324
|
+
if len(aninstance.name) == 0:
|
1325
|
+
raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?')
|
1326
|
+
|
1327
|
+
else:
|
1328
|
+
if len(aninstance.name) == 0:
|
1329
|
+
raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?')
|
1330
|
+
aninstance.sequence += line.strip()
|
1331
|
+
|
1332
|
+
if len(aninstance.name) and len(aninstance.sequence):
|
1333
|
+
items.append(aninstance)
|
1334
|
+
|
1335
|
+
return items
|
1336
|
+
##########################################################################
|
1337
|
+
|
1338
|
+
def version_string ():
|
1339
|
+
return "sff_extract " + __version__
|
1340
|
+
|
1341
|
+
def read_config():
|
1342
|
+
'''It reads the configuration options from the command line arguments and
|
1343
|
+
it returns a dict with them.'''
|
1344
|
+
from optparse import OptionParser, OptionGroup
|
1345
|
+
usage = "usage: %prog [options] sff1 sff2 ..."
|
1346
|
+
desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
|
1347
|
+
" and XML traceinfo format. When a paired-end linker sequence"\
|
1348
|
+
" is given (-l), use SSAHA2 to scan the sequences for the linker,"\
|
1349
|
+
" then split the sequences, removing the linker."
|
1350
|
+
parser = OptionParser(usage = usage, version = version_string(), description = desc)
|
1351
|
+
parser.add_option('-a', '--append', action="store_true", dest='append',
|
1352
|
+
help='append output to existing files', default=False)
|
1353
|
+
parser.add_option('-i', '--xml_info', dest='xml_info',
|
1354
|
+
help='extra info to write in the xml file')
|
1355
|
+
parser.add_option("-l", "--linker_file", dest="pelinker_fname",
|
1356
|
+
help="FASTA file with paired-end linker sequences", metavar="FILE")
|
1357
|
+
|
1358
|
+
group = OptionGroup(parser, "File name options","")
|
1359
|
+
group.add_option('-c', '--clip', action="store_true", dest='clip',
|
1360
|
+
help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
|
1361
|
+
group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
|
1362
|
+
help='all bases in upper case, including clipped ends', default=True)
|
1363
|
+
group.add_option('', '--min_left_clip', dest='min_leftclip',
|
1364
|
+
metavar="INTEGER", type = "int",
|
1365
|
+
help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
|
1366
|
+
group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
|
1367
|
+
help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
|
1368
|
+
parser.add_option_group(group)
|
1369
|
+
|
1370
|
+
group = OptionGroup(parser, "File name options","")
|
1371
|
+
group.add_option("-o", "--out_basename", dest="basename",
|
1372
|
+
help="base name for all output files")
|
1373
|
+
group.add_option("-s", "--seq_file", dest="seq_fname",
|
1374
|
+
help="output sequence file name", metavar="FILE")
|
1375
|
+
group.add_option("-q", "--qual_file", dest="qual_fname",
|
1376
|
+
help="output quality file name", metavar="FILE")
|
1377
|
+
group.add_option("-x", "--xml_file", dest="xml_fname",
|
1378
|
+
help="output ancillary xml file name", metavar="FILE")
|
1379
|
+
parser.add_option_group(group)
|
1380
|
+
|
1381
|
+
#default fnames
|
1382
|
+
#is there an sff file?
|
1383
|
+
basename = 'reads'
|
1384
|
+
if sys.argv[-1][-4:].lower() == '.sff':
|
1385
|
+
basename = sys.argv[-1][:-4]
|
1386
|
+
def_seq_fname = basename + '.fasta'
|
1387
|
+
def_qual_fname = basename + '.fasta.qual'
|
1388
|
+
def_xml_fname = basename + '.xml'
|
1389
|
+
def_pelinker_fname = ''
|
1390
|
+
parser.set_defaults(seq_fname = def_seq_fname)
|
1391
|
+
parser.set_defaults(qual_fname = def_qual_fname)
|
1392
|
+
parser.set_defaults(xml_fname = def_xml_fname)
|
1393
|
+
parser.set_defaults(pelinker_fname = def_pelinker_fname)
|
1394
|
+
|
1395
|
+
#we parse the cmd line
|
1396
|
+
(options, args) = parser.parse_args()
|
1397
|
+
|
1398
|
+
#we put the result in a dict
|
1399
|
+
global config
|
1400
|
+
config = {}
|
1401
|
+
for property in dir(options):
|
1402
|
+
if property[0] == '_' or property in ('ensure_value', 'read_file',
|
1403
|
+
'read_module'):
|
1404
|
+
continue
|
1405
|
+
config[property] = getattr(options, property)
|
1406
|
+
|
1407
|
+
if config['basename'] is None:
|
1408
|
+
config['basename']=basename
|
1409
|
+
|
1410
|
+
#if we have not set a file name with -s, -q or -x we set the basename
|
1411
|
+
#based file name
|
1412
|
+
if config['want_fastq']:
|
1413
|
+
config['qual_fname'] = ''
|
1414
|
+
if config['seq_fname'] == def_seq_fname:
|
1415
|
+
config['seq_fname'] = config['basename'] + '.fastq'
|
1416
|
+
else:
|
1417
|
+
if config['seq_fname'] == def_seq_fname:
|
1418
|
+
config['seq_fname'] = config['basename'] + '.fasta'
|
1419
|
+
if config['qual_fname'] == def_qual_fname:
|
1420
|
+
config['qual_fname'] = config['basename'] + '.fasta.qual'
|
1421
|
+
|
1422
|
+
if config['xml_fname'] == def_xml_fname:
|
1423
|
+
config['xml_fname'] = config['basename'] + '.xml'
|
1424
|
+
|
1425
|
+
#we parse the extra info for the xml file
|
1426
|
+
config['xml_info'] = parse_extra_info(config['xml_info'])
|
1427
|
+
return config, args
|
1428
|
+
|
1429
|
+
|
1430
|
+
|
1431
|
+
##########################################################################
|
1432
|
+
|
1433
|
+
|
1434
|
+
def testsome():
|
1435
|
+
sys.exit()
|
1436
|
+
return
|
1437
|
+
|
1438
|
+
|
1439
|
+
def debug():
|
1440
|
+
try:
|
1441
|
+
dummy = 1
|
1442
|
+
#debug()
|
1443
|
+
#testsome()
|
1444
|
+
|
1445
|
+
config, args = read_config()
|
1446
|
+
load_linker_sequences(config['pelinker_fname'])
|
1447
|
+
|
1448
|
+
#pid = os.getpid()
|
1449
|
+
pid = 15603
|
1450
|
+
|
1451
|
+
#tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
|
1452
|
+
#tmpfasta_fh = open(tmpfasta_fname, 'w')
|
1453
|
+
tmpfasta_fname = 'FLVI58L05.fa'
|
1454
|
+
tmpfasta_fh = open(tmpfasta_fname, 'r')
|
1455
|
+
|
1456
|
+
tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
|
1457
|
+
tmpssaha_fh = open(tmpssaha_fname, 'w')
|
1458
|
+
|
1459
|
+
launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
|
1460
|
+
|
1461
|
+
tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
|
1462
|
+
read_ssaha_data(tmpssaha_fh)
|
1463
|
+
|
1464
|
+
sys.exit()
|
1465
|
+
|
1466
|
+
extract_reads_from_sff(config, args)
|
1467
|
+
|
1468
|
+
except (OSError, IOError, RuntimeError), errval:
|
1469
|
+
print errval
|
1470
|
+
sys.exit()
|
1471
|
+
|
1472
|
+
sys.exit()
|
1473
|
+
|
1474
|
+
|
1475
|
+
def main():
|
1476
|
+
|
1477
|
+
argv = sys.argv
|
1478
|
+
if len(argv) == 1:
|
1479
|
+
sys.argv.append('-h')
|
1480
|
+
read_config()
|
1481
|
+
sys.exit()
|
1482
|
+
try:
|
1483
|
+
#debug();
|
1484
|
+
|
1485
|
+
config, args = read_config()
|
1486
|
+
|
1487
|
+
if config['pelinker_fname']:
|
1488
|
+
#tests_for_ssaha(config['pelinker_fname'])
|
1489
|
+
load_linker_sequences(config['pelinker_fname'])
|
1490
|
+
if len(args) == 0:
|
1491
|
+
raise RuntimeError("No SFF file given?")
|
1492
|
+
extract_reads_from_sff(config, args)
|
1493
|
+
except (OSError, IOError, RuntimeError), errval:
|
1494
|
+
print errval
|
1495
|
+
return 1
|
1496
|
+
|
1497
|
+
if stern_warning:
|
1498
|
+
return 1
|
1499
|
+
|
1500
|
+
return 0
|
1501
|
+
|
1502
|
+
|
1503
|
+
|
1504
|
+
if __name__ == "__main__":
|
1505
|
+
sys.exit(main())
|