bio-ngs 0.3.2.alpha.01

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +39 -0
  3. data/Gemfile.lock +81 -0
  4. data/LICENSE.txt +28 -0
  5. data/README.rdoc +240 -0
  6. data/Rakefile +60 -0
  7. data/VERSION +1 -0
  8. data/bin/biongs +35 -0
  9. data/bio-ngs.gemspec +215 -0
  10. data/ext/mkrf_conf.rb +87 -0
  11. data/lib/bio-ngs.rb +54 -0
  12. data/lib/bio/appl/ngs/bcl2qseq.rb +93 -0
  13. data/lib/bio/appl/ngs/blast.rb +36 -0
  14. data/lib/bio/appl/ngs/bowtie-inspect.rb +50 -0
  15. data/lib/bio/appl/ngs/cufflinks.rb +489 -0
  16. data/lib/bio/appl/ngs/fastx.rb +170 -0
  17. data/lib/bio/appl/ngs/samtools.rb +118 -0
  18. data/lib/bio/appl/ngs/sff_extract.rb +23 -0
  19. data/lib/bio/appl/ngs/tophat.rb +158 -0
  20. data/lib/bio/ngs/converter.rb +100 -0
  21. data/lib/bio/ngs/core_ext.rb +12 -0
  22. data/lib/bio/ngs/db.rb +66 -0
  23. data/lib/bio/ngs/db/migrate/homology/201105030707_create_blastout.rb +22 -0
  24. data/lib/bio/ngs/db/migrate/homology/201105030709_create_goannotation.rb +29 -0
  25. data/lib/bio/ngs/db/migrate/ontology/201105030708_create_go.rb +18 -0
  26. data/lib/bio/ngs/db/migrate/ontology/201105030710_create_gene_go.rb +17 -0
  27. data/lib/bio/ngs/db/migrate/ontology/201105030711_create_gene.rb +16 -0
  28. data/lib/bio/ngs/db/models.rb +1 -0
  29. data/lib/bio/ngs/db/models/homology.rb +8 -0
  30. data/lib/bio/ngs/db/models/ontology.rb +16 -0
  31. data/lib/bio/ngs/ext/bin/common/fastq_coverage_graph.sh +161 -0
  32. data/lib/bio/ngs/ext/bin/common/sff_extract +1505 -0
  33. data/lib/bio/ngs/ext/bin/linux/samtools +0 -0
  34. data/lib/bio/ngs/ext/bin/osx/samtools +0 -0
  35. data/lib/bio/ngs/ext/versions.yaml +73 -0
  36. data/lib/bio/ngs/graphics.rb +189 -0
  37. data/lib/bio/ngs/homology.rb +102 -0
  38. data/lib/bio/ngs/ontology.rb +103 -0
  39. data/lib/bio/ngs/quality.rb +64 -0
  40. data/lib/bio/ngs/record.rb +50 -0
  41. data/lib/bio/ngs/task.rb +46 -0
  42. data/lib/bio/ngs/utils.rb +176 -0
  43. data/lib/development_tasks.rb +34 -0
  44. data/lib/enumerable.rb +37 -0
  45. data/lib/tasks/bwa.thor +126 -0
  46. data/lib/tasks/convert.thor +454 -0
  47. data/lib/tasks/history.thor +51 -0
  48. data/lib/tasks/homology.thor +121 -0
  49. data/lib/tasks/ontology.thor +93 -0
  50. data/lib/tasks/project.thor +51 -0
  51. data/lib/tasks/quality.thor +142 -0
  52. data/lib/tasks/rna.thor +126 -0
  53. data/lib/tasks/sff_extract.thor +9 -0
  54. data/lib/templates/README.tt +43 -0
  55. data/lib/templates/db.tt +6 -0
  56. data/lib/wrapper.rb +225 -0
  57. data/spec/converter_qseq_spec.rb +56 -0
  58. data/spec/fixture/s_1_1_1108_qseq.txt +100 -0
  59. data/spec/quality_spec.rb +40 -0
  60. data/spec/sff_extract_spec.rb +98 -0
  61. data/spec/spec_helper.rb +55 -0
  62. data/spec/tophat_spec.rb +99 -0
  63. data/spec/utils_spec.rb +22 -0
  64. data/test/conf/test_db.yml +4 -0
  65. data/test/data/blastoutput.xml +69 -0
  66. data/test/data/gene-GO.json +1 -0
  67. data/test/data/goa_uniprot +27 -0
  68. data/test/data/goslim_goa.obo +1763 -0
  69. data/test/helper.rb +18 -0
  70. data/test/test_bio-ngs.rb +17 -0
  71. data/test/test_db.rb +21 -0
  72. data/test/test_homology.rb +102 -0
  73. data/test/test_ngs.rb +21 -0
  74. data/test/test_ontology.rb +74 -0
  75. data/test/test_utils.rb +29 -0
  76. metadata +460 -0
@@ -0,0 +1,1505 @@
1
+ #!/usr/bin/python
2
+ '''This software extracts the seq, qual and ancillary information from an sff
3
+ file, like the ones used by the 454 sequencer.
4
+
5
+ Optionally, it can also split paired-end reads if given the linker sequence.
6
+ The splitting is done with maximum match, i.e., every occurrence of the linker
7
+ sequence will be removed, even if occurring multiple times.'''
8
+
9
+ #copyright Jose Blanca and Bastien Chevreux
10
+ #COMAV institute, Universidad Politecnica de Valencia (UPV)
11
+ #Valencia, Spain
12
+
13
+ # additions to handle paired end reads by Bastien Chevreux
14
+ # bugfixes for linker specific lengths: Lionel Guy
15
+
16
+ #This program is free software: you can redistribute it and/or modify
17
+ #it under the terms of the GNU General Public License as published by
18
+ #the Free Software Foundation, either version 3 of the License, or
19
+ #(at your option) any later version.
20
+ #This program is distributed in the hope that it will be useful,
21
+ #but WITHOUT ANY WARRANTY; without even the implied warranty of
22
+ #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
+ #GNU General Public License for more details.
24
+ #You should have received a copy of the GNU General Public License
25
+ #along with this program. If not, see <http://www.gnu.org/licenses/>.
26
+
27
+ __author__ = 'Jose Blanca and Bastien Chevreux'
28
+ __copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
29
+ __license__ = 'GPLv3 or later'
30
+ __version__ = '0.2.8'
31
+ __email__ = 'jblanca@btc.upv.es'
32
+ __status__ = 'beta'
33
+
34
+ import struct
35
+ import sys
36
+ import os
37
+ import subprocess
38
+ import tempfile
39
+
40
+
41
# Placeholder key used to register XML info that applies when no real sff
# file name was given on the command line (see create_basic_xml_info()).
fake_sff_name = 'fake_sff_name'


# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}

# set to true if something really fishy is going on with the sequences
# NOTE(review): initialised to True although the comment above suggests it
# should start as False and only be raised when a problem is found — confirm.
stern_warning = True
51
+
52
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
                      byte_padding=None):
    '''Read a chunk of a binary file according to a struct definition.

    struct_def is a sequence of (name, format) pairs, fileh an open file
    object and offset the position where reading starts.  An optional
    dict may be handed in and will be populated with the decoded values.
    When byte_padding is given, the reported byte count is rounded up to
    the next multiple of it (the file pad is skipped by the caller).
    Returns a tuple (bytes_read, data_dict).
    '''
    if data is None:
        data = {}

    consumed = 0
    for name, fmt in struct_def:
        # position the file at the start of this item and decode it
        fileh.seek(offset + consumed)
        item_size = struct.calcsize(fmt)
        raw = fileh.read(item_size)
        values = struct.unpack('>' + fmt, raw)
        # single values are stored as scalars, not as 1-tuples
        data[name] = values[0] if len(values) == 1 else values
        consumed += item_size

    # round the consumed byte count up to a multiple of the padding
    if byte_padding is not None:
        pad = byte_padding
        consumed = ((consumed + pad - 1) // pad) * pad

    return (consumed, data)
87
+
88
+
89
def check_magic(magic):
    '''Raise a RuntimeError unless magic is the sff magic number.'''
    sff_magic = 779314790  # '.sff' interpreted as a big-endian uint32
    if magic == sff_magic:
        return
    raise RuntimeError('This file does not seems to be an sff file.')
93
+
94
def check_version(version):
    '''Check that the sff version is supported, else raise a RuntimeError.

    version is the 4-item sequence of chars unpacked from the common
    header; only version 1 ('\\x00\\x00\\x00\\x01') is supported.
    '''
    supported = ('\x00', '\x00', '\x00', '\x01')
    # Compare the whole version at once.  The previous element-wise loop
    # iterated over `version` while indexing `supported`, so a malformed
    # header longer than 4 items raised an IndexError instead of the
    # intended error message.
    if tuple(version) != supported:
        raise RuntimeError('SFF version not supported. Please contact the author of the software.')
102
+
103
def read_header(fileh):
    '''Read the sff common header and return its fields as a dict.'''
    # fixed-size leading part of the header
    fixed_part = [
        ('magic_number', 'I'),
        ('version', 'cccc'),
        ('index_offset', 'Q'),
        ('index_length', 'I'),
        ('number_of_reads', 'I'),
        ('header_length', 'H'),
        ('key_length', 'H'),
        ('number_of_flows_per_read', 'H'),
        ('flowgram_format_code', 'B'),
    ]
    data = {}
    first_bytes, data = read_bin_fragment(struct_def=fixed_part,
                                          fileh=fileh, offset=0, data=data)
    check_magic(data['magic_number'])
    check_version(data['version'])
    # the sizes of the two variable-length fields are now known, so the
    # rest of the header can be decoded
    variable_part = [
        ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
        ('key_sequence', str(data['key_length']) + 'c'),
    ]
    read_bin_fragment(struct_def=variable_part, fileh=fileh,
                      offset=first_bytes, data=data)
    return data
132
+
133
+
134
def read_sequence(header, fileh, fposition):
    '''Read one read record from the sff file at fposition.

    header is the dict returned by read_header().  Returns a tuple
    (bytes_consumed, data_dict) where data_dict holds name, bases,
    quality scores, flowgram values and the four clip points.
    '''
    header_length = header['header_length']
    index_offset = header['index_offset']
    index_length = header['index_length']

    # fixed-size part of the per-read header
    read_header_1 = [
        ('read_header_length', 'H'),
        ('name_length', 'H'),
        ('number_of_bases', 'I'),
        ('clip_qual_left', 'H'),
        ('clip_qual_right', 'H'),
        ('clip_adapter_left', 'H'),
        ('clip_adapter_right', 'H'),
    ]
    def read_header_2(name_length):
        '''It returns the struct definition for the second part of the header.'''
        return [('name', str(name_length) +'c')]
    def read_data(number_of_bases):
        '''It returns the struct definition for the read data section.'''
        if header['flowgram_format_code'] == 1:
            flow_type = 'H'
        else:
            # NOTE(review): `Error` is not defined anywhere visible in this
            # module, so this line would raise a NameError rather than the
            # intended exception — probably meant RuntimeError; confirm.
            raise Error('file version not supported')
        number_of_bases = str(number_of_bases)
        return [
            ('flowgram_values', str(header['number_of_flows_per_read']) +
                 flow_type),
            ('flow_index_per_base', number_of_bases + 'B'),
            ('bases', number_of_bases + 'c'),
            ('quality_scores', number_of_bases + 'B'),
        ]

    data = {}
    # first (fixed-size) part of the read header
    bytes_read, data = read_bin_fragment(struct_def=read_header_1,
                                    fileh=fileh, offset=fposition, data=data)

    read_bin_fragment(struct_def=read_header_2(data['name_length']),
                      fileh=fileh, offset=fposition + bytes_read, data=data)
    # the name arrives as a tuple of single chars; join it into a string
    data['name'] = ''.join(data['name'])
    offset = data['read_header_length']
    # read the sequence, flowgram and quality data (8-byte padded section)
    read_data_st = read_data(data['number_of_bases'])
    bytes_read, data = read_bin_fragment(struct_def=read_data_st,
                                    fileh=fileh, offset=fposition + offset,
                                    data=data, byte_padding=8)
    # the bases also arrive as a tuple of chars
    data['bases'] = ''.join(data['bases'])

    # correct for the case the right clip is <= than the left clip:
    # both are reset to 0 (right clip == 0 means "whole sequence")
    if data['clip_qual_right'] <= data['clip_qual_left'] :
        data['clip_qual_right'] = 0
        data['clip_qual_left'] = 0
    if data['clip_adapter_right'] <= data['clip_adapter_left'] :
        data['clip_adapter_right'] = 0
        data['clip_adapter_left'] = 0

    # the clipping section follows the NCBI Trace Archive RFC
    # (http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc):
    # if there's no adapter clip, the quality clip is reported as the
    # vector clip; otherwise qual -> qual and adapter -> vector
    if not data['clip_adapter_left']:
        data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
    if not data['clip_adapter_right']:
        data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']

    # see whether we have to override the minimum left clips
    if config['min_leftclip'] > 0:
        if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
            data['clip_adapter_left'] = config['min_leftclip']
        if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
            data['clip_qual_left'] = config['min_leftclip']

    # for handling the -c (clip) option gently, we already clip here
    # and set all clip points to the sequence end points
    if config['clip']:
        data['bases'], data['quality_scores'] = clip_read(data)

        data['number_of_bases']=len(data['bases'])
        data['clip_qual_right'] = data['number_of_bases']
        data['clip_adapter_right'] = data['number_of_bases']
        data['clip_qual_left'] = 0
        data['clip_adapter_left'] = 0

    return data['read_header_length'] + bytes_read, data
241
+
242
+
243
def sequences(fileh, header):
    '''Generator yielding the data dict of every read in the sff file.

    header is the dict returned by read_header().  The index section
    (whose offset and length are given in the header) is skipped when it
    is encountered between reads.
    '''
    fposition = header['header_length']     # position in the file
    reads_read = 0
    while True:
        if fposition == header['index_offset']:
            # we have to skip the index section.
            # Bug fix: the original referenced a bare `index_length`,
            # which is not defined in this scope and raised a NameError
            # whenever the index was located between reads.
            fposition += header['index_length']
            continue
        else:
            bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
                                                 fposition=fposition)
            yield seq_data
            fposition += bytes_read
            reads_read += 1
            if reads_read >= header['number_of_reads']:
                break
261
+
262
+
263
def remove_last_xmltag_in_file(fname, tag=None):
    '''Remove the last xml tag of the file if it matches the given tag.

    Removal is performed via file truncation, so the resulting file is
    no longer valid xml.  This is a deliberate hack that allows records
    to be appended to xml files in a quick and dirty way.

    If the given tag is not the last one in the file, a RuntimeError is
    raised.  Returns the name of the removed tag.
    '''

    fh = open(fname, 'r+')
    #we have to read from the end to the start of the file and keep the
    #string enclosed by </ >
    i = -1                  # negative offset from the end of the file
    last_tag = []           # the chars that form the last tag
    start_offset = None     # in which byte does the last tag start?
    end_offset = None       # in which byte does the last tag end?
    while True:
        # seek relative to the end of the file (whence=2) and read one char
        fh.seek(i, 2)
        char = fh.read(1)
        if not char.isspace():
            last_tag.append(char)
        if char == '>':
            end_offset = i
        if char == '<':
            start_offset = i
            break
        i -= 1

    #we have read the last tag backwards
    last_tag = ''.join(last_tag[::-1])
    #we remove the </ and >
    last_tag = last_tag.rstrip('>').lstrip('</')

    #we check that we're removing the asked tag
    if tag is not None and tag != last_tag:
        raise RuntimeError("The given xml tag wasn't the last one in the file")

    # while we are at it: also remove all white spaces in that line :-)
    i -= 1
    while True:
        fh.seek(i, 2)
        char = fh.read(1)
        if not char == ' ' and not char == '\t':
            break;
        if fh.tell() == 1:
            # reached the beginning of the file; stop scanning
            break;
        i -= 1

    # truncate at the current file position, cutting off the tag and the
    # trailing whitespace that preceded it
    fh.truncate();

    fh.close()
    return last_tag
318
+
319
+
320
def create_basic_xml_info(readname, fname):
    '''Return the opening XML trace info for one read.

    Contains the trace name plus any extra tags configured on the
    command line for this sff file (or registered under the fake
    catch-all name when no file name was given).
    '''
    parts = [' <trace>\n',
             ' <trace_name>', readname, '</trace_name>\n']

    # look up extra info configured for this particular file, falling
    # back to the entry registered without a file name
    info = None
    if config['xml_info']:
        if fname in config['xml_info']:
            info = config['xml_info'][fname]
        else:
            try:
                info = config['xml_info'][fake_sff_name]
            except KeyError:
                pass

    # emit one element per configured key/value pair
    if info:
        for key in info:
            parts.append(' <' + key + '>' + info[key] + '</' + key + '>\n')

    return ''.join(parts)
349
+
350
+
351
def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
    '''Format the four clip points of a read as XML elements.

    Values that make no sense after the simplified calculations done
    earlier (especially during paired-end splitting) are normalised
    first: right clips at or beyond the read length mean "whole
    sequence" and become 0, and negative values are raised to 0.  Only
    non-zero clip points are emitted.
    '''
    # right borders >= read length carry no information
    if adapr >= readlen:
        adapr = 0
    if qualr >= readlen:
        qualr = 0

    # BaCh: when called via split_paired_end(), some values may be < 0
    # (when clip values were 0 previously); clamp them all to 0 here
    # instead of scattering corrective if-clauses over the callers
    adapl = max(adapl, 0)
    adapr = max(adapr, 0)
    quall = max(quall, 0)
    qualr = max(qualr, 0)

    chunks = []
    if quall:
        chunks.append(' <clip_quality_left>%s</clip_quality_left>\n' % quall)
    if qualr:
        chunks.append(' <clip_quality_right>%s</clip_quality_right>\n' % qualr)
    if adapl:
        chunks.append(' <clip_vector_left>%s</clip_vector_left>\n' % adapl)
    if adapr:
        chunks.append(' <clip_vector_right>%s</clip_vector_right>\n' % adapr)
    return ''.join(chunks)
400
+
401
+
402
def create_xml_for_unpaired_read(data, fname):
    '''Return the complete XML trace block for a single unpaired read.'''
    pieces = [create_basic_xml_info(data['name'], fname)]
    # clip points go into the XML only when hard clipping is off; with
    # hard clipping the sequence itself was already trimmed
    if not config['clip']:
        pieces.append(create_clip_xml_info(data['number_of_bases'],
                                           data['clip_adapter_left'],
                                           data['clip_adapter_right'],
                                           data['clip_qual_left'],
                                           data['clip_qual_right']))
    pieces.append(' </trace>\n')
    return ''.join(pieces)
411
+
412
+
413
def format_as_fasta(name, seq, qual):
    '''Return (sequence, quality) strings in FASTA / FASTA-qual format.'''
    header = ''.join(('>', name, '\n'))
    quality_text = ' '.join(str(q) for q in qual)
    return header + seq + '\n', header + quality_text + '\n'
419
+
420
def format_as_fastq(name, seq, qual):
    '''Return the read as one FASTQ record (phred+33 quality encoding).'''
    encoded = ''.join(chr(q + 33) for q in qual)
    return '@%s\n%s\n+\n%s\n' % (name, seq, encoded)
425
+
426
+
427
def get_read_data(data):
    '''Return the (seq, qual) pair for one read.

    With the mix_case option enabled the sequence is re-cased to mark
    the clipped regions; otherwise the bases are returned as stored.
    '''
    if config['mix_case']:
        seq = sequence_case(data)
    else:
        seq = data['bases']
    # quality scores are returned unchanged in both cases
    return seq, data['quality_scores']
439
+
440
def extract_read_info(data, fname):
    '''Return the (fasta seq, fasta qual, xml) strings for one read.'''
    seq, qual = get_read_data(data)
    seqstring, qualstring = format_as_fasta(data['name'], seq, qual)
    xmlstring = create_xml_for_unpaired_read(data, fname)
    return seqstring, qualstring, xmlstring
455
+
456
def write_sequence(name, seq, qual, seq_fh, qual_fh):
    '''Write one sequence to the open output handles.

    Without a quality handle the read is written to seq_fh as FASTQ;
    with one it is written as FASTA plus FASTA-qual.  Zero-length
    sequences are silently dropped.
    '''
    if not seq:
        return

    if qual_fh is None:
        # FASTQ mode: sequence and quality share a single file
        seq_fh.write(format_as_fastq(name, seq, qual))
        return

    fasta, fasta_qual = format_as_fasta(name, seq, qual)
    seq_fh.write(fasta)
    qual_fh.write(fasta_qual)
    return
470
+
471
def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''Write one unpaired read plus its XML trace info to the handles.

    Reads with a zero-length sequence are skipped entirely.
    '''
    seq, qual = get_read_data(data)
    if not seq:
        return

    write_sequence(data['name'], seq, qual, seq_fh, qual_fh)

    xml_block = create_xml_for_unpaired_read(data, sff_fh.name)
    if xml_block is not None:
        xml_fh.write(xml_block)
    return
485
+
486
+
487
def reverse_complement(seq):
    '''Return the reverse complement of a DNA sequence as a string.'''
    # IUPAC-aware complement table; upper and lower case are kept apart
    # and '*' maps to itself
    forward  = 'acgtumrwsykvhdbxnACGTUMRWSYKVHDBXN*'
    backward = 'tgcatkywsrmbdhvxnTGCATKYWSRMBDHVXN*'
    complement = dict(zip(forward, backward))
    # complement every base, then reverse the whole string
    return ''.join(complement[base] for base in seq)[::-1]
532
+
533
+
534
def mask_sequence(seq, maskchar, fpos, tpos):
    '''Return seq with positions fpos (inclusive) to tpos (exclusive)
    replaced by maskchar.  Boundaries outside the sequence are clamped.
    '''
    if len(maskchar) > 1:
        raise RuntimeError("Internal error: more than one character given to mask_sequence")

    start = max(fpos, 0)
    stop = min(tpos, len(seq))
    return seq[:start] + maskchar * (stop - start) + seq[stop:]
549
+
550
+
551
def fragment_sequences(sequence, qualities, splitchar):
    '''Split a sequence (and its quality list) at runs of splitchar.

    Works like split() on strings, except it keeps the quality values
    aligned with the sequence.  Returns a list of tuples, each holding
    the fragment sequence first and the fragment qualities second; runs
    of splitchar are discarded.

    Raises RuntimeError when sequence and qualities differ in length.
    '''
    if len(sequence) != len(qualities):
        # previously the mismatching data was dumped via a Python-2-only
        # bare `print` statement before raising; fold it into the
        # exception message instead so the diagnostic is never lost
        raise RuntimeError(
            "Internal error: length of sequence and qualities don't match???"
            " seq=%r quals=%r" % (sequence, qualities))

    fragments = []
    seq_chars = []
    qual_vals = []
    # a fragment is open exactly while seq_chars is non-empty, so the
    # original inseq flag is unnecessary
    for char, qual in zip(sequence, qualities):
        if char == splitchar:
            if seq_chars:
                fragments.append((''.join(seq_chars), qual_vals))
                seq_chars = []
                qual_vals = []
        else:
            seq_chars.append(char)
            qual_vals.append(qual)
    # close the trailing fragment, if any
    if seq_chars:
        fragments.append((''.join(seq_chars), qual_vals))

    return fragments
594
+
595
+
596
def calc_subseq_boundaries(maskedseq, maskchar):
    '''Return the [start, stop) boundaries of the alternating masked and
    unmasked stretches of maskedseq.  E.g.:
        ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
    yields
        [0,8],[8,16],[16,26],[26,47],[47,56]
    '''
    if not maskedseq:
        return []

    boundaries = []
    run_start = 0
    in_masked = maskedseq[0] == maskchar
    for pos in range(1, len(maskedseq)):
        now_masked = maskedseq[pos] == maskchar
        if now_masked != in_masked:
            # state flipped: close the current run, open a new one
            boundaries.append([run_start, pos])
            run_start = pos
            in_masked = now_masked
    # close the final run
    boundaries.append([run_start, len(maskedseq)])

    return boundaries
625
+
626
+
627
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial linker hits were found, take a preventive measure: grow
    the masked areas by 20 bases in each direction.

    Returns either the unchanged maskedseq or a new sequence with more
    characters masked.  Reads linker lengths from the module-level
    linkerlengths dict.

    NOTE: Python-2-only code (print statements, and growl2 = growl/2
    relies on integer division).
    '''
    global linkerlengths

    # CEBUG toggles the debug prints below; permanently off
    CEBUG = 0

    if CEBUG : print "correct_for_smallhits"
    if CEBUG : print "Masked seq\n", maskedseq
    if CEBUG : print "Linkername: ", linkername

    if len(maskedseq) == 0:
        return maskedseq

    # growth window: 40 bases total, 20 on each side (integer division)
    growl=40
    growl2=growl/2

    boundaries = calc_subseq_boundaries(maskedseq,maskchar)
    if CEBUG : print "Boundaries: ", boundaries

    # first pass: look for masked stretches in the interior of the read
    # that are shorter than the linker (minus a 10% safety margin for
    # slight sequencing errors in the linker itself)
    foundpartial = False
    for bounds in boundaries:
        if CEBUG : print "\tbounds: ", bounds
        left, right = bounds
        if left != 0 and right != len(maskedseq):
            if maskedseq[left] == maskchar:
                # allow 10% discrepancy
                if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
                    if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
                    foundpartial = True

    if not foundpartial:
        return maskedseq

    # second pass: rebuild the sequence, shrinking each unmasked stretch
    # by growl2 bases on every side that borders a masked stretch
    newseq = ""
    for bounds in boundaries:
        if CEBUG : print "Bounds: ", bounds
        left, right = bounds
        if maskedseq[left] == maskchar:
            # masked stretches are copied verbatim
            newseq += maskedseq[left:right]
        else:
            clearstart = 0
            if left > 0 :
                clearstart = left+growl2
            clearstop = len(maskedseq)
            if right < len(maskedseq):
                clearstop = right-growl2

            if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop

            if clearstop <= clearstart:
                # the stretch is shorter than the growth window: mask it all
                newseq += maskchar * (right-left)
            else:
                if clearstart != left:
                    newseq += maskchar * growl2
                newseq += maskedseq[clearstart:clearstop]
                if clearstop != right:
                    newseq += maskchar * growl2

    return newseq
696
+
697
+
698
+ def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
699
+ '''Splits a paired end read and writes sequences into FASTA, FASTA qual
700
+ and XML traceinfo file. Returns the number of sequences created.
701
+
702
+ As the linker sequence may be anywhere in the read, including the ends
703
+ and overlapping with bad quality sequence, we need to perform some
704
+ computing and eventually set new clip points.
705
+
706
+ If the resulting split yields only one sequence (because linker
707
+ was not present or overlapping with left or right clip), only one
708
+ sequence will be written with ".fn" appended to the name.
709
+
710
+ If the read can be split, two reads will be written. The side left of
711
+ the linker will be named ".r" and will be written in reverse complement
712
+ into the file to conform with what approximately all assemblers expect
713
+ when reading paired-end data: reads in forward direction in file. The side
714
+ right of the linker will be named ".f"
715
+
716
+ If SSAHA found partial linker (linker sequences < length of linker),
717
+ the sequences will get a "_pl" furthermore be cut back thoroughly.
718
+
719
+ If SSAHA found multiple occurences of the linker, the names will get an
720
+ additional "_mlc" within the name to show that there was "multiple
721
+ linker contamination".
722
+
723
+ For multiple or partial linker, the "good" parts of the reads are
724
+ stored with a ".part<number>" name, additionally they will not get
725
+ template information in the XML
726
+ '''
727
+
728
+ global ssahapematches
729
+
730
+ CEBUG = 0
731
+
732
+ maskchar = "#"
733
+
734
+ if CEBUG : print "Need to split: " + data['name']
735
+
736
+ numseqs = 0;
737
+ readname = data['name']
738
+ readlen = data['number_of_bases']
739
+
740
+ leftclip, rightclip = return_merged_clips(data)
741
+ seq, qual = get_read_data(data)
742
+
743
+ if CEBUG : print "Original read:\n",seq
744
+
745
+ maskedseq = seq
746
+ if leftclip > 0:
747
+ maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
748
+ if rightclip < len(maskedseq):
749
+ maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))
750
+
751
+ leftclip, rightclip = return_merged_clips(data)
752
+ readlen = data['number_of_bases']
753
+
754
+ if CEBUG : print "Readname:", readname
755
+ if CEBUG : print "Readlen:", readlen
756
+ if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
757
+ if CEBUG : print "matches:", ssahapematches[data['name']]
758
+
759
+ for match in ssahapematches[data['name']]:
760
+ score = int(match[0])
761
+ linkername = match[2]
762
+ leftreadhit = int(match[3])
763
+ rightreadhit = int(match[4])
764
+ #leftlinkerhit = int(match[5])
765
+ #rightlinkerhit = int(match[6])
766
+ #direction = match[7]
767
+ #hitlen = int(match[8])
768
+ #hitidentity = float(match[9])
769
+
770
+ if CEBUG : print match
771
+ if CEBUG : print "Match with score:", score
772
+ if CEBUG : print "Read before:\n", maskedseq
773
+ maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
774
+ if CEBUG : print "Masked seq:\n", maskedseq
775
+
776
+ correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)
777
+
778
+ if len(maskedseq) != len(correctedseq):
779
+ raise RuntimeError("Internal error: maskedseq != correctedseq")
780
+
781
+ partialhits = False
782
+ if correctedseq != maskedseq:
783
+ if CEBUG : print "Partial hits in", readname
784
+ if CEBUG : print "Original seq:\n", seq
785
+ if CEBUG : print "Masked seq:\n", maskedseq
786
+ if CEBUG : print "Corrected seq\n", correctedseq
787
+ partialhits = True
788
+ readname += "_pl"
789
+ maskedseq = correctedseq
790
+
791
+ fragments = fragment_sequences(maskedseq, qual, maskchar)
792
+
793
+ if CEBUG : print "Fragments (", len(fragments), "): ", fragments
794
+
795
+ mlcflag = False
796
+ #if len(ssahapematches[data['name']]) > 1:
797
+ # #print "Multi linker contamination"
798
+ # mlcflag = True
799
+ # readname += "_mlc"
800
+
801
+ if len(fragments) > 2:
802
+ if CEBUG : print "Multi linker contamination"
803
+ mlcflag = True
804
+ readname += "_mlc"
805
+
806
+
807
+ #print fragments
808
+ if mlcflag or partialhits:
809
+ fragcounter = 1
810
+ readname += ".part"
811
+ for frag in fragments:
812
+ actseq = frag[0]
813
+ if len(actseq) >= 20:
814
+ actqual = frag[1]
815
+ oname = readname + str(fragcounter)
816
+ #seq_fh.write(">"+oname+"\n")
817
+ #seq_fh.write(actseq+"\n")
818
+ #qual_fh.write(">"+oname+"\n")
819
+ #qual_fh.write(' '.join((str(q) for q in actqual)))
820
+ #qual_fh.write("\n")
821
+ write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
822
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
823
+ # No clipping in XML ... the multiple and partial fragments
824
+ # are clipped "hard"
825
+ # No template ID and trace_end: we don't know the
826
+ # orientation of the frahments. Even if it were
827
+ # only two, the fact we had multiple linkers
828
+ # says something went wrong, so simply do not
829
+ # write any paired-end information for all these fragments
830
+ to_print.append(' </trace>\n')
831
+ xml_fh.write(''.join(to_print))
832
+ numseqs += 1
833
+ fragcounter += 1
834
+ else:
835
+ if len(fragments) >2:
836
+ raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
837
+ # nothing will happen for 0 fragments
838
+ if len(fragments) == 1:
839
+ #print "Tada1"
840
+ boundaries = calc_subseq_boundaries(maskedseq,maskchar)
841
+ if len(boundaries) < 1 or len(boundaries) >3:
842
+ raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
843
+ if len(boundaries) == 3:
844
+ # case: mask char on both sides of sequence
845
+ #print "bounds3"
846
+ data['clip_adapter_left']=1+boundaries[0][1]
847
+ data['clip_adapter_right']=boundaries[2][0]
848
+ elif len(boundaries) == 2:
849
+ # case: mask char left or right of sequence
850
+ #print "bounds2",
851
+ if maskedseq[0] == maskchar :
852
+ # case: mask char left
853
+ #print "left"
854
+ data['clip_adapter_left']=1+boundaries[0][1]
855
+ else:
856
+ # case: mask char right
857
+ #print "right"
858
+ data['clip_adapter_right']=boundaries[1][0]
859
+ data['name'] = data['name'] + ".fn"
860
+ write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
861
+ numseqs = 1
862
+ elif len(fragments) == 2:
863
+ #print "Tada2"
864
+ oname = readname + ".r"
865
+ seq, qual = get_read_data(data)
866
+
867
+ startsearch = False
868
+ for spos in range(len(maskedseq)):
869
+ if maskedseq[spos] != maskchar:
870
+ startsearch = True;
871
+ else:
872
+ if startsearch:
873
+ break
874
+
875
+ #print "\nspos: ", spos
876
+ lseq=seq[:spos]
877
+ #print "lseq:", lseq
878
+ actseq = reverse_complement(lseq)
879
+ lreadlen = len(actseq)
880
+ lqual = qual[:spos];
881
+ # python hack to reverse a list/string/etc
882
+ lqual = lqual[::-1];
883
+
884
+ #seq_fh.write(">"+oname+"\n")
885
+ #seq_fh.write(actseq+"\n")
886
+ #qual_fh.write(">"+oname+"\n")
887
+ #qual_fh.write(' '.join((str(q) for q in lqual)))
888
+ #qual_fh.write("\n")
889
+
890
+ write_sequence(oname,actseq,lqual,seq_fh,qual_fh)
891
+
892
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
893
+ to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
894
+ to_print.append(' <template_id>')
895
+ to_print.append(readname)
896
+ to_print.append('</template_id>\n')
897
+ to_print.append(' <trace_end>r</trace_end>\n')
898
+ to_print.append(' </trace>\n')
899
+ xml_fh.write(''.join(to_print))
900
+
901
+ oname = readname + ".f"
902
+ startsearch = False
903
+ for spos in range(len(maskedseq)-1,-1,-1):
904
+ if maskedseq[spos] != maskchar:
905
+ startsearch = True;
906
+ else:
907
+ if startsearch:
908
+ break
909
+
910
+ actseq = seq[spos+1:]
911
+ actqual = qual[spos+1:];
912
+
913
+ #print "\nspos: ", spos
914
+ #print "rseq:", actseq
915
+
916
+ #seq_fh.write(">"+oname+"\n")
917
+ #seq_fh.write(actseq+"\n")
918
+ #qual_fh.write(">"+oname+"\n")
919
+ #qual_fh.write(' '.join((str(q) for q in actqual)))
920
+ #qual_fh.write("\n")
921
+ write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
922
+
923
+ rreadlen = len(actseq)
924
+ to_print = [create_basic_xml_info(oname,sff_fh.name)]
925
+ to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
926
+ to_print.append(' <template_id>')
927
+ to_print.append(readname)
928
+ to_print.append('</template_id>\n')
929
+ to_print.append(' <trace_end>f</trace_end>\n')
930
+ to_print.append(' </trace>\n')
931
+ xml_fh.write(''.join(to_print))
932
+ numseqs = 2
933
+
934
+ return numseqs
935
+
936
+
937
+
938
def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.
    '''

    # The linker-match dict is filled per SFF file by read_ssaha_data()
    # and consumed below when deciding paired vs. unpaired output.
    global ssahapematches


    if len(sff_files) == 0 :
        raise RuntimeError("No SFF file given?")

    # Pre-flight: fail early if any input is empty or unreadable, before
    # any output files are created/truncated.
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        fh = open(sff_file, 'r')
        fh.close()

    openmode = 'w'
    if config['append']:
        openmode = 'a'

    seq_fh = open(config['seq_fname'], openmode)
    xml_fh = open(config['xml_fname'], openmode)
    if config['want_fastq']:
        # FASTQ mode: qualities go inline into seq_fh; remove any stale
        # separate quality file (best-effort, missing file is fine).
        qual_fh = None
        try:
            os.remove(config['qual_fname'])
        except :
            python_formattingwithoutbracesisdumb_dummy = 1
    else:
        qual_fh = open(config['qual_fname'], openmode)

    if not config['append']:
        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
    else:
        # Re-open the XML document by stripping the previous closing tag.
        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

    #we go through all input files
    for sff_file in sff_files:
        print "Working on '" + sff_file + "':"
        # Linker matches are per-file; reset before each pass.
        ssahapematches.clear()

        # Collects the first 50 clipped bases of every read for the
        # "dubious start sequence" sanity check at the end of the file.
        seqcheckstore = ([])

        debug = 0

        # Pass 1 (only with a linker file): dump all reads as FASTA and
        # run SSAHA2 against the linkers to find paired-end split points.
        if not debug and config['pelinker_fname']:
            print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
            sys.stdout.flush()

            if 0 :
                # for debugging
                pid = os.getpid()
                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
                tmpfasta_fh = open(tmpfasta_fname, 'w')
            else:
                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
                                                          suffix = '.fasta')

            sff_fh = open(sff_file, 'rb')
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq,qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
                tmpfasta_fh.write(seqstring)
                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
                #tmpfasta_fh.write(seq)
            print "done."
            tmpfasta_fh.seek(0)

            if 0 :
                # for debugging
                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
                tmpssaha_fh = open(tmpssaha_fname, 'w+')
            else:
                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
                                                          suffix = '.ssaha2')

            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
            tmpfasta_fh.close()

            tmpssaha_fh.seek(0)
            read_ssaha_data(tmpssaha_fh)
            tmpssaha_fh.close()

        if debug:
            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
            read_ssaha_data(tmpssaha_fh)

        # Pass 2: convert every read, splitting those with linker hits.
        print "Converting '" + sff_file + "' ... ",
        sys.stdout.flush()
        sff_fh = open(sff_file, 'rb')
        #read_header(infile)
        header_data = read_header(fileh=sff_fh)

        #now convert all reads
        nseqs_sff = 0
        nseqs_out = 0
        for seq_data in sequences(fileh=sff_fh, header=header_data):
            nseqs_sff += 1

            seq, qual = clip_read(seq_data)
            seqcheckstore.append(seq[0:50])

            #if nseqs_sff >1000:
            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
            #    sys.exit()

            if ssahapematches.has_key(seq_data['name']):
                #print "Paired end:",seq_data['name']
                # A read may split into 0..n output sequences.
                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
            else:
                #print "Normal:",seq_data['name']
                # ".fn" suffix marks "full, no linker found" reads when a
                # linker search was performed.
                if config['pelinker_fname']:
                    seq_data['name'] = seq_data['name'] + ".fn"
                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
                nseqs_out += 1
        print "done."
        print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
        sff_fh.close()

        # NOTE(review): seq_data here is the last read of the loop above;
        # an SFF with zero reads would leave it undefined — presumably
        # never happens for valid files, but worth confirming.
        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
        seqcheckstore = ([])

    xml_fh.write('</trace_volume>\n')

    xml_fh.close()
    seq_fh.close()
    if qual_fh is not None:
        qual_fh.close()

    return
1075
+
1076
+ def check_for_dubious_startseq(seqcheckstore, sffname,seqdata):
1077
+
1078
+ global stern_warning
1079
+
1080
+ foundproblem = ""
1081
+ for checklen in range(1,len(seqcheckstore[0])):
1082
+ foundinloop = False
1083
+ seqdict = {}
1084
+ for seq in seqcheckstore:
1085
+ shortseq = seq[0:checklen]
1086
+ if shortseq in seqdict:
1087
+ seqdict[shortseq] += 1
1088
+ else:
1089
+ seqdict[shortseq] = 1
1090
+
1091
+ for shortseq, count in seqdict.items():
1092
+ if float(count)/len(seqcheckstore) >= 0.5:
1093
+ foundinloop = True
1094
+ stern_warning
1095
+ foundproblem = "\n"+"*" * 80
1096
+ foundproblem += "\nWARNING: "
1097
+ foundproblem += "weird sequences in file " + sffname + "\n\n"
1098
+ foundproblem += "After applying left clips, " + str(count) + " sequences (="
1099
+ foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore))
1100
+ foundproblem += "%) start with these bases:\n" + shortseq
1101
+ foundproblem += "\n\nThis does not look sane.\n\n"
1102
+ foundproblem += "Countermeasures you *probably* must take:\n"
1103
+ foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n"
1104
+ foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n"
1105
+ left,right = return_merged_clips(seqdata)
1106
+ foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n"
1107
+ foundproblem += "*" * 80 + "\n"
1108
+ if not foundinloop :
1109
+ break
1110
+ if len(foundproblem):
1111
+ print foundproblem
1112
+
1113
+
1114
def parse_extra_info(info):
    '''It parses the information that will go in the xml file.

    There are two formats accepted for the extra information:
        key1:value1, key2:value2
    or:
        file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}

    Returns a dict mapping file name -> {key: value}; entries without an
    explicit file name are stored under the module-level fake_sff_name.
    A falsy input (None, empty string) is returned unchanged.
    '''
    if not info:
        return info

    data_for_files = {}
    # Each ';'-separated chunk describes one file's key/value pairs.
    for file_chunk in info.split(';'):
        name_and_pairs = file_chunk.split('{')
        if len(name_and_pairs) == 1:
            # No "fname{...}" wrapper: attribute the pairs to the
            # placeholder file name.
            fname = fake_sff_name
            pairs = name_and_pairs[0]
        else:
            fname = name_and_pairs[0]
            pairs = name_and_pairs[1]

        pairs = pairs.replace('}', '')
        parsed = {}
        for pair in pairs.split(','):
            key, value = pair.strip().split(':')
            parsed[key.strip()] = value.strip()
        data_for_files[fname] = parsed

    return data_for_files
1145
+
1146
+
1147
def return_merged_clips(data):
    '''It returns the left and right positions to clip.

    Merges the adapter and quality clip points of one read: the left clip
    is the larger of the two left values, the right clip the smaller of
    the two right values.  A clip value of 0 (or None) means "not set" and
    is ignored; if both values of a side are unset, the defaults are used
    (1 for the left, the full read length for the right).

    data -- read dict with 'clip_adapter_left', 'clip_qual_left',
            'clip_adapter_right', 'clip_qual_right', 'number_of_bases'
    Returns (left, right) as 1-based, inclusive positions.
    '''
    # Replaces two near-identical nested helpers that shadowed the
    # builtins max/min: keep only the truthy (set) values and pick.
    def _merge(a, b, pick):
        values = [v for v in (a, b) if v]
        if not values:
            return None
        return pick(values)

    left = _merge(data['clip_adapter_left'], data['clip_qual_left'], max)
    right = _merge(data['clip_adapter_right'], data['clip_qual_right'], min)
    #maybe both clips were zero
    if left is None:
        left = 1
    if right is None:
        right = data['number_of_bases']
    return left, right
1187
+
1188
def sequence_case(data):
    '''Given the data for one read it returns the seq with mixed case.

    The regions to be clipped will be lower case and the rest upper case.
    '''
    left, right = return_merged_clips(data)
    bases = data['bases']
    # Clip points are 1-based and inclusive, hence the left-1 slicing.
    clipped_head = bases[:left - 1].lower()
    kept_middle = bases[left - 1:right]
    clipped_tail = bases[right:].lower()
    return clipped_head + kept_middle + clipped_tail
1197
+
1198
def clip_read(data):
    '''Given the data for one read it returns clipped seq and qual.

    The merged clip points from return_merged_clips() are 1-based and
    inclusive, so the Python slice starts at left-1.

    data -- read dict with 'bases', 'quality_scores' and the clip fields
    Returns (clipped_sequence, clipped_quality_list).
    '''
    # Removed a duplicated 'qual = data['quality_scores']' assignment
    # that appeared both before and after the clip computation.
    left, right = return_merged_clips(data)
    seq = data['bases']
    qual = data['quality_scores']
    new_seq = seq[left - 1:right]
    new_qual = qual[left - 1:right]

    return new_seq, new_qual
1209
+
1210
+
1211
+
1212
def tests_for_ssaha(linker_fname):
    '''Tests whether SSAHA2 can be successfully called.'''
    # NOTE(review): linker_fname is currently unused — the probe only
    # checks that the ssaha2 binary can be launched at all.

    try:
        print "Testing whether SSAHA2 is installed and can be launched ... ",
        sys.stdout.flush()
        # Discard ssaha2's version banner; we only care whether the
        # subprocess launch succeeds.
        fh = open('/dev/null', 'w')
        retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
        fh.close()
        print "ok."
    except :
        # Any failure (binary missing, not executable, ...) is turned
        # into one clear, actionable error for the user.
        print "nope? Uh oh ...\n\n"
        raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
1225
+
1226
+
1227
def load_linker_sequences(linker_fname):
    '''Loads all linker sequences into memory, storing only the length
    of each linker.

    Fills the module-level dict linkerlengths (linker name -> sequence
    length).  Raises RuntimeError for an empty file, a file without any
    FASTA record, or a duplicated linker name.
    '''

    global linkerlengths

    if not os.path.getsize(linker_fname):
        raise RuntimeError("File empty? '" + linker_fname + "'")
    fh = open(linker_fname, 'r')
    linkerseqs = read_fasta(fh)
    if len(linkerseqs) == 0:
        raise RuntimeError(linker_fname + ": no sequence found?")
    for i in linkerseqs:
        # Duplicate linker names would make later match bookkeeping
        # ambiguous, so they are rejected outright.
        if linkerlengths.has_key(i.name):
            raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
        linkerlengths[i.name] = len(i.sequence)
    fh.close()
1244
+
1245
+
1246
def launch_ssaha(linker_fname, query_fname, output_fh):
    '''Launches SSAHA2 on the linker and query file, storing SSAHA2 output
    into the output filehandle.

    Raises RuntimeError if ssaha2 exits non-zero or cannot be started.
    '''

    try:
        print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
        sys.stdout.flush()
        # -kmer 4 -skip 1 = very sensitive settings, suitable for finding
        # short linker sequences inside reads.
        retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
        if retcode:
            raise RuntimeError('Ups.')
        else:
            print "ok."
    except:
        # Collapse every failure mode (launch error, non-zero exit) into
        # one user-facing error.
        print "\n"
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
1261
+
1262
def read_ssaha_data(ssahadata_fh):
    '''Given file handle, reads file generated with SSAHA2 (with default
    output format) and stores all matches as list ssahapematches
    (ssaha paired-end matches) dictionary.

    Each ALIGNMENT line is split into fields; fields 1..10 (everything
    between the leading "ALIGNMENT" tag and the trailing length) are
    appended to ssahapematches[read_name].
    '''

    global ssahapematches

    print "Parsing SSAHA2 result file ... ",
    sys.stdout.flush()

    for line in ssahadata_fh:
        if line.startswith('ALIGNMENT'):
            ml = line.split()
            if len(ml) != 12 :
                print "\n", line,
                raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
            if not ssahapematches.has_key(ml[2]) :
                ssahapematches[ml[2]] = ([])
            if ml[8] == 'F':
                #print line,

                # store everything except the first element (output
                # format name (ALIGNMENT)) and the last element
                # (length)
                ssahapematches[ml[2]].append(ml[1:-1])
            else:
                #print ml
                # Non-forward hit: swap the two coordinate fields so the
                # stored match coordinates are ascending — presumably
                # these are the query start/end; confirm against the
                # SSAHA2 output format description.
                ml[4],ml[5] = ml[5],ml[4]
                #print ml
                ssahapematches[ml[2]].append(ml[1:-1])

    print "done."
1294
+
1295
+
1296
+ ##########################################################################
1297
+ #
1298
+ # BaCh: This block was shamelessly copied from
1299
+ # http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
1300
+ # and then subsequently modified to read fasta correctly
1301
+ # It's still not fool proof, but should be good enough
1302
+ #
1303
+ ##########################################################################
1304
+
1305
class Fasta:
    '''Minimal container for a single FASTA record.'''

    def __init__(self, name, sequence):
        '''Store the record's header name and its sequence string.'''
        self.name = name
        self.sequence = sequence
1309
+
1310
def read_fasta(file):
    '''Parses FASTA records from the given file object.

    Returns a list of Fasta instances.  Raises RuntimeError when a header
    line carries no name or when sequence data appears before the first
    header.  A trailing record is only kept if it has both a name and at
    least one sequence character.
    '''
    records = []
    current = Fasta('', '')
    lineno = 0

    for raw in file:
        lineno += 1
        if raw.startswith(">"):
            # New header: flush the previous record if it collected bases.
            if len(current.sequence):
                records.append(current)
                current = Fasta('', '')
            # The name is everything up to the first whitespace,
            # without the leading ">".
            current.name = raw.split()[0][1:]
            current.sequence = ''
            if len(current.name) == 0:
                raise RuntimeError(file.name + ': no name in line ' + str(lineno) + '?')
        else:
            if len(current.name) == 0:
                raise RuntimeError(file.name + ': no sequence header at line ' + str(lineno) + '?')
            current.sequence += raw.strip()

    # Flush the final record.
    if len(current.name) and len(current.sequence):
        records.append(current)

    return records
1336
+ ##########################################################################
1337
+
1338
def version_string():
    '''Return the program name followed by its version.'''
    return "sff_extract %s" % __version__
1340
+
1341
def read_config():
    '''It reads the configuration options from the command line arguments and
    it returns a dict with them.

    Also stores the resulting dict in the module-level global "config".
    Returns (config_dict, remaining_args), where remaining_args are the
    SFF file names.
    '''
    from optparse import OptionParser, OptionGroup
    usage = "usage: %prog [options] sff1 sff2 ..."
    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
           " and XML traceinfo format. When a paired-end linker sequence"\
           " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
           " then split the sequences, removing the linker."
    parser = OptionParser(usage = usage, version = version_string(), description = desc)
    parser.add_option('-a', '--append', action="store_true", dest='append',
                      help='append output to existing files', default=False)
    parser.add_option('-i', '--xml_info', dest='xml_info',
                      help='extra info to write in the xml file')
    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
                      help="FASTA file with paired-end linker sequences", metavar="FILE")

    group = OptionGroup(parser, "File name options","")
    group.add_option('-c', '--clip', action="store_true", dest='clip',
                     help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
                     help='all bases in upper case, including clipped ends', default=True)
    group.add_option('', '--min_left_clip', dest='min_leftclip',
                     metavar="INTEGER", type = "int",
                     help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
                     help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "File name options","")
    group.add_option("-o", "--out_basename", dest="basename",
                     help="base name for all output files")
    group.add_option("-s", "--seq_file", dest="seq_fname",
                     help="output sequence file name", metavar="FILE")
    group.add_option("-q", "--qual_file", dest="qual_fname",
                     help="output quality file name", metavar="FILE")
    group.add_option("-x", "--xml_file", dest="xml_fname",
                     help="output ancillary xml file name", metavar="FILE")
    parser.add_option_group(group)

    # Default output names are derived from the last argument when it
    # looks like an SFF file, else from the literal basename 'reads'.
    basename = 'reads'
    if sys.argv[-1][-4:].lower() == '.sff':
        basename = sys.argv[-1][:-4]
    def_seq_fname = basename + '.fasta'
    def_qual_fname = basename + '.fasta.qual'
    def_xml_fname = basename + '.xml'
    def_pelinker_fname = ''
    parser.set_defaults(seq_fname = def_seq_fname)
    parser.set_defaults(qual_fname = def_qual_fname)
    parser.set_defaults(xml_fname = def_xml_fname)
    parser.set_defaults(pelinker_fname = def_pelinker_fname)

    #we parse the cmd line
    (options, args) = parser.parse_args()

    # Copy every public option attribute into a plain dict, skipping
    # optparse's own helper methods.
    global config
    config = {}
    for property in dir(options):
        if property[0] == '_' or property in ('ensure_value', 'read_file',
                                              'read_module'):
            continue
        config[property] = getattr(options, property)

    if config['basename'] is None:
        config['basename']=basename

    # If -s/-q/-x were left at their defaults, rebase them on the
    # (possibly user-supplied) basename; FASTQ mode disables the
    # separate quality file.
    if config['want_fastq']:
        config['qual_fname'] = ''
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fastq'
    else:
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fasta'
        if config['qual_fname'] == def_qual_fname:
            config['qual_fname'] = config['basename'] + '.fasta.qual'

    if config['xml_fname'] == def_xml_fname:
        config['xml_fname'] = config['basename'] + '.xml'

    #we parse the extra info for the xml file
    config['xml_info'] = parse_extra_info(config['xml_info'])
    return config, args
1428
+
1429
+
1430
+
1431
+ ##########################################################################
1432
+
1433
+
1434
def testsome():
    # Debug scaffold: terminates the program immediately.
    sys.exit()
    # NOTE(review): unreachable after sys.exit().
    return
1437
+
1438
+
1439
def debug():
    '''Developer-only harness: replays an SSAHA2 run against hard-coded
    temporary files (pid 15603, FLVI58L05.fa) and exits.  Not reachable
    from main() unless uncommented there.'''
    try:
        dummy = 1
        #debug()
        #testsome()

        config, args = read_config()
        load_linker_sequences(config['pelinker_fname'])

        # Hard-coded pid so the temp file names match a previous run.
        #pid = os.getpid()
        pid = 15603

        #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
        #tmpfasta_fh = open(tmpfasta_fname, 'w')
        tmpfasta_fname = 'FLVI58L05.fa'
        tmpfasta_fh = open(tmpfasta_fname, 'r')

        tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
        tmpssaha_fh = open(tmpssaha_fname, 'w')

        launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)

        # Re-open the (just written) result file for parsing.
        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
        read_ssaha_data(tmpssaha_fh)

        sys.exit()

        # NOTE(review): unreachable after sys.exit() above.
        extract_reads_from_sff(config, args)

    except (OSError, IOError, RuntimeError), errval:
        print errval
        sys.exit()

    sys.exit()
1473
+
1474
+
1475
def main():
    '''Program entry point.

    With no arguments, shows the help text and exits.  Otherwise parses
    the configuration, optionally loads linker sequences, and runs the
    extraction.  Returns 0 on success, 1 on error or when a stern
    warning (dubious start sequences) was raised during conversion.
    '''

    argv = sys.argv
    if len(argv) == 1:
        # No arguments: force optparse to print the help screen.
        sys.argv.append('-h')
        read_config()
        sys.exit()
    try:
        #debug();

        config, args = read_config()

        if config['pelinker_fname']:
            #tests_for_ssaha(config['pelinker_fname'])
            load_linker_sequences(config['pelinker_fname'])
        if len(args) == 0:
            raise RuntimeError("No SFF file given?")
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError), errval:
        print errval
        return 1

    # Non-zero exit so pipelines notice the dubious-start-sequence warning.
    if stern_warning:
        return 1

    return 0
1501
+
1502
+
1503
+
1504
# Script entry: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())