HTSeq 2.1.2__cp313-cp313-macosx_10_15_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- HTSeq/StepVector.py +629 -0
- HTSeq/StretchVector.py +491 -0
- HTSeq/_HTSeq.cpython-313-darwin.so +0 -0
- HTSeq/_HTSeq_internal.py +85 -0
- HTSeq/_StepVector.cpython-313-darwin.so +0 -0
- HTSeq/__init__.py +1249 -0
- HTSeq/features.py +489 -0
- HTSeq/scripts/__init__.py +0 -0
- HTSeq/scripts/count.py +528 -0
- HTSeq/scripts/count_features/__init__.py +0 -0
- HTSeq/scripts/count_features/count_features_per_file.py +465 -0
- HTSeq/scripts/count_features/reads_io_processor.py +187 -0
- HTSeq/scripts/count_features/reads_stats.py +92 -0
- HTSeq/scripts/count_with_barcodes.py +746 -0
- HTSeq/scripts/qa.py +336 -0
- HTSeq/scripts/utils.py +372 -0
- HTSeq/utils.py +92 -0
- htseq-2.1.2.dist-info/METADATA +813 -0
- htseq-2.1.2.dist-info/RECORD +23 -0
- htseq-2.1.2.dist-info/WHEEL +5 -0
- htseq-2.1.2.dist-info/entry_points.txt +4 -0
- htseq-2.1.2.dist-info/licenses/LICENSE +674 -0
- htseq-2.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import random
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from HTSeq.scripts.utils import invert_strand, UnknownChrom
|
|
6
|
+
from HTSeq.scripts.count_features.reads_io_processor import ReadsIO
|
|
7
|
+
from HTSeq.scripts.count_features.reads_stats import ReadsStatistics
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def count_reads_single_file(
    isam,
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
):
    """
    Count the reads of a single input SAM/BAM file against the feature set.

    Every record (or read pair, for paired-end input) is first assessed;
    records that pass are converted to a sequence of genomic intervals,
    matched against ``features`` according to ``overlap_mode`` and the
    resulting assignment is recorded in a ReadsStatistics object. Reads
    lying on a chromosome unknown to ``features`` are recorded as empty.

    Parameters
    ----------
    isam : int
        Index of the input file, used for parallel processing. It is passed
        through to ``ReadsStatistics.get_output``.
    sam_filename : str
        Path to the SAM/BAM file containing the mapped reads.
    features : object
        Feature container exposing ``chrom_vectors`` and interval indexing.
        Supplied by HTSeq.make_feature_genomicarrayofsets.
    feature_attr : object
        Forwarded to ReadsStatistics.
        Supplied by HTSeq.make_feature_genomicarrayofsets.
    order : str
        Can only be either 'pos' or 'name'. Sorting order of the alignment
        file.
    max_buffer_size : int
        The number of reads allowed to stay in memory until mates are found.
        Used when the alignment file is paired end sorted by position.
    stranded : str
        Whether the data is from a strand-specific assay.
        Option is yes, no, reverse.
        reverse means yes with reversed strand interpretation.
    overlap_mode : str
        Mode to handle reads overlapping more than one feature.
        Choices: union, intersection-strict, intersection-nonempty.
    multimapped_mode : str
        Whether and how to score reads that are not uniquely aligned or
        ambiguously assigned to features.
        Choices: none, all, fraction, random.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.
    feature_type : str
        Not used by this function; kept for interface compatibility.
    id_attribute : str
        Not used by this function; kept for interface compatibility.
    additional_attributes : array
        Not used by this function; kept for interface compatibility.
    quiet : boolean
        When false, a final forced progress report is printed once all reads
        have been processed.
    minaqual : int
        Reads with an alignment quality below this value are skipped.
    samout_format : str
        Format of the annotated output file. Choices: SAM, BAM, sam, bam.
    samout_filename : str
        Name of the SAM/BAM file to write the annotated alignment records
        into, or None for no such output.

    Returns
    -------
    The value of ``ReadsStatistics.get_output(isam)`` for this file.

    Raises
    ------
    Exception
        Any error raised while opening the input, preparing the statistics
        object or processing the reads is re-raised after a short message
        has been written to stderr.
    """
    read_io_obj = None
    try:
        try:
            read_io_obj = ReadsIO(
                sam_filename=sam_filename,
                samout_filename=samout_filename,
                samout_format=samout_format,
                supplementary_alignment_mode=supplementary_alignment_mode,
                secondary_alignment_mode=secondary_alignment_mode,
                order=order,
                max_buffer_size=max_buffer_size,
            )

            # If the BAM header is available, check that at least one of the
            # chromosomes is also found in the GTF/GFF file, otherwise the
            # user is probably doing something wrong (e.g. "chr1" vs "1").
            bam_chroms = read_io_obj.get_chromosome_names_header()
            if bam_chroms is not None:
                bam_chroms = set(bam_chroms)
                feature_chroms = set(features.chrom_vectors.keys())
                if not (bam_chroms & feature_chroms):
                    sys.stderr.write(
                        "The alignment file has no chromosomes in common with the GFF/GTF "
                        "file. This will result in zero feature counts. Please check if the "
                        "references match, e.g. if you are using 'chr1' or '1' as "
                        "chromosome names.\n"
                    )
        except Exception:
            sys.stderr.write(
                "Error occurred when reading beginning of SAM/BAM file.\n"
            )
            raise

        try:
            read_stats = ReadsStatistics(
                feature_attr=feature_attr, read_io_object=read_io_obj
            )
        except Exception:
            sys.stderr.write(
                "Error occurred when preparing object to store the reads' assignments\n"
            )
            raise

        # CIGAR match characters (including alignment match, sequence match,
        # and sequence mismatch).
        com = ("M", "=", "X")

        try:
            for r in read_io_obj.read_seq:
                read_stats.print_progress()
                read_stats.add_num_reads_processed()

                # Get the interval sequence covered by the read (pair).
                if not read_io_obj.pe_mode:
                    skip_read = _assess_non_pe_read(
                        read_sequence=r,
                        read_stats=read_stats,
                        secondary_alignment_mode=secondary_alignment_mode,
                        supplementary_alignment_mode=supplementary_alignment_mode,
                        multimapped_mode=multimapped_mode,
                        minaqual=minaqual,
                    )
                    if skip_read:
                        continue
                    iv_seq = _get_iv_seq_non_pe_read(com, r, stranded)
                else:
                    # NOTE: the logic here is a little arbitrary and might
                    # benefit from an optional arg. If the reads are
                    # paired-end but one of the two is missing, ATM we rely
                    # on the other one for info, however the data is
                    # technically inconsistent and we might want to let the
                    # user choose.
                    skip_read = _assess_pe_read(
                        minaqual,
                        multimapped_mode,
                        r,
                        read_stats,
                        secondary_alignment_mode,
                        supplementary_alignment_mode,
                    )
                    if skip_read:
                        continue
                    iv_seq = _get_iv_seq_pe_read(com, r, stranded)

                # Update the counts obtained from matching the read against
                # the feature sets.
                try:
                    fs = _align_reads_to_feature_set(features, iv_seq, overlap_mode)
                    _update_feature_set_counts(fs, multimapped_mode, r, read_stats)
                except UnknownChrom:
                    read_stats.add_empty_read(read_sequence=r)
        except Exception:
            sys.stderr.write(
                "Error occurred when processing input (%s):\n"
                % (read_io_obj.read_seq_file.get_line_number_string())
            )
            raise

        if not quiet:
            read_stats.print_progress(force_print=True)
    finally:
        # Close the annotated output even when an error is propagating, so
        # the handle is not leaked and buffered records are flushed.
        if read_io_obj is not None:
            read_io_obj.close_samoutfile()

    return read_stats.get_output(isam)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _update_feature_set_counts(fs, multimapped_mode, read_sequence, read_stats):
    """
    Record the assignment of one read and credit the matching features.

    Parameters
    ----------
    fs : collection of str, or None
        Features the read was matched to.
    multimapped_mode : str
        How to credit a read matched to several features.
        Choices: none, all, fraction, random.
    read_sequence :
        The read (or read pair); only forwarded to ``read_stats``.
    read_stats : ReadsStatistics object
        Receives the assignment and the count updates.

    """
    # Nothing matched: record the read as empty, there is nothing to count.
    if fs is None or len(fs) == 0:
        read_stats.add_empty_read(read_sequence=read_sequence)
        return

    candidates = list(fs)
    n_candidates = len(candidates)

    if n_candidates > 1:
        read_stats.add_ambiguous_read(
            read_sequence=read_sequence,
            assignment="__ambiguous[" + "+".join(sorted(fs)) + "]",
        )
    else:
        read_stats.add_good_read_assignment(
            read_sequence=read_sequence, assignment=candidates[0]
        )

    # Work out which features get credited and with which extra arguments.
    extra = {}
    if multimapped_mode == "none":
        # Ambiguous reads are not counted at all in this mode.
        credited = candidates if n_candidates == 1 else []
    elif multimapped_mode == "all":
        credited = candidates
    elif multimapped_mode == "fraction":
        credited = candidates
        extra = {"value": 1.0 / n_candidates}
    elif multimapped_mode == "random":
        credited = [random.choice(candidates)]
    else:
        sys.exit("Illegal multimap mode.")

    for feature in credited:
        read_stats.add_to_count(feature=feature, **extra)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _align_reads_to_feature_set(features, iv_seq, overlap_mode):
    """
    Collect the features overlapped by the intervals of one read.

    Parameters
    ----------
    features :
        Feature container. It must expose ``chrom_vectors`` (supporting
        ``in``) and be indexable by an interval, returning an object whose
        ``steps()`` yields ``(interval, feature_set)`` pairs.
    iv_seq : iterable
        Intervals covered by the read; each must have a ``chrom`` attribute.
    overlap_mode : str
        How to combine the per-step feature sets.
        "union" merges all of them. "intersection-strict" intersects all of
        them, including empty ones. "intersection-nonempty" intersects only
        the non-empty ones.

    Returns
    -------
    fs : set or None
        The combined feature set. In the intersection modes the result is
        None when no step contributed (e.g. ``iv_seq`` is empty).

    Raises
    ------
    UnknownChrom
        If an interval lies on a chromosome absent from
        ``features.chrom_vectors``.
    SystemExit
        If ``overlap_mode`` is not one of the supported modes.

    """
    if overlap_mode == "union":
        fs = set()
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for _, step_features in features[iv].steps():
                # Grow the accumulator in place instead of building a new
                # set per step; ``fs`` is local, so no caller data is touched.
                fs.update(step_features)
    elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
        # Loop-invariant: in strict mode empty steps take part as well.
        strict = overlap_mode == "intersection-strict"
        fs = None
        for iv in iv_seq:
            if iv.chrom not in features.chrom_vectors:
                raise UnknownChrom
            for _, step_features in features[iv].steps():
                if strict or len(step_features) > 0:
                    if fs is None:
                        # Copy so the stored feature set is never aliased.
                        fs = step_features.copy()
                    else:
                        fs = fs.intersection(step_features)
    else:
        sys.exit("Illegal overlap mode.")
    return fs
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _get_iv_seq_pe_read(com, r, stranded):
    """
    Build the interval sequence covered by a paired-end read.

    Mates that are missing (None) or not aligned contribute no intervals.

    Parameters
    ----------
    com : array
        CIGAR match characters (alignment match, sequence match and
        sequence mismatch).
    r :
        The read pair; ``r[0]`` is the first mate and ``r[1]`` the second.
    stranded : str
        Whether the data is from a strand-specific assay.
        Option is yes, no, reverse.
        reverse means yes with reversed strand interpretation.

    Returns
    -------
    iv_seq :
        Iterable of intervals; an empty tuple when neither mate contributes.

    """
    mate1 = r[0]
    if mate1 is None or not mate1.aligned:
        iv_seq = tuple()
    else:
        iv_seq = _get_iv_seq_pe_read_first(com, mate1, stranded)

    mate2 = r[1]
    if mate2 is None or not mate2.aligned:
        return iv_seq
    # The second mate's intervals are appended to those of the first.
    return _get_iv_seq_pe_read_second(com, iv_seq, mate2, stranded)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _assess_pe_read(
    minaqual,
    multimapped_mode,
    read_sequence,
    read_stats,
    secondary_alignment_mode,
    supplementary_alignment_mode,
):
    """
    Decide whether a paired-end read must be skipped.

    Parameters
    ----------
    minaqual : int
        Pairs with a mate whose alignment quality is below this value are
        skipped.
    multimapped_mode : str
        Whether and how to score reads that are not uniquely aligned or
        ambiguously assigned to features.
        Choices: none, all, fraction, random.
    read_sequence :
        The read pair; index 0 is the first mate and index 1 the second.
        Either mate may be None.
    read_stats : ReadsStatistics object
        Receives the reason whenever the pair is classified as not aligned,
        not unique or of low quality.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.

    Returns
    -------
    bool
        True if the pair must be skipped, False if it should be counted.

    """
    read1 = read_sequence[0]
    read2 = read_sequence[1]

    # NOTE: Sometimes read1 is None or not aligned but read2 is fine, in that
    # case we should not exclude the entire pair but rather use the interval
    # of the second read.
    read1_miss = (read1 is None) or (not read1.aligned)
    read2_miss = (read2 is None) or (not read2.aligned)
    if read1_miss and read2_miss:
        read_stats.add_not_aligned_read(read_sequence=read_sequence)
        return True

    mates = [read for read in (read1, read2) if read is not None]

    if secondary_alignment_mode == "ignore" and any(
        read.not_primary_alignment for read in mates
    ):
        return True
    if supplementary_alignment_mode == "ignore" and any(
        read.supplementary for read in mates
    ):
        return True

    # Check the NH tag of each mate on its own: a mate without the tag
    # raises KeyError, and that must not hide a multimapped partner.
    multimapped = False
    for read in mates:
        try:
            if read.optional_field("NH") > 1:
                multimapped = True
                break
        except KeyError:
            continue
    if multimapped:
        read_stats.add_not_unique_read(read_sequence=read_sequence)
        if multimapped_mode == "none":
            return True

    # NOTE(review): truthiness (rather than ``is not None``) is kept from the
    # original code for the quality test - confirm reads are always truthy.
    if (read1 and read1.aQual < minaqual) or (read2 and read2.aQual < minaqual):
        read_stats.add_low_quality_read(read_sequence=read_sequence)
        return True
    return False
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# Get GenomicInterval for each read, whether single-end or paired-end
|
|
398
|
+
def _get_iv_seq_non_pe_read(com, r, stranded):
    """
    Lazily yield the reference intervals matched by a single-end read.

    Only CIGAR operations whose type is in ``com`` and whose size is
    positive contribute. When ``stranded`` is "reverse" each interval is
    passed through ``invert_strand``.
    """
    matched_ops = (op for op in r.cigar if op.type in com and op.size > 0)
    if stranded == "reverse":
        return (invert_strand(op.ref_iv) for op in matched_ops)
    return (op.ref_iv for op in matched_ops)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _get_iv_seq_pe_read_first(com, read, stranded):
    """
    Lazily yield the reference intervals matched by the first mate.

    Only CIGAR operations whose type is in ``com`` and whose size is
    positive are used. The intervals are passed through ``invert_strand``
    only when ``stranded`` is "reverse".
    """
    usable = (co for co in read.cigar if co.type in com and co.size > 0)
    flip = stranded == "reverse"
    if not flip:
        return (co.ref_iv for co in usable)
    return (invert_strand(co.ref_iv) for co in usable)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _get_iv_seq_pe_read_second(com, iv_seq, read, stranded):
    """
    Append the second mate's matched intervals to ``iv_seq``.

    Only CIGAR operations whose type is in ``com`` and whose size is
    positive are used. The second mate is handled opposite to the first:
    its intervals are passed through ``invert_strand`` unless ``stranded``
    is "reverse".
    """
    usable = (co for co in read.cigar if co.type in com and co.size > 0)
    if stranded == "reverse":
        mate_ivs = (co.ref_iv for co in usable)
    else:
        mate_ivs = (invert_strand(co.ref_iv) for co in usable)
    return itertools.chain(iv_seq, mate_ivs)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _assess_non_pe_read(
    read_sequence,
    read_stats,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    multimapped_mode,
    minaqual,
):
    """
    Decide whether a single-end read must be skipped.

    The checks run in this order: not aligned, secondary alignment,
    supplementary alignment, NH tag above 1, alignment quality below
    ``minaqual``. Unaligned, non-unique and low-quality reads are reported
    to ``read_stats``. A read without an NH tag is treated as unique.

    Returns True if the read must be skipped, False otherwise.
    """
    if not read_sequence.aligned:
        read_stats.add_not_aligned_read(read_sequence=read_sequence)
        return True

    drop_secondary = secondary_alignment_mode == "ignore"
    drop_supplementary = supplementary_alignment_mode == "ignore"
    if (drop_secondary and read_sequence.not_primary_alignment) or (
        drop_supplementary and read_sequence.supplementary
    ):
        return True

    try:
        multimapped = read_sequence.optional_field("NH") > 1
        if multimapped:
            read_stats.add_not_unique_read(read_sequence=read_sequence)
    except KeyError:
        # No NH tag available: the read is not considered multimapped.
        multimapped = False
    if multimapped and multimapped_mode == "none":
        return True

    if read_sequence.aQual < minaqual:
        read_stats.add_low_quality_read(read_sequence=read_sequence)
        return True

    return False
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import pysam
|
|
3
|
+
import HTSeq
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ReadsIO(object):
    """
    Input/output handling for one SAM/BAM file.

    The object opens the input alignment file, optionally opens an output
    file for the annotated records and exposes ``read_seq``, the iterable of
    reads (or read pairs, for paired-end data) to be processed.
    """

    def __init__(
        self,
        sam_filename,
        samout_filename,
        samout_format,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):
        """
        Open the input file, the optional output file and the read iterator.

        Parameters
        ----------
        sam_filename : str
            Path of the SAM/BAM file to read, or "-" for standard input.
        samout_filename : str
            Name of the file to write annotated records into, or None.
        samout_format : str
            Format of the output file. Choices: SAM, BAM, sam, bam.
        supplementary_alignment_mode : str
            Whether to score supplementary alignments. Choices: score, ignore.
        secondary_alignment_mode : str
            Whether to score secondary alignments. Choices: score, ignore.
        order : str
            Sorting order of the alignment file, 'pos' or 'name'.
        max_buffer_size : int
            Number of reads kept in memory while looking for mates.
        """
        # True for paired-end input; set by _set_read_seq.
        self.pe_mode = None
        # Iterable over the reads (or read pairs); set by _set_read_seq.
        self.read_seq = None
        # Reader of the input file; set by _set_BAM_reader.
        self.read_seq_file = None
        # Output template and handle; set by _set_output_template.
        self.template = None
        self.samoutfile = None
        self.samout_format = samout_format

        self._set_BAM_reader(sam_filename)
        self._set_output_template(samout_filename, samout_format)
        try:
            self._set_read_seq(
                supplementary_alignment_mode,
                secondary_alignment_mode,
                order,
                max_buffer_size,
            )
        except BaseException:
            # The output file is already open at this point; do not leak it
            # when the read iterator cannot be prepared.
            self.close_samoutfile()
            raise

    def write_to_samout(self, read_sequence, assignment):
        """
        Write a read (or read pair) with its assignment to the output file.

        The assignment is appended to each read as an "XF" optional field.
        Nothing happens when no output file is open.

        Parameters
        ----------
        read_sequence :
            A single read, or a pair of reads in paired-end mode. Mates that
            are None are skipped.
        assignment : str
            Value stored in the "XF" optional field.

        Raises
        ------
        ValueError
            If there is no output template and the output format is not SAM.
        """
        if self.samoutfile is None:
            return
        if not self.pe_mode:
            # Treat a single read as a one-element sequence so that both
            # modes share the loop below.
            read_sequence = (read_sequence,)
        for read in read_sequence:
            if read is None:
                continue
            read.optional_fields.append(("XF", assignment))
            if self.template is not None:
                self.samoutfile.write(read.to_pysam_AlignedSegment(self.template))
            elif self.samout_format in ("SAM", "sam"):
                self.samoutfile.write(read.get_sam_line() + "\n")
            else:
                raise ValueError(
                    "BAM/SAM output: no template and not a test SAM file",
                )

    def close_samoutfile(self):
        """Close the output file, if one is open."""
        if self.samoutfile is not None:
            self.samoutfile.close()

    def _set_BAM_reader(self, sam_filename):
        """
        Create the reader for the input SAM/BAM file.

        Parameters
        ----------
        sam_filename : str
            Path of the SAM/BAM file to read the alignment records from, or
            "-" to read them from standard input.

        """
        source = sys.stdin if sam_filename == "-" else sam_filename
        self.read_seq_file = HTSeq.BAM_Reader(source)

    def get_chromosome_names_header(self):
        """Read the header and return the list of contig names, or None if it has no SQ entry."""
        sq = self.read_seq_file.get_header_dict().get("SQ")
        if sq is None:
            return None
        # Records lacking a (non-empty) SN entry are left out.
        return [record.get("SN") for record in sq if record.get("SN")]

    def _set_read_seq(
        self,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):
        """
        Prepare the iterator over the reads of the input file.

        Note, only run this after _set_BAM_reader as it needs
        self.read_seq_file to be set. The first record is read to find out
        whether the data is paired-end; for paired-end data the iterator is
        wrapped so that it yields read pairs.

        Parameters
        ----------
        supplementary_alignment_mode : str
            Whether to score supplementary alignments (0x800 flag).
            Choices: score or ignore.
        secondary_alignment_mode : str
            Whether to score secondary alignments (0x100 flag).
            Choices: score or ignore.
        order : str
            Can only be either 'pos' or 'name'. Sorting order of the
            alignment file.
        max_buffer_size : int
            When the alignment file is paired end sorted by position, allow
            only so many reads to stay in memory until the mates are found
            (raising this number will use more memory).
            Has no effect for single end or paired end sorted by name.

        Raises
        ------
        ValueError
            If the data is paired-end and ``order`` is neither 'name' nor
            'pos'.

        """
        read_seq_iter = iter(self.read_seq_file)
        # Catch empty BAM files. Any ordinary error while fetching the first
        # record is treated as "no reads" (best effort, as before), but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        try:
            first_read = next(read_seq_iter)
            self.pe_mode = first_read.paired_end
        except Exception:
            first_read = None
            self.pe_mode = False

        if first_read is None:
            self.read_seq = []
            return

        # Put the record that was peeked at back in front of the iterator.
        self.read_seq = itertools.chain([first_read], read_seq_iter)
        if not self.pe_mode:
            return

        primary_only = (supplementary_alignment_mode == "ignore") and (
            secondary_alignment_mode == "ignore"
        )
        if order == "name":
            self.read_seq = HTSeq.pair_SAM_alignments(
                self.read_seq, primary_only=primary_only
            )
        elif order == "pos":
            self.read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                self.read_seq,
                max_buffer_size=max_buffer_size,
                primary_only=primary_only,
            )
        else:
            raise ValueError("Illegal order specified.")

    def _set_output_template(self, samout_filename, samout_format):
        """
        Set up the SAM/BAM output file (and corresponding template) if possible.

        Parameters
        ----------
        samout_filename : str
            The name of SAM/BAM file to write out all SAM alignment records
            into, or None for no output.
        samout_format : str
            Format of the output file.
            Choices: SAM, BAM, sam, bam.

        """
        if samout_filename is None:
            self.template = None
            self.samoutfile = None
        elif samout_format in ("bam", "BAM"):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "wb",
                template=self.template,
            )
        elif (samout_format in ("sam", "SAM")) and hasattr(
            self.read_seq_file, "get_template"
        ):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "w",
                template=self.template,
            )
        else:
            # No template available: fall back to a plain text file that
            # write_to_samout fills with SAM lines.
            self.template = None
            self.samoutfile = open(samout_filename, "w")
|