HTSeq 2.1.2__cp313-cp313-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,465 @@
1
+ import itertools
2
+ import random
3
+ import sys
4
+
5
+ from HTSeq.scripts.utils import invert_strand, UnknownChrom
6
+ from HTSeq.scripts.count_features.reads_io_processor import ReadsIO
7
+ from HTSeq.scripts.count_features.reads_stats import ReadsStatistics
8
+
9
+
10
def count_reads_single_file(
    isam,
    sam_filename,
    features,
    feature_attr,
    order,
    max_buffer_size,
    stranded,
    overlap_mode,
    multimapped_mode,
    secondary_alignment_mode,
    supplementary_alignment_mode,
    feature_type,
    id_attribute,
    additional_attributes,
    quiet,
    minaqual,
    samout_format,
    samout_filename,
):
    """
    Count the reads of one input SAM/BAM file against the feature set.

    The function opens the alignment file through ``ReadsIO``, warns when the
    alignment header and the features share no chromosome name, then walks
    over every read (or read pair), filters it, maps it onto the features and
    records the outcome in a ``ReadsStatistics`` object.

    Parameters
    ----------
    isam : int
        Index of the input file being processed by this call (used for
        parallel processing); forwarded to ``read_stats.get_output``.
    sam_filename : str
        Path to the SAM/BAM file containing the mapped reads.
    features : object
        Feature container, per the original note supplied by
        HTSeq.make_feature_genomicarrayofsets. This function reads its
        ``chrom_vectors`` mapping and hands it to
        ``_align_reads_to_feature_set``.
    feature_attr : object
        Forwarded unchanged to ``ReadsStatistics``; per the original note
        supplied by HTSeq.make_feature_genomicarrayofsets.
    order : str
        Can only be either 'pos' or 'name'. Sorting order of <alignment_file>.
    max_buffer_size : int
        The number of reads allowed to stay in memory until mates are found.
        Used when <alignment_file> is paired end sorted by position.
    stranded : str
        Whether the data to be aligned is from a strand-specific assay.
        Option is yes, no, reverse.
        reverse means yes with reversed strand interpretation.
    overlap_mode : str
        Mode to handle reads overlapping more than one feature.
        Choices: union, intersection-strict, intersection-nonempty.
    multimapped_mode : str
        Whether and how to score reads that are not uniquely aligned or
        ambiguously assigned to features.
        Choices: none, all, fraction, random.
    secondary_alignment_mode : str
        Whether to score secondary alignments (0x100 flag).
        Choices: score or ignore.
    supplementary_alignment_mode : str
        Whether to score supplementary alignments (0x800 flag).
        Choices: score or ignore.
    feature_type : str
        Not used by this function; kept for interface compatibility.
    id_attribute : str
        Not used by this function; kept for interface compatibility.
    additional_attributes : array
        Not used by this function; kept for interface compatibility.
    quiet : boolean
        When false, a final progress report is forced once all reads have
        been processed.
    minaqual : int
        Value denoting the MAPQ alignment quality of reads to skip.
    samout_format : str
        Format of the output files denoted by samouts.
        Choices: SAM, BAM, sam, bam.
    samout_filename : str
        The name of SAM/BAM file to write out all SAM alignment records into.

    Returns
    -------
    object
        Whatever ``read_stats.get_output(isam)`` returns.

    Raises
    ------
    Exception
        Any error raised while opening or processing the input is re-raised
        unchanged after a short message has been written to stderr.

    """
    try:
        read_io_obj = ReadsIO(
            sam_filename=sam_filename,
            samout_filename=samout_filename,
            samout_format=samout_format,
            supplementary_alignment_mode=supplementary_alignment_mode,
            secondary_alignment_mode=secondary_alignment_mode,
            order=order,
            max_buffer_size=max_buffer_size,
        )

        # If the BAM header is available, check that at least one of the
        # chromosomes is also found in the GTF/GFF file, otherwise the user
        # is probably doing something wrong (e.g. "chr1" vs "1").
        bam_chroms = read_io_obj.get_chromosome_names_header()
        if bam_chroms is not None:
            bam_chroms = set(bam_chroms)
            feature_chroms = set(features.chrom_vectors.keys())
            if not (bam_chroms & feature_chroms):
                sys.stderr.write(
                    "The alignment file has no chromosomes in common with the GFF/GTF "
                    "file. This will result in zero feature counts. Please check if the "
                    "references match, e.g. if you are using 'chr1' or '1' as "
                    "chromosome names.\n")

    except:
        # Catch-all is deliberate: the error is only annotated, then re-raised.
        sys.stderr.write("Error occurred when reading beginning of SAM/BAM file.\n")
        raise

    # From here on the (optional) SAM/BAM output file is open; the finally
    # clause guarantees it is closed even when counting fails half-way.
    try:
        try:
            read_stats = ReadsStatistics(
                feature_attr=feature_attr, read_io_object=read_io_obj
            )
        except:
            sys.stderr.write(
                "Error occurred when preparing object to store the reads' assignments\n"
            )
            raise

        # CIGAR match characters (including alignment match, sequence match,
        # and sequence mismatch).
        com = ("M", "=", "X")

        try:
            for r in read_io_obj.read_seq:
                read_stats.print_progress()
                read_stats.add_num_reads_processed()

                # get the interval/read sequence.
                if not read_io_obj.pe_mode:
                    skip_read = _assess_non_pe_read(
                        read_sequence=r,
                        read_stats=read_stats,
                        secondary_alignment_mode=secondary_alignment_mode,
                        supplementary_alignment_mode=supplementary_alignment_mode,
                        multimapped_mode=multimapped_mode,
                        minaqual=minaqual,
                    )

                    if skip_read:
                        continue
                    iv_seq = _get_iv_seq_non_pe_read(com, r, stranded)
                else:

                    # NOTE: the logic here is a little arbitrary and might benefit
                    # from an optional arg. If the reads are paired-end but one of
                    # the two is missing, ATM we rely on the other one for info,
                    # however the data is technically inconsistent and we might
                    # want to let the user choose.
                    skip_read = _assess_pe_read(
                        minaqual,
                        multimapped_mode,
                        r,
                        read_stats,
                        secondary_alignment_mode,
                        supplementary_alignment_mode,
                    )
                    if skip_read:
                        continue

                    iv_seq = _get_iv_seq_pe_read(com, r, stranded)

                # this bit updates the counts obtained from aligning reads to
                # feature sets.
                try:
                    fs = _align_reads_to_feature_set(features, iv_seq, overlap_mode)

                    _update_feature_set_counts(fs, multimapped_mode, r, read_stats)

                except UnknownChrom:
                    # Read lies on a chromosome absent from the features.
                    read_stats.add_empty_read(read_sequence=r)

        except:
            sys.stderr.write(
                "Error occured when processing input (%s):\n"
                % (read_io_obj.read_seq_file.get_line_number_string())
            )
            raise

        if not quiet:
            read_stats.print_progress(force_print=True)
    finally:
        read_io_obj.close_samoutfile()

    return read_stats.get_output(isam)
199
+
200
+
201
+ def _update_feature_set_counts(fs, multimapped_mode, read_sequence, read_stats):
202
+ """
203
+ Distribute the counts among the aligned feature set.
204
+
205
+ Parameters
206
+ ----------
207
+ fs : array
208
+ A list of feature set previously aligned to the read
209
+ multimapped_mode : str
210
+ How to handle read mapped to multiple features
211
+ read_sequence : array
212
+ Read sequence
213
+ read_stats : ReadsStatistics object
214
+ For updating bad reads
215
+
216
+ """
217
+ if fs is None or len(fs) == 0:
218
+ read_stats.add_empty_read(read_sequence=read_sequence)
219
+ elif len(fs) > 1:
220
+ read_stats.add_ambiguous_read(
221
+ read_sequence=read_sequence,
222
+ assignment="__ambiguous[" + "+".join(sorted(fs)) + "]",
223
+ )
224
+ else:
225
+ read_stats.add_good_read_assignment(
226
+ read_sequence=read_sequence, assignment=list(fs)[0]
227
+ )
228
+ if fs is not None and len(fs) > 0:
229
+ fs = list(fs)
230
+ if multimapped_mode == "none":
231
+ if len(fs) == 1:
232
+ read_stats.add_to_count(feature=fs[0])
233
+ elif multimapped_mode == "all":
234
+ for fsi in fs:
235
+ read_stats.add_to_count(feature=fsi)
236
+ elif multimapped_mode == "fraction":
237
+ val = 1.0 / len(fs)
238
+ for fsi in fs:
239
+ read_stats.add_to_count(feature=fsi, value=val)
240
+ elif multimapped_mode == "random":
241
+ fsi = random.choice(fs)
242
+ read_stats.add_to_count(feature=fsi)
243
+ else:
244
+ sys.exit("Illegal multimap mode.")
245
+
246
+
247
+ def _align_reads_to_feature_set(features, iv_seq, overlap_mode):
248
+ """
249
+ Align reads to feature set.
250
+
251
+ Parameters
252
+ ----------
253
+ features : array
254
+ A set of features to align the reads to
255
+ TODO not sure the type yet.
256
+ iv_seq : array
257
+ TODO not sure the type yet.
258
+ Read (or interval?) sequence
259
+ overlap_mode : str
260
+ How to select the features for read that not 100% aligned to a feature.
261
+
262
+ Returns
263
+ -------
264
+ fs : array
265
+ A set of features to align the reads to
266
+ TODO not sure the type yet.
267
+
268
+ """
269
+ if overlap_mode == "union":
270
+ fs = set()
271
+ for iv in iv_seq:
272
+ if iv.chrom not in features.chrom_vectors:
273
+ raise UnknownChrom
274
+ for iv2, fs2 in features[iv].steps():
275
+ fs = fs.union(fs2)
276
+ elif overlap_mode in ("intersection-strict", "intersection-nonempty"):
277
+ fs = None
278
+ for iv in iv_seq:
279
+ if iv.chrom not in features.chrom_vectors:
280
+ raise UnknownChrom
281
+ for iv2, fs2 in features[iv].steps():
282
+ if (len(fs2) > 0) or (overlap_mode == "intersection-strict"):
283
+ if fs is None:
284
+ fs = fs2.copy()
285
+ else:
286
+ fs = fs.intersection(fs2)
287
+ else:
288
+ sys.exit("Illegal overlap mode.")
289
+ return fs
290
+
291
+
292
+ def _get_iv_seq_pe_read(com, r, stranded):
293
+ """
294
+ Function to break down the read sequence into intervals which will
295
+ subsequently be processed.
296
+
297
+ Parameters
298
+ ----------
299
+ com : array
300
+ CIGAR match characters (including alignment match, sequence match, and
301
+ sequence mismatch
302
+ r :
303
+ todo update type
304
+ Read sequence
305
+ stranded : str
306
+ Whether the data to be aligned is from a strand-specific assay.
307
+ Option is yes, no, reverse.
308
+ reverse means yes with reversed strand interpretation.
309
+
310
+ Returns
311
+ -------
312
+ iv_seq :
313
+ todo update type
314
+
315
+ """
316
+ if r[0] is not None and r[0].aligned:
317
+ iv_seq = _get_iv_seq_pe_read_first(com, r[0], stranded)
318
+ else:
319
+ iv_seq = tuple()
320
+ if r[1] is not None and r[1].aligned:
321
+ iv_seq = _get_iv_seq_pe_read_second(com, iv_seq, r[1], stranded)
322
+ return iv_seq
323
+
324
+
325
+ def _assess_pe_read(
326
+ minaqual,
327
+ multimapped_mode,
328
+ read_sequence,
329
+ read_stats,
330
+ secondary_alignment_mode,
331
+ supplementary_alignment_mode,
332
+ ):
333
+ """
334
+ Function to check the read for paired end.
335
+
336
+ Parameters
337
+ ----------
338
+ minaqual : int
339
+ Value denoting the MAPQ alignment quality of reads to skip.
340
+ multimapped_mode : str
341
+ Whether and how to score reads that are not uniquely aligned or
342
+ ambiguously assigned to features.
343
+ Choices: none, all, fraction, random.
344
+ read_sequence :
345
+ todo update type
346
+ read_stats : ReadsStatistics object
347
+ Object which stores a bunch of statistics about the read sequences.
348
+ secondary_alignment_mode : str
349
+ Whether to score secondary alignments (0x100 flag).
350
+ Choices: score or ignore.
351
+ supplementary_alignment_mode : str
352
+ Whether to score supplementary alignments (0x800 flag).
353
+ Choices: score or ignore.
354
+
355
+ Returns
356
+ -------
357
+
358
+ """
359
+ # NOTE: Sometimes read1 is None or not aligned but read2 is fine, in that
360
+ # case we should not exclude the entire pair but rather use the interval
361
+ # of the second read
362
+ read1_miss = (read_sequence[0] is None) or (not read_sequence[0].aligned)
363
+ read2_miss = (read_sequence[1] is None) or (not read_sequence[1].aligned)
364
+ if read1_miss and read2_miss:
365
+ read_stats.add_not_aligned_read(read_sequence=read_sequence)
366
+ return True
367
+
368
+ if secondary_alignment_mode == "ignore":
369
+ if (read_sequence[0] is not None) and read_sequence[0].not_primary_alignment:
370
+ return True
371
+ elif (read_sequence[1] is not None) and read_sequence[1].not_primary_alignment:
372
+ return True
373
+ if supplementary_alignment_mode == "ignore":
374
+ if (read_sequence[0] is not None) and read_sequence[0].supplementary:
375
+ return True
376
+ elif (read_sequence[1] is not None) and read_sequence[1].supplementary:
377
+ return True
378
+ try:
379
+ if (
380
+ read_sequence[0] is not None and read_sequence[0].optional_field("NH") > 1
381
+ ) or (
382
+ read_sequence[1] is not None and read_sequence[1].optional_field("NH") > 1
383
+ ):
384
+ read_stats.add_not_unique_read(read_sequence=read_sequence)
385
+ if multimapped_mode == "none":
386
+ return True
387
+ except KeyError:
388
+ pass
389
+ if (read_sequence[0] and read_sequence[0].aQual < minaqual) or (
390
+ read_sequence[1] and read_sequence[1].aQual < minaqual
391
+ ):
392
+ read_stats.add_low_quality_read(read_sequence=read_sequence)
393
+ return True
394
+ return False
395
+
396
+
397
+ # Get GenomicInterval for each read, whether single-end or paired-end
398
+ def _get_iv_seq_non_pe_read(com, r, stranded):
399
+ if stranded != "reverse":
400
+ iv_seq = (co.ref_iv for co in r.cigar if co.type in com and co.size > 0)
401
+ else:
402
+ iv_seq = (
403
+ invert_strand(co.ref_iv)
404
+ for co in r.cigar
405
+ if (co.type in com and co.size > 0)
406
+ )
407
+ return iv_seq
408
+
409
+
410
+ def _get_iv_seq_pe_read_first(com, read, stranded):
411
+ if stranded != "reverse":
412
+ iv_seq = (co.ref_iv for co in read.cigar if co.type in com and co.size > 0)
413
+ else:
414
+ iv_seq = (
415
+ invert_strand(co.ref_iv)
416
+ for co in read.cigar
417
+ if co.type in com and co.size > 0
418
+ )
419
+ return iv_seq
420
+
421
+
422
+ def _get_iv_seq_pe_read_second(com, iv_seq, read, stranded):
423
+ if stranded != "reverse":
424
+ iv_seq = itertools.chain(
425
+ iv_seq,
426
+ (
427
+ invert_strand(co.ref_iv)
428
+ for co in read.cigar
429
+ if co.type in com and co.size > 0
430
+ ),
431
+ )
432
+ else:
433
+ iv_seq = itertools.chain(
434
+ iv_seq, (co.ref_iv for co in read.cigar if co.type in com and co.size > 0)
435
+ )
436
+ return iv_seq
437
+
438
+
439
+ def _assess_non_pe_read(
440
+ read_sequence,
441
+ read_stats,
442
+ secondary_alignment_mode,
443
+ supplementary_alignment_mode,
444
+ multimapped_mode,
445
+ minaqual,
446
+ ):
447
+ if not read_sequence.aligned:
448
+ read_stats.add_not_aligned_read(read_sequence=read_sequence)
449
+ return True
450
+ if (secondary_alignment_mode == "ignore") and read_sequence.not_primary_alignment:
451
+ return True
452
+ if (supplementary_alignment_mode == "ignore") and read_sequence.supplementary:
453
+ return True
454
+ try:
455
+ if read_sequence.optional_field("NH") > 1:
456
+ read_stats.add_not_unique_read(read_sequence=read_sequence)
457
+ if multimapped_mode == "none":
458
+ return True
459
+ except KeyError:
460
+ pass
461
+ if read_sequence.aQual < minaqual:
462
+ read_stats.add_low_quality_read(read_sequence=read_sequence)
463
+ return True
464
+
465
+ return False
@@ -0,0 +1,187 @@
1
+ import itertools
2
+ import pysam
3
+ import HTSeq
4
+ import sys
5
+
6
+
7
class ReadsIO(object):
    """
    Input reader, read iterator and optional SAM/BAM output for one file.

    Attributes
    ----------
    read_seq_file : HTSeq.BAM_Reader
        Reader opened on the input alignment file (or on stdin).
    read_seq : iterable
        Iterator over the reads; over read pairs when ``pe_mode`` is true.
    pe_mode : bool
        ``paired_end`` attribute of the first read; False for empty input.
    template : object or None
        Template obtained from the reader, used to write pysam records.
    samoutfile : object or None
        Open output file, or None when no output was requested.
    samout_format : str
        Requested output format (SAM, BAM, sam, bam).
    """

    def __init__(
        self,
        sam_filename,
        samout_filename,
        samout_format,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):
        """
        Open the input, prepare the optional output and the read iterator.

        Parameters
        ----------
        sam_filename : str
            Path to the input SAM/BAM file, or "-" to read from stdin.
        samout_filename : str or None
            Path of the SAM/BAM file to write annotated records into; None
            disables the output.
        samout_format : str
            Choices: SAM, BAM, sam, bam.
        supplementary_alignment_mode : str
            Choices: score or ignore.
        secondary_alignment_mode : str
            Choices: score or ignore.
        order : str
            'pos' or 'name'; only consulted for paired-end input.
        max_buffer_size : int
            Buffer size used when pairing position-sorted reads.
        """
        # Filled in by the _set_* helpers below.
        self.pe_mode = None
        self.read_seq = None
        self.read_seq_file = None
        self.template = None
        self.samoutfile = None
        self.samout_format = samout_format

        # Order matters: the output template and the read iterator both need
        # self.read_seq_file.
        self._set_BAM_reader(sam_filename)
        self._set_output_template(samout_filename, samout_format)
        self._set_read_seq(
            supplementary_alignment_mode,
            secondary_alignment_mode,
            order,
            max_buffer_size,
        )

    def write_to_samout(self, read_sequence, assignment):
        """
        Write the read(s) to the output file, tagged with their assignment.

        Does nothing when no output file is open. Each non-None read gets an
        ("XF", assignment) entry appended to its ``optional_fields`` before
        it is written.

        Raises
        ------
        ValueError
            When there is no template and the output format is not SAM.
        """
        if self.samoutfile is None:
            return
        if not self.pe_mode:
            # Single-end reads are wrapped so that both modes share the loop.
            read_sequence = (read_sequence,)
        for read in read_sequence:
            if read is None:
                continue
            read.optional_fields.append(("XF", assignment))
            if self.template is not None:
                self.samoutfile.write(read.to_pysam_AlignedSegment(self.template))
            elif self.samout_format in ("SAM", "sam"):
                self.samoutfile.write(read.get_sam_line() + "\n")
            else:
                raise ValueError(
                    "BAM/SAM output: no template and not a test SAM file",
                )

    def close_samoutfile(self):
        """Close the output file if one is open."""
        if self.samoutfile is not None:
            self.samoutfile.close()

    def _set_BAM_reader(self, sam_filename):
        """
        Open the input SAM/BAM file and store the reader.

        Parameters
        ----------
        sam_filename : str
            Path to the input SAM/BAM file; "-" reads from stdin.

        """
        if sam_filename == "-":
            self.read_seq_file = HTSeq.BAM_Reader(sys.stdin)
        else:
            self.read_seq_file = HTSeq.BAM_Reader(sam_filename)

    def get_chromosome_names_header(self):
        """
        Return the contig names listed in the alignment header.

        Returns
        -------
        list or None
            The non-empty "SN" values of the header's "SQ" records, in header
            order, or None when the header has no "SQ" entry.
        """
        sq = self.read_seq_file.get_header_dict().get("SQ")
        if sq is None:
            return None
        names = (sq_record.get("SN") for sq_record in sq)
        return [sn for sn in names if sn]

    def _set_read_seq(
        self,
        supplementary_alignment_mode,
        secondary_alignment_mode,
        order,
        max_buffer_size,
    ):
        """
        Prepare the BAM/SAM file iterator.

        Note, only run this after _set_BAM_reader as you need
        self.read_seq_file to be set. The first read is peeked at to decide
        between single-end and paired-end iteration; for paired-end input the
        reads are wrapped by one of HTSeq's pairing iterators.

        Parameters
        ----------
        supplementary_alignment_mode : str
            Whether to score supplementary alignments (0x800 flag).
            Choices: score or ignore.
        secondary_alignment_mode : str
            Whether to score secondary alignments (0x100 flag).
            Choices: score or ignore.
        order : str
            Can only be either 'pos' or 'name'. Sorting order of
            <alignment_file>. Only checked for paired-end input.
        max_buffer_size : int
            When <alignment_file> is paired end sorted by position, allow only
            so many reads to stay in memory until the mates are found (raising
            this number will use more memory).
            Has no effect for single end or paired end sorted by name.

        Raises
        ------
        ValueError
            For paired-end input when ``order`` is neither 'name' nor 'pos'.

        """
        read_seq_iter = iter(self.read_seq_file)
        # Catch empty BAM files. Any ordinary error while fetching the first
        # read is treated as "no reads" (best effort, as before), but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            first_read = next(read_seq_iter)
            self.pe_mode = first_read.paired_end
        except Exception:
            first_read = None
            self.pe_mode = False

        if first_read is not None:
            # Put the peeked read back in front of the remaining ones.
            self.read_seq = itertools.chain([first_read], read_seq_iter)
        else:
            self.read_seq = []

        if self.pe_mode:
            primary_only = (supplementary_alignment_mode == "ignore") and (
                secondary_alignment_mode == "ignore"
            )
            if order == "name":
                self.read_seq = HTSeq.pair_SAM_alignments(
                    self.read_seq, primary_only=primary_only
                )
            elif order == "pos":
                self.read_seq = HTSeq.pair_SAM_alignments_with_buffer(
                    self.read_seq,
                    max_buffer_size=max_buffer_size,
                    primary_only=primary_only,
                )
            else:
                raise ValueError("Illegal order specified.")

    def _set_output_template(self, samout_filename, samout_format):
        """
        Set up the SAM/BAM output files (and corresponding template) if possible.

        Parameters
        ----------
        samout_filename : str or None
            The name of SAM/BAM file to write out all SAM alignment records
            into; None disables the output.
        samout_format : str
            Format of the output files denoted by samouts.
            Choices: SAM, BAM, sam, bam.

        """
        if samout_filename is None:
            self.template = None
            self.samoutfile = None
        elif samout_format in ("bam", "BAM"):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "wb",
                template=self.template,
            )
        elif (samout_format in ("sam", "SAM")) and hasattr(
            self.read_seq_file, "get_template"
        ):
            self.template = self.read_seq_file.get_template()
            self.samoutfile = pysam.AlignmentFile(
                samout_filename,
                "w",
                template=self.template,
            )
        else:
            # No template available: fall back to a plain text file, which
            # write_to_samout fills with raw SAM lines.
            self.template = None
            self.samoutfile = open(samout_filename, "w")