HTSeq 2.1.2__cp310-cp310-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
HTSeq/StretchVector.py ADDED
@@ -0,0 +1,491 @@
1
+ from collections import namedtuple
2
+ import numpy as np
3
+ import warnings
4
+
5
+
6
Interval = namedtuple('Interval', ['start', 'end'])


class StretchVector:
    """Sparse representation for 'islands' of dense data on a long line.

    This class is the most basic object representing 'windows of data' in a sea
    of unknowns of a linear structure, e.g. a chromosome. The basic design is to
    store a list of (start, end) intervals, each of them associated with a
    "stretch", i.e. a dense numpy array.

    In its simplest form, a StretchVector can be thought of as an array with
    an arbitrarily large offset. This can be useful e.g. to plot coverage or
    signal enrichments while keeping real genomic coordinates.

    The class does more heavy lifting, however, when several separate arrays
    are used, e.g. to represent "peaks" of signals (ATAC-Seq, ChIP-Seq, etc.).
    In that situation, the bookkeeping involved in manipulating all offsets
    correctly can become burdensome quickly, so StretchVector takes care of it.
    In addition, extension of data to flanks is easy, so if you want to expand
    an existing data window by 1kb on each side, that can be done directly:

    >>> sv = HTSeq.StretchVector(typecode="d")
    >>> sv[6789: 8900] = 56.8
    >>> sv[6000: 7000] = 20  # <-- left extension
    >>> sv[8000: 9000] = 10  # <-- right extension

    You can use StretchVector as a storage option for higher level objects such
    as ChromVector and GenomicArray. Those classes support strandedness, unlike
    StretchVector itself.

    Invariant maintained by every method of this class: ``ivs`` is sorted by
    coordinate, its intervals are half-open ``[start, end)`` and do not
    overlap, and ``stretches[i]`` has length ``ivs[i].end - ivs[i].start``.
    Adjacent (touching) stretches are kept separate; only overlapping writes
    merge stretches.
    """
    # Maps the user-facing typecode to the numpy dtype used for the stretches
    _typecode_dict = {
        "d": np.float32,
        "i": np.int32,
        "l": np.int64,
        "O": object,
    }

    def __init__(self, typecode):
        """Create an empty StretchVector of a given type

        Args:
            typecode ("d", "i", "l", or "O"): The dtype of the stored data. Can
              be "d" (stored as np.float32), "i" (np.int32), "l" (np.int64),
              or "O" (generic object). Note that np.float64 data will be recast
              to np.float32, which might lead to a loss of machine precision.

        Returns:
            A StretchVector instance of the chosen type.
        """
        self.typecode = typecode
        self.ivs = []
        self.stretches = []

    def _slice_bounds(self, index):
        """Resolve a slice into concrete (start, stop) coordinates.

        Slice attributes are read-only, so defaults are resolved into local
        values instead of being written back onto the slice object.

        Raises:
            ValueError: if the slice has a step other than 1.
            IndexError: if the stop is open and there is no stretch to infer
              it from.
        """
        if index.step is not None and index.step != 1:
            raise ValueError(
                "Striding slices (i.e., step != 1) are not supported")
        start = 0 if index.start is None else index.start
        stop = index.stop
        if stop is None:
            if len(self.ivs) == 0:
                raise IndexError('No stretches, cannot find end')
            stop = self.ivs[-1].end
        return start, stop

    def _in_stretch(self, index):
        """Return the list position of the stretch containing index, or -1."""
        for i, iv in enumerate(self.ivs):
            # Intervals are sorted: once we are past index, nothing can match
            if index < iv.start:
                return -1
            if index < iv.end:
                return i
        return -1

    def _get_interval(self, start, end):
        """Return a new StretchVector restricted to [start, end).

        The stretches of the result are numpy views on the stretches of this
        object, so writing into them modifies the parent as well. If this
        object has no stretches, it is returned itself.
        """
        if len(self.stretches) == 0:
            return self

        new_sv = self.__class__(self.typecode)
        for iv, stretch in zip(self.ivs, self.stretches):
            # Ends before the requested start, skip
            if iv.end <= start:
                continue
            # Starts after the requested end: so do all remaining ones
            if iv.start >= end:
                break

            # Clip the stretch to the requested window
            lo = max(start, iv.start)
            hi = min(end, iv.end)
            if hi <= lo:
                continue
            new_sv.ivs.append(Interval(lo, hi))
            new_sv.stretches.append(stretch[lo - iv.start: hi - iv.start])
        return new_sv

    def _set_interval(self, start, end, values):
        """Write values into [start, end), creating/merging stretches as needed.

        Existing stretches that overlap [start, end) are merged with the new
        data into a single stretch; data of those stretches outside of
        [start, end) is preserved. Stretches that merely touch the interval
        are left alone.

        Args:
            start (int): first coordinate to set (inclusive).
            end (int): last coordinate to set (exclusive).
            values: scalar or sequence broadcastable to length end - start.

        Returns:
            The list position of the stretch that now holds the data, or None
            if the interval is empty (nothing is written in that case).

        Raises:
            ValueError: if end < start.
        """
        if end < start:
            raise ValueError(
                "Interval end ({}) is before its start ({})".format(end, start))
        if end == start:
            return None

        dtype = self._typecode_dict[self.typecode]

        overlapping = [
            i for i, iv in enumerate(self.ivs)
            if (iv.start < end) and (iv.end > start)
        ]

        # No overlap at all: insert a brand new stretch at its sorted position
        if not overlapping:
            new_stretch = np.zeros(end - start, dtype)
            new_stretch[:] = values
            pos = sum(1 for iv in self.ivs if iv.end <= start)
            self.ivs.insert(pos, Interval(start, end))
            self.stretches.insert(pos, new_stretch)
            return pos

        first, last = overlapping[0], overlapping[-1]
        iv_first, iv_last = self.ivs[first], self.ivs[last]

        # Fully inside one stretch: write in place so existing views stay valid
        if (first == last) and (iv_first.start <= start) and (end <= iv_first.end):
            self.stretches[first][
                start - iv_first.start: end - iv_first.start] = values
            return first

        # Otherwise build the merged stretch first, and only then touch the
        # lists, so a failing assignment leaves the object unchanged
        new_start = min(start, iv_first.start)
        new_end = max(end, iv_last.end)
        new_stretch = np.zeros(new_end - new_start, dtype)
        if iv_first.start < start:
            n_left = start - iv_first.start
            new_stretch[:n_left] = self.stretches[first][:n_left]
        if iv_last.end > end:
            n_right = iv_last.end - end
            new_stretch[-n_right:] = self.stretches[last][-n_right:]
        new_stretch[start - new_start: end - new_start] = values

        self.ivs[first:last + 1] = [Interval(new_start, new_end)]
        self.stretches[first:last + 1] = [new_stretch]
        return first

    def _add_stretch(self, start, end, i_start=0):
        """Insert a zero-filled stretch [start, end) at its sorted position.

        Args:
            start (int): start of the new stretch.
            end (int): end (exclusive) of the new stretch.
            i_start (int): list position from which to start searching for
              the insertion point.

        Returns:
            The list position of the new stretch.
        """
        new_stretch = np.zeros(end - start, self._typecode_dict[self.typecode])
        for i in range(i_start, len(self.ivs)):
            if start < self.ivs[i].start:
                self.ivs.insert(i, Interval(start, end))
                self.stretches.insert(i, new_stretch)
                return i

        self.ivs.append(Interval(start, end))
        self.stretches.append(new_stretch)
        return len(self.ivs) - 1

    def __getitem__(self, index):
        """Get a view of a portion of the StretchVector

        Args:
            index (int, slice, or GenomicInterval): Coordinate or interval to
              extract. For slices, the stretches from this intervals are
              *views* of the original array, so changes in them will be
              reflected in the parent StretchVector as well.
        Returns:
            A number if index is an int, containing the value at that site, or
            None if that site is not covered by any stretch.
            A StretchVector with adapted stretches if index is anything else.
        Raises:
            ValueError: for slices with a step other than 1.
            IndexError: for open-ended slices on an empty StretchVector.
            TypeError: if index is of an unsupported type.
        """
        if isinstance(index, (int, np.integer)):
            idx_iv = self._in_stretch(index)
            if idx_iv == -1:
                return None
            return self.stretches[idx_iv][index - self.ivs[idx_iv].start]

        if isinstance(index, slice):
            start, stop = self._slice_bounds(index)
            return self._get_interval(start, stop)

        # Imported lazily (presumably to avoid a circular import with the
        # HTSeq package), and only when int/slice did not match
        from HTSeq import GenomicInterval

        if isinstance(index, GenomicInterval):
            return self.__getitem__(slice(index.start, index.end))

        raise TypeError(
            "Unsupported index type: " + type(index).__name__)

    def __setitem__(self, index, values):
        """Set/reset values within or outside the stretch

        Args:
            index (int, slice, or GenomicInterval): Coordinate or interval to
              set. These can be within current stretches, between them, outside,
              or partially overlapping.
            values (numpy.ndarray or convertible sequence): Values to set or
              reset at those locations. This will be recast as a numpy array
              of the appropriate dtype within this function.

        Returns: None

        Raises:
            ValueError: for slices with a step other than 1.
            IndexError: for open-ended slices on an empty StretchVector.
            TypeError: if index is of an unsupported type.
        """
        # Leave dtype out for now for speed, it will be taken care of later on
        values = np.asarray(values)

        if isinstance(index, (int, np.integer)):
            idx_iv = self._in_stretch(index)
            if idx_iv == -1:
                idx_iv = self._add_stretch(index, index + 1)
            self.stretches[idx_iv][index - self.ivs[idx_iv].start] = values
            return

        if isinstance(index, slice):
            start, stop = self._slice_bounds(index)
            self._set_interval(start, stop, values)
            return

        # Imported lazily (presumably to avoid a circular import with the
        # HTSeq package), and only when int/slice did not match
        from HTSeq import GenomicInterval

        if isinstance(index, GenomicInterval):
            self.__setitem__(slice(index.start, index.end), values)
            return

        raise TypeError(
            "Unsupported index type: " + type(index).__name__)

    def todense(self):
        """Dense numpy array of the whole stretch, using NaNs for missing data

        The array spans from the start of the first stretch to the end of the
        last one.

        Raises:
            ValueError: if there are gaps between stretches and the dtype is
              an integer type, which cannot represent NaN.
        """
        if len(self.ivs) == 0:
            return np.empty(0, self._typecode_dict[self.typecode])

        if len(self.ivs) == 1:
            return self.stretches[0].copy()

        # At least two stretches, have to stitch them
        start = self.ivs[0].start
        res = np.empty(
            self.ivs[-1].end - start,
            self.stretches[0].dtype,
        )

        # NaN padding is only needed (and only possible for non-integer
        # dtypes) when the stretches do not tile the whole range
        has_gaps = any(
            left.end != right.start
            for left, right in zip(self.ivs, self.ivs[1:])
        )
        if has_gaps:
            if res.dtype.kind in "iub":
                raise ValueError(
                    "Cannot mark gaps with NaN in an array of dtype "
                    + str(res.dtype))
            res[:] = np.nan

        for iv, stretch in zip(self.ivs, self.stretches):
            res[iv.start - start: iv.end - start] = stretch

        return res

    @classmethod
    def from_dense(cls, array, offset=0):
        """Create from dense array with NaNs

        Every maximal run of non-NaN entries becomes one stretch. The
        stretches are copies, so the result does not alias the input array.

        Args:
            array (numpy.array): dense 1-dimensional array containing NaNs at
              positions to be skipped.
            offset (int): Start of the initial interval.

        Returns:
            A new instance whose typecode matches the dtype of array.

        Raises:
            TypeError: if no typecode corresponds to the dtype of array.
        """
        if array.dtype == np.float64:
            warnings.warn("np.float64 array converted to np.float32")
            array = array.astype(np.float32)

        for typecode, dtype in cls._typecode_dict.items():
            if np.dtype(dtype) == array.dtype:
                sv = cls(typecode=typecode)
                break
        else:
            raise TypeError('Typecode not found for dtype: '+str(array.dtype))

        if len(array) == 0:
            return sv

        if array.dtype == object:
            # np.isnan does not work on object arrays: test floats one by one
            missing = np.fromiter(
                (isinstance(x, (float, np.floating)) and x != x
                 for x in array),
                dtype=bool,
                count=len(array),
            )
        else:
            missing = np.isnan(array)

        # Pad with "missing" on both sides so that every run of present data
        # has both a start edge and an end edge; edges then alternate
        # start, end, start, end, ...
        present = np.concatenate(([False], ~missing, [False]))
        edges = np.flatnonzero(present[1:] != present[:-1])
        for run_start, run_end in zip(edges[::2], edges[1::2]):
            sv.ivs.append(
                Interval(offset + int(run_start), offset + int(run_end)))
            sv.stretches.append(array[run_start:run_end].copy())

        return sv

    def __iter__(self):
        """Iterate over intervals and stretches ("islands")."""
        return zip(self.ivs, self.stretches)

    def copy(self):
        """Make a copy of the StretchVector and of all its stretches"""
        sv_new = self.__class__(typecode=self.typecode)
        for iv, stretch in self:
            sv_new.ivs.append(Interval(iv.start, iv.end))
            sv_new.stretches.append(stretch.copy())
        return sv_new

    def shift(self, offset):
        """Shift all stretch intervals by a constant number

        Args:
            offset (int): Shift the start and end coordinates of each stretch
              by this amount.
        Returns:
            None. The function acts in place.
        """
        for i, iv in enumerate(self.ivs):
            self.ivs[i] = Interval(iv.start + offset, iv.end + offset)
Binary file
@@ -0,0 +1,85 @@
1
+ # Internal HTSeq functions, not part of the API
2
+ import HTSeq
3
+ import numpy
4
+
5
+
6
def GenomicInterval_range(gi, step):
    """Yield a GenomicPosition every `step` coordinates of a GenomicInterval.

    Positions run from gi.start (inclusive) towards gi.end (exclusive) and
    carry the chromosome and strand of the interval.
    """
    yield from (
        HTSeq.GenomicPosition(gi.chrom, coordinate, gi.strand)
        for coordinate in range(gi.start, gi.end, step)
    )
9
+
10
+
11
def GenomicInterval_xranged(gi, step):
    """Yield GenomicPositions along a GenomicInterval in its strand direction.

    Iterates from gi.start_d towards gi.end_d; on the "-" strand the step is
    negated so that the walk proceeds in the opposite direction.
    """
    directed_step = -step if gi.strand == "-" else step
    yield from (
        HTSeq.GenomicPosition(gi.chrom, coordinate, gi.strand)
        for coordinate in range(gi.start_d, gi.end_d, directed_step)
    )
16
+
17
+
18
def ChromVector_steps(cv):
    '''Steps over a ChromVector

    Yields (GenomicInterval, value) pairs, one per run of equal consecutive
    values, dispatching on the type of the storage object cv.array.

    NOTE: ChromVectors use an offset which is also iv.start compared to their
    storage objects that start at 0.

    Raises:
        SystemError: if cv.array is of an unknown storage type.
    '''
    # "Steps" of an ndarray (or memmap?)-storaged ChromVector
    if isinstance(cv.array, numpy.ndarray):
        yield from _ndarray_steps(cv)

    # Steps of a StepVector-storaged ChromVector
    elif isinstance(cv.array, HTSeq.StepVector.StepVector):
        yield from _stepvector_steps(cv)

    # Steps in a StretchVector behave similar to a full numpy array, but uses
    # np.nan for None and treats each stretch as independent, of course
    # NOTE: one could optimize this by using np.diff and flips, as we have done
    # in the StretchVector methods. For now, we leave it like this because
    # the whole point of StretchVector is to be used for stretches, not steps.
    elif isinstance(cv.array, HTSeq.StretchVector):
        yield from _stretchvector_steps(cv)

    else:
        raise SystemError("Unknown array type.")


def _ndarray_steps(cv):
    '''Steps of a ChromVector stored in a dense numpy array.'''
    start = cv.iv.start
    prev_val = None
    for i in range(cv.iv.start, cv.iv.end):
        val = cv.array[i - cv.offset]
        starts_new_step = prev_val is None or val != prev_val
        if not starts_new_step:
            continue
        # The previous step is only emitted now that its end is known
        if prev_val is not None:
            yield (
                HTSeq.GenomicInterval(cv.iv.chrom, start, i, cv.iv.strand),
                prev_val,
            )
        prev_val = val
        start = i
    yield (
        HTSeq.GenomicInterval(cv.iv.chrom, start, cv.iv.end, cv.iv.strand),
        prev_val,
    )


def _stepvector_steps(cv):
    '''Steps of a ChromVector stored in a StepVector.'''
    window = cv.array[cv.iv.start - cv.offset: cv.iv.end - cv.offset]
    for start, stop, value in window.get_steps():
        yield (
            HTSeq.GenomicInterval(
                cv.iv.chrom, start + cv.offset, stop + cv.offset, cv.iv.strand),
            value,
        )


def _stretchvector_steps(cv):
    '''Steps of a ChromVector stored in a StretchVector, stretch by stretch.'''
    for iv, stretch in cv.array:
        start = cv.offset + iv.start
        prev_val = None
        for i, val in enumerate(stretch, cv.offset + iv.start):
            # Subsequent NaNs, ignore
            if (prev_val is not None) and numpy.isnan(prev_val) and numpy.isnan(val):
                continue
            starts_new_step = prev_val is None or val != prev_val
            if not starts_new_step:
                continue
            # Delay yield of the first item until you meet the first
            # unequal item, i.e. until you know the end of the step
            if prev_val is not None:
                yield (
                    HTSeq.GenomicInterval(cv.iv.chrom, start, i, cv.iv.strand),
                    prev_val,
                )
            prev_val = val
            start = i
        yield (
            HTSeq.GenomicInterval(
                cv.iv.chrom, start, cv.offset + iv.end, cv.iv.strand),
            prev_val,
        )
78
+
79
+
80
def GenomicArray_steps(ga):
    """Chain the steps of every ChromVector stored in a GenomicArray.

    Walks ga.chrom_vectors chromosome by chromosome and strand by strand, in
    dictionary order, and yields each (interval, value) pair produced by the
    steps() method of the respective ChromVector.
    """
    for vectors_by_strand in ga.chrom_vectors.values():
        for chrom_vector in vectors_by_strand.values():
            yield from chrom_vector.steps()
Binary file