seqpro 0.8.1__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seqpro-0.8.1 → seqpro-0.9.0}/CHANGELOG.md +12 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/PKG-INFO +1 -1
- {seqpro-0.8.1 → seqpro-0.9.0}/pyproject.toml +1 -1
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_numba.py +24 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_utils.py +19 -19
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/alphabets/__init__.py +1 -2
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/alphabets/_alphabets.py +72 -24
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_array.py +10 -14
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_modifiers.py +79 -2
- {seqpro-0.8.1 → seqpro-0.9.0}/.gitattributes +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/bump.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/merge.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/publish.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/test.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.gitignore +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/.pre-commit-config.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/Cargo.lock +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/Cargo.toml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/LICENSE +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/README.md +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/meta.yaml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/pixi.lock +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/pixi.toml +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/__init__.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_analyzers.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_cleaners.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_encoders.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_modifiers.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_types.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/bed.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/experimental/_experimental.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/experimental/_visualizers.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/gtf.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/py.typed +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/__init__.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_gufuncs.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_types.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_utils.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/__init__.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/augmentation.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/tmm.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/xr/__init__.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/src/kshuffle.rs +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/src/lib.rs +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_pyranges.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_read.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_sort.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_with_length.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_analyzers.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_ohe.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_ragged.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_tokenize.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_transforms.py +0 -0
- {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_translate.py +0 -0
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from typing import Optional, Union, overload
|
|
2
4
|
|
|
3
5
|
import numba as nb
|
|
@@ -132,3 +134,25 @@ def gufunc_translate(
|
|
|
132
134
|
if (seq_kmers == kmer_keys[i]).all():
|
|
133
135
|
res[0] = kmer_values[i] # type: ignore
|
|
134
136
|
break
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@nb.guvectorize(
|
|
140
|
+
["(u1, u1[:], u1[:])"],
|
|
141
|
+
"(),(n)->()",
|
|
142
|
+
nopython=True,
|
|
143
|
+
cache=True,
|
|
144
|
+
)
|
|
145
|
+
def gufunc_complement_bytes(
|
|
146
|
+
seq: NDArray[np.uint8],
|
|
147
|
+
complement_map: NDArray[np.uint8],
|
|
148
|
+
res: NDArray[np.uint8] | None = None,
|
|
149
|
+
) -> NDArray[np.uint8]: # type: ignore
|
|
150
|
+
res[0] = complement_map[seq] # type: ignore
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
_COMP = np.frombuffer(bytes.maketrans(b"ACGT", b"TGCA"), np.uint8)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@nb.vectorize(["u1(u1)"], nopython=True, cache=True)
|
|
157
|
+
def ufunc_comp_dna(seq: NDArray[np.uint8]) -> NDArray[np.uint8]:
|
|
158
|
+
return _COMP[seq]
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import TypeVar, Union, cast, overload
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
from numpy.typing import NDArray
|
|
7
|
+
from typing_extensions import TypeGuard
|
|
7
8
|
|
|
8
9
|
NestedStr = Union[bytes, str, list["NestedStr"]]
|
|
9
10
|
"""String or nested list of strings"""
|
|
@@ -13,20 +14,22 @@ StrSeqType = Union[NestedStr, NDArray[Union[np.str_, np.object_, np.bytes_]]]
|
|
|
13
14
|
|
|
14
15
|
SeqType = Union[NestedStr, NDArray[Union[np.str_, np.object_, np.bytes_, np.uint8]]]
|
|
15
16
|
|
|
17
|
+
DTYPE = TypeVar("DTYPE", bound=np.generic)
|
|
16
18
|
|
|
17
|
-
@overload
|
|
18
|
-
def cast_seqs(seqs: NDArray[np.uint8]) -> NDArray[np.uint8]: ...
|
|
19
19
|
|
|
20
|
+
def is_dtype(
|
|
21
|
+
obj: object, dtype: DTYPE | np.dtype[DTYPE] | type[DTYPE]
|
|
22
|
+
) -> TypeGuard[NDArray[DTYPE]]:
|
|
23
|
+
return isinstance(obj, np.ndarray) and np.issubdtype(obj.dtype, dtype)
|
|
20
24
|
|
|
25
|
+
|
|
26
|
+
@overload
|
|
27
|
+
def cast_seqs(seqs: NDArray[np.uint8]) -> NDArray[np.uint8]: ...
|
|
21
28
|
@overload
|
|
22
29
|
def cast_seqs(seqs: StrSeqType) -> NDArray[np.bytes_]: ...
|
|
23
|
-
|
|
24
|
-
|
|
25
30
|
@overload
|
|
26
|
-
def cast_seqs(seqs: SeqType) -> NDArray[
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def cast_seqs(seqs: SeqType) -> NDArray[Union[np.bytes_, np.uint8]]:
|
|
31
|
+
def cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]: ...
|
|
32
|
+
def cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]:
|
|
30
33
|
"""Cast any sequence type to be a NumPy array of ASCII characters (or left alone as
|
|
31
34
|
8-bit unsigned integers if the input is OHE).
|
|
32
35
|
|
|
@@ -54,8 +57,8 @@ def cast_seqs(seqs: SeqType) -> NDArray[Union[np.bytes_, np.uint8]]:
|
|
|
54
57
|
|
|
55
58
|
def check_axes(
|
|
56
59
|
seqs: SeqType,
|
|
57
|
-
length_axis:
|
|
58
|
-
ohe_axis:
|
|
60
|
+
length_axis: int | bool | None = None,
|
|
61
|
+
ohe_axis: int | bool | None = None,
|
|
59
62
|
):
|
|
60
63
|
"""Raise errors if length_axis or ohe_axis is missing when they're needed. Pass
|
|
61
64
|
False to corresponding axis to not check for it.
|
|
@@ -63,16 +66,16 @@ def check_axes(
|
|
|
63
66
|
- ndarray with itemsize == 1 => length axis required.
|
|
64
67
|
- OHE array => length and OHE axis required.
|
|
65
68
|
"""
|
|
69
|
+
# OHE
|
|
70
|
+
if ohe_axis is None and is_dtype(seqs, np.uint8):
|
|
71
|
+
raise ValueError("Need an one hot encoding axis to process OHE sequences.")
|
|
72
|
+
|
|
66
73
|
# bytes or OHE
|
|
67
|
-
if length_axis is None and
|
|
74
|
+
if length_axis is None and is_dtype(seqs, np.bytes_) and seqs.itemsize == 1:
|
|
68
75
|
raise ValueError(
|
|
69
76
|
"Need a length axis to process an ndarray with itemsize == 1 (S1, u1)."
|
|
70
77
|
)
|
|
71
78
|
|
|
72
|
-
# OHE
|
|
73
|
-
if ohe_axis is None and isinstance(seqs, np.ndarray) and seqs.dtype == np.uint8:
|
|
74
|
-
raise ValueError("Need an one hot encoding axis to process OHE sequences.")
|
|
75
|
-
|
|
76
79
|
# length_axis != ohe_axis
|
|
77
80
|
if (
|
|
78
81
|
isinstance(length_axis, int)
|
|
@@ -82,9 +85,6 @@ def check_axes(
|
|
|
82
85
|
raise ValueError("Length and OHE axis must be different.")
|
|
83
86
|
|
|
84
87
|
|
|
85
|
-
DTYPE = TypeVar("DTYPE", bound=np.generic)
|
|
86
|
-
|
|
87
|
-
|
|
88
88
|
def array_slice(a: NDArray[DTYPE], axis: int, slice_: slice) -> NDArray[DTYPE]:
|
|
89
89
|
"""Slice an array from a dynamic axis."""
|
|
90
90
|
return a[(slice(None),) * (axis % a.ndim) + (slice_,)]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from ._alphabets import AminoAlphabet, NucleotideAlphabet
|
|
1
|
+
from ._alphabets import DNA, AminoAlphabet, NucleotideAlphabet
|
|
2
2
|
|
|
3
3
|
# NOTE the "*" character is termination i.e. STOP codon
|
|
4
4
|
canonical_codons_to_aas = {
|
|
@@ -69,7 +69,6 @@ canonical_codons_to_aas = {
|
|
|
69
69
|
}
|
|
70
70
|
|
|
71
71
|
|
|
72
|
-
DNA = NucleotideAlphabet(alphabet="ACGT", complement="TGCA")
|
|
73
72
|
RNA = NucleotideAlphabet(alphabet="ACGU", complement="UGCA")
|
|
74
73
|
AA = AminoAlphabet(*map(list, zip(*canonical_codons_to_aas.items())))
|
|
75
74
|
|
|
@@ -1,10 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from types import MethodType
|
|
1
4
|
from typing import Dict, List, Optional, Union, cast, overload
|
|
2
5
|
|
|
3
6
|
import numpy as np
|
|
4
7
|
from numpy.typing import NDArray
|
|
8
|
+
from typing_extensions import assert_never
|
|
5
9
|
|
|
6
|
-
from .._numba import
|
|
7
|
-
|
|
10
|
+
from .._numba import (
|
|
11
|
+
gufunc_complement_bytes,
|
|
12
|
+
gufunc_ohe,
|
|
13
|
+
gufunc_ohe_char_idx,
|
|
14
|
+
gufunc_translate,
|
|
15
|
+
ufunc_comp_dna,
|
|
16
|
+
)
|
|
17
|
+
from .._utils import SeqType, StrSeqType, cast_seqs, check_axes, is_dtype
|
|
8
18
|
|
|
9
19
|
|
|
10
20
|
class NucleotideAlphabet:
|
|
@@ -12,10 +22,11 @@ class NucleotideAlphabet:
|
|
|
12
22
|
"""Alphabet excluding ambiguous characters e.g. "N" for DNA."""
|
|
13
23
|
complement: str
|
|
14
24
|
array: NDArray[np.bytes_]
|
|
15
|
-
complement_map:
|
|
16
|
-
complement_map_bytes:
|
|
17
|
-
str_comp_table:
|
|
25
|
+
complement_map: dict[str, str]
|
|
26
|
+
complement_map_bytes: dict[bytes, bytes]
|
|
27
|
+
str_comp_table: dict[int, str]
|
|
18
28
|
bytes_comp_table: bytes
|
|
29
|
+
bytes_comp_array: NDArray[np.bytes_]
|
|
19
30
|
|
|
20
31
|
def __init__(self, alphabet: str, complement: str) -> None:
|
|
21
32
|
"""Parse and validate sequence alphabets.
|
|
@@ -36,9 +47,7 @@ class NucleotideAlphabet:
|
|
|
36
47
|
self.array = cast(
|
|
37
48
|
NDArray[np.bytes_], np.frombuffer(self.alphabet.encode("ascii"), "|S1")
|
|
38
49
|
)
|
|
39
|
-
self.complement_map
|
|
40
|
-
zip(list(self.alphabet), list(self.complement))
|
|
41
|
-
)
|
|
50
|
+
self.complement_map = dict(zip(list(self.alphabet), list(self.complement)))
|
|
42
51
|
self.complement_map_bytes = {
|
|
43
52
|
k.encode("ascii"): v.encode("ascii") for k, v in self.complement_map.items()
|
|
44
53
|
}
|
|
@@ -46,6 +55,7 @@ class NucleotideAlphabet:
|
|
|
46
55
|
self.bytes_comp_table = bytes.maketrans(
|
|
47
56
|
self.alphabet.encode("ascii"), self.complement.encode("ascii")
|
|
48
57
|
)
|
|
58
|
+
self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")
|
|
49
59
|
|
|
50
60
|
def __len__(self):
|
|
51
61
|
return len(self.alphabet)
|
|
@@ -109,23 +119,29 @@ class NucleotideAlphabet:
|
|
|
109
119
|
|
|
110
120
|
return _alphabet[idx].reshape(shape)
|
|
111
121
|
|
|
112
|
-
def complement_bytes(
|
|
122
|
+
def complement_bytes(
|
|
123
|
+
self, byte_arr: NDArray[np.bytes_], out: NDArray[np.bytes_] | None = None
|
|
124
|
+
) -> NDArray[np.bytes_]:
|
|
113
125
|
"""Get reverse complement of byte (S1) array.
|
|
114
126
|
|
|
115
127
|
Parameters
|
|
116
128
|
----------
|
|
117
129
|
byte_arr : ndarray[bytes]
|
|
118
130
|
"""
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
131
|
+
if out is None:
|
|
132
|
+
_out = out
|
|
133
|
+
else:
|
|
134
|
+
_out = out.view(np.uint8)
|
|
135
|
+
_out = gufunc_complement_bytes(
|
|
136
|
+
byte_arr.view(np.uint8), self.bytes_comp_array.view(np.uint8), _out
|
|
137
|
+
)
|
|
138
|
+
return _out.view("S1")
|
|
126
139
|
|
|
127
140
|
def rev_comp_byte(
|
|
128
|
-
self,
|
|
141
|
+
self,
|
|
142
|
+
byte_arr: NDArray[np.bytes_],
|
|
143
|
+
length_axis: int,
|
|
144
|
+
out: NDArray[np.bytes_] | None = None,
|
|
129
145
|
) -> NDArray[np.bytes_]:
|
|
130
146
|
"""Get reverse complement of byte (S1) array.
|
|
131
147
|
|
|
@@ -133,7 +149,7 @@ class NucleotideAlphabet:
|
|
|
133
149
|
----------
|
|
134
150
|
byte_arr : ndarray[bytes]
|
|
135
151
|
"""
|
|
136
|
-
out = self.complement_bytes(byte_arr)
|
|
152
|
+
out = self.complement_bytes(byte_arr, out)
|
|
137
153
|
return np.flip(out, length_axis)
|
|
138
154
|
|
|
139
155
|
def rev_comp_string(self, string: str):
|
|
@@ -150,6 +166,7 @@ class NucleotideAlphabet:
|
|
|
150
166
|
seqs: StrSeqType,
|
|
151
167
|
length_axis: Optional[int] = None,
|
|
152
168
|
ohe_axis: Optional[int] = None,
|
|
169
|
+
out: NDArray[np.bytes_] | None = None,
|
|
153
170
|
) -> NDArray[np.bytes_]: ...
|
|
154
171
|
@overload
|
|
155
172
|
def reverse_complement(
|
|
@@ -157,6 +174,7 @@ class NucleotideAlphabet:
|
|
|
157
174
|
seqs: NDArray[np.uint8],
|
|
158
175
|
length_axis: Optional[int] = None,
|
|
159
176
|
ohe_axis: Optional[int] = None,
|
|
177
|
+
out: NDArray[np.bytes_] | None = None,
|
|
160
178
|
) -> NDArray[np.uint8]: ...
|
|
161
179
|
@overload
|
|
162
180
|
def reverse_complement(
|
|
@@ -164,13 +182,15 @@ class NucleotideAlphabet:
|
|
|
164
182
|
seqs: SeqType,
|
|
165
183
|
length_axis: Optional[int] = None,
|
|
166
184
|
ohe_axis: Optional[int] = None,
|
|
185
|
+
out: NDArray[np.bytes_] | None = None,
|
|
167
186
|
) -> NDArray[Union[np.bytes_, np.uint8]]: ...
|
|
168
187
|
def reverse_complement(
|
|
169
188
|
self,
|
|
170
189
|
seqs: SeqType,
|
|
171
190
|
length_axis: Optional[int] = None,
|
|
172
191
|
ohe_axis: Optional[int] = None,
|
|
173
|
-
|
|
192
|
+
out: NDArray[np.bytes_] | None = None,
|
|
193
|
+
) -> NDArray[np.bytes_ | np.uint8]:
|
|
174
194
|
"""Reverse complement a sequence.
|
|
175
195
|
|
|
176
196
|
Parameters
|
|
@@ -190,14 +210,20 @@ class NucleotideAlphabet:
|
|
|
190
210
|
|
|
191
211
|
seqs = cast_seqs(seqs)
|
|
192
212
|
|
|
193
|
-
if seqs
|
|
213
|
+
if is_dtype(seqs, np.bytes_):
|
|
214
|
+
if length_axis is None:
|
|
215
|
+
length_axis = -1
|
|
216
|
+
return self.rev_comp_byte(seqs, length_axis, out)
|
|
217
|
+
elif is_dtype(seqs, np.uint8): # OHE
|
|
194
218
|
assert length_axis is not None
|
|
195
219
|
assert ohe_axis is not None
|
|
196
|
-
|
|
220
|
+
_out = np.flip(seqs, axis=(length_axis, ohe_axis))
|
|
221
|
+
if out is not None:
|
|
222
|
+
out[:] = _out
|
|
223
|
+
_out = out
|
|
224
|
+
return _out
|
|
197
225
|
else:
|
|
198
|
-
|
|
199
|
-
length_axis = -1
|
|
200
|
-
return self.rev_comp_byte(seqs, length_axis) # type: ignore
|
|
226
|
+
assert_never(seqs) # type: ignore
|
|
201
227
|
|
|
202
228
|
|
|
203
229
|
class AminoAlphabet:
|
|
@@ -334,3 +360,25 @@ class AminoAlphabet:
|
|
|
334
360
|
_alphabet = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])
|
|
335
361
|
|
|
336
362
|
return _alphabet[idx].reshape(shape)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
DNA = NucleotideAlphabet("ACGT", "TGCA")
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# * Monkey patch DNA instance with a faster complement function using
|
|
369
|
+
# * a static, const lookup table. The base method is slower because it uses a
|
|
370
|
+
# * dynamic lookup table.
|
|
371
|
+
def complement_bytes(
|
|
372
|
+
self: NucleotideAlphabet,
|
|
373
|
+
byte_arr: NDArray[np.bytes_],
|
|
374
|
+
out: NDArray[np.bytes_] | None = None,
|
|
375
|
+
) -> NDArray[np.bytes_]:
|
|
376
|
+
if out is None:
|
|
377
|
+
_out = out
|
|
378
|
+
else:
|
|
379
|
+
_out = out.view(np.uint8)
|
|
380
|
+
_out = ufunc_comp_dna(byte_arr.view(np.uint8), _out) # type: ignore
|
|
381
|
+
return _out.view("S1")
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
DNA.complement_bytes = MethodType(complement_bytes, DNA)
|
|
@@ -61,7 +61,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
|
|
|
61
61
|
if isinstance(data, RagParts):
|
|
62
62
|
content = _parts_to_content(data)
|
|
63
63
|
else:
|
|
64
|
-
content =
|
|
64
|
+
content = _as_ragged(data, highlevel=False)
|
|
65
65
|
super().__init__(content, behavior=deepcopy(ak.behavior))
|
|
66
66
|
self._parts = unbox(self)
|
|
67
67
|
type_parts: list[str] = []
|
|
@@ -223,7 +223,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
|
|
|
223
223
|
"""Note: not zero-copy if offsets or data are non-contiguous."""
|
|
224
224
|
arr = super().to_numpy(allow_missing=allow_missing)
|
|
225
225
|
if self.dtype.type == np.bytes_:
|
|
226
|
-
arr = arr[
|
|
226
|
+
arr = arr[..., None].view("S1")
|
|
227
227
|
return arr
|
|
228
228
|
|
|
229
229
|
def __getitem__(self, where):
|
|
@@ -232,7 +232,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
|
|
|
232
232
|
if _n_var(arr) == 1:
|
|
233
233
|
return type(self)(arr)
|
|
234
234
|
else:
|
|
235
|
-
return
|
|
235
|
+
return _as_ak(arr)
|
|
236
236
|
else:
|
|
237
237
|
return arr
|
|
238
238
|
|
|
@@ -293,7 +293,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
|
|
|
293
293
|
|
|
294
294
|
def to_ak(self):
|
|
295
295
|
"""Convert to an Awkward array."""
|
|
296
|
-
arr =
|
|
296
|
+
arr = _as_ak(self)
|
|
297
297
|
arr.behavior = None
|
|
298
298
|
return arr
|
|
299
299
|
|
|
@@ -331,12 +331,12 @@ def _n_var(arr: ak.Array) -> int:
|
|
|
331
331
|
|
|
332
332
|
|
|
333
333
|
@overload
|
|
334
|
-
def
|
|
334
|
+
def _as_ragged(
|
|
335
335
|
arr: ak.Array | Content, highlevel: Literal[True] = True
|
|
336
336
|
) -> ak.Array: ...
|
|
337
337
|
@overload
|
|
338
|
-
def
|
|
339
|
-
def
|
|
338
|
+
def _as_ragged(arr: ak.Array | Content, highlevel: Literal[False]) -> Content: ...
|
|
339
|
+
def _as_ragged(arr: ak.Array | Content, highlevel: bool = True) -> ak.Array | Content:
|
|
340
340
|
def fn(layout: Content, **kwargs):
|
|
341
341
|
if isinstance(layout, (ListArray, ListOffsetArray)):
|
|
342
342
|
return ak.with_parameter(
|
|
@@ -350,16 +350,12 @@ def _with_ragged(arr: ak.Array | Content, highlevel: bool = True) -> ak.Array |
|
|
|
350
350
|
|
|
351
351
|
|
|
352
352
|
@overload
|
|
353
|
-
def
|
|
353
|
+
def _as_ak(
|
|
354
354
|
arr: ak.Array | Ragged[DTYPE], highlevel: Literal[True] = True
|
|
355
355
|
) -> ak.Array: ...
|
|
356
356
|
@overload
|
|
357
|
-
def
|
|
358
|
-
|
|
359
|
-
) -> Content: ...
|
|
360
|
-
def _without_ragged(
|
|
361
|
-
arr: ak.Array | Ragged[DTYPE], highlevel: bool = True
|
|
362
|
-
) -> ak.Array | Content:
|
|
357
|
+
def _as_ak(arr: ak.Array | Ragged[DTYPE], highlevel: Literal[False]) -> Content: ...
|
|
358
|
+
def _as_ak(arr: ak.Array | Ragged[DTYPE], highlevel: bool = True) -> ak.Array | Content:
|
|
363
359
|
def fn(layout, **kwargs):
|
|
364
360
|
if isinstance(layout, (ListArray, ListOffsetArray)):
|
|
365
361
|
return ak.with_parameter(layout, "__list__", None, highlevel=False)
|
|
@@ -2,8 +2,9 @@ from collections import defaultdict
|
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import seqpro as sp
|
|
5
|
-
from
|
|
6
|
-
from seqpro.
|
|
5
|
+
from pytest_cases import parametrize_with_cases
|
|
6
|
+
from seqpro._modifiers import _align_axes, _slice_kmers, reverse_complement
|
|
7
|
+
from seqpro._utils import cast_seqs, check_axes
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
def test_align_axes():
|
|
@@ -143,3 +144,79 @@ def test_k_shuffle():
|
|
|
143
144
|
shuffled_counts = _count_kmers(shuffled, k, length_axis)
|
|
144
145
|
|
|
145
146
|
assert counts == shuffled_counts
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Test cases for reverse_complement
|
|
150
|
+
class ReverseComplementCases:
|
|
151
|
+
def case_single_string(self):
|
|
152
|
+
"""Test single string sequence."""
|
|
153
|
+
seq = "ATCG"
|
|
154
|
+
# ATCG -> CGAT (reverse complement)
|
|
155
|
+
expected = cast_seqs("CGAT")
|
|
156
|
+
return seq, expected, None, None
|
|
157
|
+
|
|
158
|
+
def case_list_of_strings(self):
|
|
159
|
+
"""Test list of string sequences."""
|
|
160
|
+
seqs = ["ATCG", "GCTA"]
|
|
161
|
+
# ATCG -> CGAT, GCTA -> TAGC
|
|
162
|
+
expected = cast_seqs(["CGAT", "TAGC"])
|
|
163
|
+
return seqs, expected, None, None
|
|
164
|
+
|
|
165
|
+
def case_byte_array_1d(self):
|
|
166
|
+
"""Test 1D byte array."""
|
|
167
|
+
seqs = cast_seqs("ATCG")
|
|
168
|
+
# ATCG -> CGAT
|
|
169
|
+
expected = cast_seqs("CGAT")
|
|
170
|
+
return seqs, expected, -1, None
|
|
171
|
+
|
|
172
|
+
def case_byte_array_2d(self):
|
|
173
|
+
"""Test 2D byte array."""
|
|
174
|
+
seqs = cast_seqs(["ATCG", "GCTA"])
|
|
175
|
+
# ATCG -> CGAT, GCTA -> TAGC
|
|
176
|
+
expected = cast_seqs(["CGAT", "TAGC"])
|
|
177
|
+
return seqs, expected, -1, None
|
|
178
|
+
|
|
179
|
+
def case_byte_array_3d(self):
|
|
180
|
+
"""Test 3D byte array with last axis as length."""
|
|
181
|
+
seqs = cast_seqs([["AT", "CG"], ["GC", "TA"]])
|
|
182
|
+
# AT -> AT (palindrome), CG -> CG (palindrome)
|
|
183
|
+
# GC -> GC (palindrome), TA -> TA (palindrome)
|
|
184
|
+
expected = cast_seqs([["AT", "CG"], ["GC", "TA"]])
|
|
185
|
+
return seqs, expected, -1, None
|
|
186
|
+
|
|
187
|
+
def case_ohe_array_2d(self):
|
|
188
|
+
"""Test 2D one-hot encoded array."""
|
|
189
|
+
# Create OHE sequence: "AC"
|
|
190
|
+
# Shape: (2, 4) - length axis x alphabet axis
|
|
191
|
+
seqs = sp.DNA.ohe("AC")
|
|
192
|
+
# Reverse complement: "AC" -> "GT"
|
|
193
|
+
expected = sp.DNA.ohe("GT")
|
|
194
|
+
return seqs, expected, 0, 1
|
|
195
|
+
|
|
196
|
+
def case_ohe_array_3d(self):
|
|
197
|
+
"""Test 3D one-hot encoded array."""
|
|
198
|
+
# Create two sequences: "AC" and "GT"
|
|
199
|
+
# Shape: (2, 2, 4) - batch x length x alphabet
|
|
200
|
+
seqs = sp.DNA.ohe(["AC", "GT"])
|
|
201
|
+
# Reverse complement:
|
|
202
|
+
# "AC" -> "GT"
|
|
203
|
+
# "GT" -> "AC"
|
|
204
|
+
expected = sp.DNA.ohe(["GT", "AC"])
|
|
205
|
+
return seqs, expected, 1, 2
|
|
206
|
+
|
|
207
|
+
def case_palindrome(self):
|
|
208
|
+
"""Test palindromic sequence (same as its reverse complement)."""
|
|
209
|
+
seq = "GAATTC" # EcoRI site - palindrome
|
|
210
|
+
expected = cast_seqs("GAATTC")
|
|
211
|
+
return seq, expected, None, None
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@parametrize_with_cases(
|
|
215
|
+
"seqs,expected,length_axis,ohe_axis", cases=ReverseComplementCases
|
|
216
|
+
)
|
|
217
|
+
def test_reverse_complement(seqs, expected, length_axis, ohe_axis):
|
|
218
|
+
"""Test reverse_complement with various input types and configurations."""
|
|
219
|
+
result = reverse_complement(
|
|
220
|
+
seqs, sp.DNA, length_axis=length_axis, ohe_axis=ohe_axis
|
|
221
|
+
)
|
|
222
|
+
np.testing.assert_array_equal(result, expected)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|