seqpro 0.8.1__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {seqpro-0.8.1 → seqpro-0.9.0}/CHANGELOG.md +12 -0
  2. {seqpro-0.8.1 → seqpro-0.9.0}/PKG-INFO +1 -1
  3. {seqpro-0.8.1 → seqpro-0.9.0}/pyproject.toml +1 -1
  4. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_numba.py +24 -0
  5. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_utils.py +19 -19
  6. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/alphabets/__init__.py +1 -2
  7. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/alphabets/_alphabets.py +72 -24
  8. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_array.py +10 -14
  9. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_modifiers.py +79 -2
  10. {seqpro-0.8.1 → seqpro-0.9.0}/.gitattributes +0 -0
  11. {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/bump.yaml +0 -0
  12. {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/merge.yaml +0 -0
  13. {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/publish.yaml +0 -0
  14. {seqpro-0.8.1 → seqpro-0.9.0}/.github/workflows/test.yaml +0 -0
  15. {seqpro-0.8.1 → seqpro-0.9.0}/.gitignore +0 -0
  16. {seqpro-0.8.1 → seqpro-0.9.0}/.pre-commit-config.yaml +0 -0
  17. {seqpro-0.8.1 → seqpro-0.9.0}/Cargo.lock +0 -0
  18. {seqpro-0.8.1 → seqpro-0.9.0}/Cargo.toml +0 -0
  19. {seqpro-0.8.1 → seqpro-0.9.0}/LICENSE +0 -0
  20. {seqpro-0.8.1 → seqpro-0.9.0}/README.md +0 -0
  21. {seqpro-0.8.1 → seqpro-0.9.0}/meta.yaml +0 -0
  22. {seqpro-0.8.1 → seqpro-0.9.0}/pixi.lock +0 -0
  23. {seqpro-0.8.1 → seqpro-0.9.0}/pixi.toml +0 -0
  24. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/__init__.py +0 -0
  25. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_analyzers.py +0 -0
  26. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_cleaners.py +0 -0
  27. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_encoders.py +0 -0
  28. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_modifiers.py +0 -0
  29. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/_types.py +0 -0
  30. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/bed.py +0 -0
  31. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/experimental/_experimental.py +0 -0
  32. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/experimental/_visualizers.py +0 -0
  33. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/gtf.py +0 -0
  34. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/py.typed +0 -0
  35. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/__init__.py +0 -0
  36. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_gufuncs.py +0 -0
  37. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_types.py +0 -0
  38. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/rag/_utils.py +0 -0
  39. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/__init__.py +0 -0
  40. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/augmentation.py +0 -0
  41. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/transforms/tmm.py +0 -0
  42. {seqpro-0.8.1 → seqpro-0.9.0}/python/seqpro/xr/__init__.py +0 -0
  43. {seqpro-0.8.1 → seqpro-0.9.0}/src/kshuffle.rs +0 -0
  44. {seqpro-0.8.1 → seqpro-0.9.0}/src/lib.rs +0 -0
  45. {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_pyranges.py +0 -0
  46. {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_read.py +0 -0
  47. {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_sort.py +0 -0
  48. {seqpro-0.8.1 → seqpro-0.9.0}/tests/bed/test_with_length.py +0 -0
  49. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_analyzers.py +0 -0
  50. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_ohe.py +0 -0
  51. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_ragged.py +0 -0
  52. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_tokenize.py +0 -0
  53. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_transforms.py +0 -0
  54. {seqpro-0.8.1 → seqpro-0.9.0}/tests/test_translate.py +0 -0
@@ -1,3 +1,15 @@
1
+ ## 0.9.0 (2025-11-09)
2
+
3
+ ### Feat
4
+
5
+ - **perf**: faster reverse complementing and option to pass pre-alloc output
6
+
7
+ ## 0.8.2 (2025-10-22)
8
+
9
+ ### Fix
10
+
11
+ - wrong shape of Ragged[bytes].to_numpy()
12
+
1
13
  ## 0.8.1 (2025-10-22)
2
14
 
3
15
  ### Fix
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: seqpro
3
- Version: 0.8.1
3
+ Version: 0.9.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "seqpro"
3
- version = "0.8.1"
3
+ version = "0.9.0"
4
4
  authors = [
5
5
  { name = "David Laub", email = "dlaub@ucsd.edu" },
6
6
  { name = "Adam Klie", email = "aklie@ucsd.edu" },
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from typing import Optional, Union, overload
2
4
 
3
5
  import numba as nb
@@ -132,3 +134,25 @@ def gufunc_translate(
132
134
  if (seq_kmers == kmer_keys[i]).all():
133
135
  res[0] = kmer_values[i] # type: ignore
134
136
  break
137
+
138
+
139
+ @nb.guvectorize(
140
+ ["(u1, u1[:], u1[:])"],
141
+ "(),(n)->()",
142
+ nopython=True,
143
+ cache=True,
144
+ )
145
+ def gufunc_complement_bytes(
146
+ seq: NDArray[np.uint8],
147
+ complement_map: NDArray[np.uint8],
148
+ res: NDArray[np.uint8] | None = None,
149
+ ) -> NDArray[np.uint8]: # type: ignore
150
+ res[0] = complement_map[seq] # type: ignore
151
+
152
+
153
+ _COMP = np.frombuffer(bytes.maketrans(b"ACGT", b"TGCA"), np.uint8)
154
+
155
+
156
+ @nb.vectorize(["u1(u1)"], nopython=True, cache=True)
157
+ def ufunc_comp_dna(seq: NDArray[np.uint8]) -> NDArray[np.uint8]:
158
+ return _COMP[seq]
@@ -1,9 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Optional, TypeVar, Union, cast, overload
3
+ from typing import TypeVar, Union, cast, overload
4
4
 
5
5
  import numpy as np
6
6
  from numpy.typing import NDArray
7
+ from typing_extensions import TypeGuard
7
8
 
8
9
  NestedStr = Union[bytes, str, list["NestedStr"]]
9
10
  """String or nested list of strings"""
@@ -13,20 +14,22 @@ StrSeqType = Union[NestedStr, NDArray[Union[np.str_, np.object_, np.bytes_]]]
13
14
 
14
15
  SeqType = Union[NestedStr, NDArray[Union[np.str_, np.object_, np.bytes_, np.uint8]]]
15
16
 
17
+ DTYPE = TypeVar("DTYPE", bound=np.generic)
16
18
 
17
- @overload
18
- def cast_seqs(seqs: NDArray[np.uint8]) -> NDArray[np.uint8]: ...
19
19
 
20
+ def is_dtype(
21
+ obj: object, dtype: DTYPE | np.dtype[DTYPE] | type[DTYPE]
22
+ ) -> TypeGuard[NDArray[DTYPE]]:
23
+ return isinstance(obj, np.ndarray) and np.issubdtype(obj.dtype, dtype)
20
24
 
25
+
26
+ @overload
27
+ def cast_seqs(seqs: NDArray[np.uint8]) -> NDArray[np.uint8]: ...
21
28
  @overload
22
29
  def cast_seqs(seqs: StrSeqType) -> NDArray[np.bytes_]: ...
23
-
24
-
25
30
  @overload
26
- def cast_seqs(seqs: SeqType) -> NDArray[Union[np.bytes_, np.uint8]]: ...
27
-
28
-
29
- def cast_seqs(seqs: SeqType) -> NDArray[Union[np.bytes_, np.uint8]]:
31
+ def cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]: ...
32
+ def cast_seqs(seqs: SeqType) -> NDArray[np.bytes_ | np.uint8]:
30
33
  """Cast any sequence type to be a NumPy array of ASCII characters (or left alone as
31
34
  8-bit unsigned integers if the input is OHE).
32
35
 
@@ -54,8 +57,8 @@ def cast_seqs(seqs: SeqType) -> NDArray[Union[np.bytes_, np.uint8]]:
54
57
 
55
58
  def check_axes(
56
59
  seqs: SeqType,
57
- length_axis: Optional[Union[int, bool]] = None,
58
- ohe_axis: Optional[Union[int, bool]] = None,
60
+ length_axis: int | bool | None = None,
61
+ ohe_axis: int | bool | None = None,
59
62
  ):
60
63
  """Raise errors if length_axis or ohe_axis is missing when they're needed. Pass
61
64
  False to corresponding axis to not check for it.
@@ -63,16 +66,16 @@ def check_axes(
63
66
  - ndarray with itemsize == 1 => length axis required.
64
67
  - OHE array => length and OHE axis required.
65
68
  """
69
+ # OHE
70
+ if ohe_axis is None and is_dtype(seqs, np.uint8):
71
+ raise ValueError("Need an one hot encoding axis to process OHE sequences.")
72
+
66
73
  # bytes or OHE
67
- if length_axis is None and isinstance(seqs, np.ndarray) and seqs.itemsize == 1:
74
+ if length_axis is None and is_dtype(seqs, np.bytes_) and seqs.itemsize == 1:
68
75
  raise ValueError(
69
76
  "Need a length axis to process an ndarray with itemsize == 1 (S1, u1)."
70
77
  )
71
78
 
72
- # OHE
73
- if ohe_axis is None and isinstance(seqs, np.ndarray) and seqs.dtype == np.uint8:
74
- raise ValueError("Need an one hot encoding axis to process OHE sequences.")
75
-
76
79
  # length_axis != ohe_axis
77
80
  if (
78
81
  isinstance(length_axis, int)
@@ -82,9 +85,6 @@ def check_axes(
82
85
  raise ValueError("Length and OHE axis must be different.")
83
86
 
84
87
 
85
- DTYPE = TypeVar("DTYPE", bound=np.generic)
86
-
87
-
88
88
  def array_slice(a: NDArray[DTYPE], axis: int, slice_: slice) -> NDArray[DTYPE]:
89
89
  """Slice an array from a dynamic axis."""
90
90
  return a[(slice(None),) * (axis % a.ndim) + (slice_,)]
@@ -1,4 +1,4 @@
1
- from ._alphabets import AminoAlphabet, NucleotideAlphabet
1
+ from ._alphabets import DNA, AminoAlphabet, NucleotideAlphabet
2
2
 
3
3
  # NOTE the "*" character is termination i.e. STOP codon
4
4
  canonical_codons_to_aas = {
@@ -69,7 +69,6 @@ canonical_codons_to_aas = {
69
69
  }
70
70
 
71
71
 
72
- DNA = NucleotideAlphabet(alphabet="ACGT", complement="TGCA")
73
72
  RNA = NucleotideAlphabet(alphabet="ACGU", complement="UGCA")
74
73
  AA = AminoAlphabet(*map(list, zip(*canonical_codons_to_aas.items())))
75
74
 
@@ -1,10 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ from types import MethodType
1
4
  from typing import Dict, List, Optional, Union, cast, overload
2
5
 
3
6
  import numpy as np
4
7
  from numpy.typing import NDArray
8
+ from typing_extensions import assert_never
5
9
 
6
- from .._numba import gufunc_ohe, gufunc_ohe_char_idx, gufunc_translate
7
- from .._utils import SeqType, StrSeqType, cast_seqs, check_axes
10
+ from .._numba import (
11
+ gufunc_complement_bytes,
12
+ gufunc_ohe,
13
+ gufunc_ohe_char_idx,
14
+ gufunc_translate,
15
+ ufunc_comp_dna,
16
+ )
17
+ from .._utils import SeqType, StrSeqType, cast_seqs, check_axes, is_dtype
8
18
 
9
19
 
10
20
  class NucleotideAlphabet:
@@ -12,10 +22,11 @@ class NucleotideAlphabet:
12
22
  """Alphabet excluding ambiguous characters e.g. "N" for DNA."""
13
23
  complement: str
14
24
  array: NDArray[np.bytes_]
15
- complement_map: Dict[str, str]
16
- complement_map_bytes: Dict[bytes, bytes]
17
- str_comp_table: Dict[int, str]
25
+ complement_map: dict[str, str]
26
+ complement_map_bytes: dict[bytes, bytes]
27
+ str_comp_table: dict[int, str]
18
28
  bytes_comp_table: bytes
29
+ bytes_comp_array: NDArray[np.bytes_]
19
30
 
20
31
  def __init__(self, alphabet: str, complement: str) -> None:
21
32
  """Parse and validate sequence alphabets.
@@ -36,9 +47,7 @@ class NucleotideAlphabet:
36
47
  self.array = cast(
37
48
  NDArray[np.bytes_], np.frombuffer(self.alphabet.encode("ascii"), "|S1")
38
49
  )
39
- self.complement_map: Dict[str, str] = dict(
40
- zip(list(self.alphabet), list(self.complement))
41
- )
50
+ self.complement_map = dict(zip(list(self.alphabet), list(self.complement)))
42
51
  self.complement_map_bytes = {
43
52
  k.encode("ascii"): v.encode("ascii") for k, v in self.complement_map.items()
44
53
  }
@@ -46,6 +55,7 @@ class NucleotideAlphabet:
46
55
  self.bytes_comp_table = bytes.maketrans(
47
56
  self.alphabet.encode("ascii"), self.complement.encode("ascii")
48
57
  )
58
+ self.bytes_comp_array = np.frombuffer(self.bytes_comp_table, "S1")
49
59
 
50
60
  def __len__(self):
51
61
  return len(self.alphabet)
@@ -109,23 +119,29 @@ class NucleotideAlphabet:
109
119
 
110
120
  return _alphabet[idx].reshape(shape)
111
121
 
112
- def complement_bytes(self, byte_arr: NDArray[np.bytes_]) -> NDArray[np.bytes_]:
122
+ def complement_bytes(
123
+ self, byte_arr: NDArray[np.bytes_], out: NDArray[np.bytes_] | None = None
124
+ ) -> NDArray[np.bytes_]:
113
125
  """Get reverse complement of byte (S1) array.
114
126
 
115
127
  Parameters
116
128
  ----------
117
129
  byte_arr : ndarray[bytes]
118
130
  """
119
- # * a vectorized implementation using np.unique or np.char.translate is NOT
120
- # * faster even for longer alphabets like IUPAC DNA/RNA. Another optimization to
121
- # * try would be using vectorized bit manipulations.
122
- out = byte_arr.copy()
123
- for nuc, comp in self.complement_map_bytes.items():
124
- out[byte_arr == nuc] = comp
125
- return out
131
+ if out is None:
132
+ _out = out
133
+ else:
134
+ _out = out.view(np.uint8)
135
+ _out = gufunc_complement_bytes(
136
+ byte_arr.view(np.uint8), self.bytes_comp_array.view(np.uint8), _out
137
+ )
138
+ return _out.view("S1")
126
139
 
127
140
  def rev_comp_byte(
128
- self, byte_arr: NDArray[np.bytes_], length_axis: int
141
+ self,
142
+ byte_arr: NDArray[np.bytes_],
143
+ length_axis: int,
144
+ out: NDArray[np.bytes_] | None = None,
129
145
  ) -> NDArray[np.bytes_]:
130
146
  """Get reverse complement of byte (S1) array.
131
147
 
@@ -133,7 +149,7 @@ class NucleotideAlphabet:
133
149
  ----------
134
150
  byte_arr : ndarray[bytes]
135
151
  """
136
- out = self.complement_bytes(byte_arr)
152
+ out = self.complement_bytes(byte_arr, out)
137
153
  return np.flip(out, length_axis)
138
154
 
139
155
  def rev_comp_string(self, string: str):
@@ -150,6 +166,7 @@ class NucleotideAlphabet:
150
166
  seqs: StrSeqType,
151
167
  length_axis: Optional[int] = None,
152
168
  ohe_axis: Optional[int] = None,
169
+ out: NDArray[np.bytes_] | None = None,
153
170
  ) -> NDArray[np.bytes_]: ...
154
171
  @overload
155
172
  def reverse_complement(
@@ -157,6 +174,7 @@ class NucleotideAlphabet:
157
174
  seqs: NDArray[np.uint8],
158
175
  length_axis: Optional[int] = None,
159
176
  ohe_axis: Optional[int] = None,
177
+ out: NDArray[np.bytes_] | None = None,
160
178
  ) -> NDArray[np.uint8]: ...
161
179
  @overload
162
180
  def reverse_complement(
@@ -164,13 +182,15 @@ class NucleotideAlphabet:
164
182
  seqs: SeqType,
165
183
  length_axis: Optional[int] = None,
166
184
  ohe_axis: Optional[int] = None,
185
+ out: NDArray[np.bytes_] | None = None,
167
186
  ) -> NDArray[Union[np.bytes_, np.uint8]]: ...
168
187
  def reverse_complement(
169
188
  self,
170
189
  seqs: SeqType,
171
190
  length_axis: Optional[int] = None,
172
191
  ohe_axis: Optional[int] = None,
173
- ) -> NDArray[Union[np.bytes_, np.uint8]]:
192
+ out: NDArray[np.bytes_] | None = None,
193
+ ) -> NDArray[np.bytes_ | np.uint8]:
174
194
  """Reverse complement a sequence.
175
195
 
176
196
  Parameters
@@ -190,14 +210,20 @@ class NucleotideAlphabet:
190
210
 
191
211
  seqs = cast_seqs(seqs)
192
212
 
193
- if seqs.dtype == np.uint8: # OHE
213
+ if is_dtype(seqs, np.bytes_):
214
+ if length_axis is None:
215
+ length_axis = -1
216
+ return self.rev_comp_byte(seqs, length_axis, out)
217
+ elif is_dtype(seqs, np.uint8): # OHE
194
218
  assert length_axis is not None
195
219
  assert ohe_axis is not None
196
- return np.flip(seqs, axis=(length_axis, ohe_axis))
220
+ _out = np.flip(seqs, axis=(length_axis, ohe_axis))
221
+ if out is not None:
222
+ out[:] = _out
223
+ _out = out
224
+ return _out
197
225
  else:
198
- if length_axis is None:
199
- length_axis = -1
200
- return self.rev_comp_byte(seqs, length_axis) # type: ignore
226
+ assert_never(seqs) # type: ignore
201
227
 
202
228
 
203
229
  class AminoAlphabet:
@@ -334,3 +360,25 @@ class AminoAlphabet:
334
360
  _alphabet = np.concatenate([self.aa_array, [unknown_char.encode("ascii")]])
335
361
 
336
362
  return _alphabet[idx].reshape(shape)
363
+
364
+
365
+ DNA = NucleotideAlphabet("ACGT", "TGCA")
366
+
367
+
368
+ # * Monkey patch DNA instance with a faster complement function using
369
+ # * a static, const lookup table. The base method is slower because it uses a
370
+ # * dynamic lookup table.
371
+ def complement_bytes(
372
+ self: NucleotideAlphabet,
373
+ byte_arr: NDArray[np.bytes_],
374
+ out: NDArray[np.bytes_] | None = None,
375
+ ) -> NDArray[np.bytes_]:
376
+ if out is None:
377
+ _out = out
378
+ else:
379
+ _out = out.view(np.uint8)
380
+ _out = ufunc_comp_dna(byte_arr.view(np.uint8), _out) # type: ignore
381
+ return _out.view("S1")
382
+
383
+
384
+ DNA.complement_bytes = MethodType(complement_bytes, DNA)
@@ -61,7 +61,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
61
61
  if isinstance(data, RagParts):
62
62
  content = _parts_to_content(data)
63
63
  else:
64
- content = _with_ragged(data, highlevel=False)
64
+ content = _as_ragged(data, highlevel=False)
65
65
  super().__init__(content, behavior=deepcopy(ak.behavior))
66
66
  self._parts = unbox(self)
67
67
  type_parts: list[str] = []
@@ -223,7 +223,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
223
223
  """Note: not zero-copy if offsets or data are non-contiguous."""
224
224
  arr = super().to_numpy(allow_missing=allow_missing)
225
225
  if self.dtype.type == np.bytes_:
226
- arr = arr[:, None].view("S1")
226
+ arr = arr[..., None].view("S1")
227
227
  return arr
228
228
 
229
229
  def __getitem__(self, where):
@@ -232,7 +232,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
232
232
  if _n_var(arr) == 1:
233
233
  return type(self)(arr)
234
234
  else:
235
- return _without_ragged(arr)
235
+ return _as_ak(arr)
236
236
  else:
237
237
  return arr
238
238
 
@@ -293,7 +293,7 @@ class Ragged(ak.Array, Generic[RDTYPE]):
293
293
 
294
294
  def to_ak(self):
295
295
  """Convert to an Awkward array."""
296
- arr = _without_ragged(self)
296
+ arr = _as_ak(self)
297
297
  arr.behavior = None
298
298
  return arr
299
299
 
@@ -331,12 +331,12 @@ def _n_var(arr: ak.Array) -> int:
331
331
 
332
332
 
333
333
  @overload
334
- def _with_ragged(
334
+ def _as_ragged(
335
335
  arr: ak.Array | Content, highlevel: Literal[True] = True
336
336
  ) -> ak.Array: ...
337
337
  @overload
338
- def _with_ragged(arr: ak.Array | Content, highlevel: Literal[False]) -> Content: ...
339
- def _with_ragged(arr: ak.Array | Content, highlevel: bool = True) -> ak.Array | Content:
338
+ def _as_ragged(arr: ak.Array | Content, highlevel: Literal[False]) -> Content: ...
339
+ def _as_ragged(arr: ak.Array | Content, highlevel: bool = True) -> ak.Array | Content:
340
340
  def fn(layout: Content, **kwargs):
341
341
  if isinstance(layout, (ListArray, ListOffsetArray)):
342
342
  return ak.with_parameter(
@@ -350,16 +350,12 @@ def _with_ragged(arr: ak.Array | Content, highlevel: bool = True) -> ak.Array |
350
350
 
351
351
 
352
352
  @overload
353
- def _without_ragged(
353
+ def _as_ak(
354
354
  arr: ak.Array | Ragged[DTYPE], highlevel: Literal[True] = True
355
355
  ) -> ak.Array: ...
356
356
  @overload
357
- def _without_ragged(
358
- arr: ak.Array | Ragged[DTYPE], highlevel: Literal[False]
359
- ) -> Content: ...
360
- def _without_ragged(
361
- arr: ak.Array | Ragged[DTYPE], highlevel: bool = True
362
- ) -> ak.Array | Content:
357
+ def _as_ak(arr: ak.Array | Ragged[DTYPE], highlevel: Literal[False]) -> Content: ...
358
+ def _as_ak(arr: ak.Array | Ragged[DTYPE], highlevel: bool = True) -> ak.Array | Content:
363
359
  def fn(layout, **kwargs):
364
360
  if isinstance(layout, (ListArray, ListOffsetArray)):
365
361
  return ak.with_parameter(layout, "__list__", None, highlevel=False)
@@ -2,8 +2,9 @@ from collections import defaultdict
2
2
 
3
3
  import numpy as np
4
4
  import seqpro as sp
5
- from seqpro._modifiers import _align_axes, _slice_kmers
6
- from seqpro._utils import check_axes
5
+ from pytest_cases import parametrize_with_cases
6
+ from seqpro._modifiers import _align_axes, _slice_kmers, reverse_complement
7
+ from seqpro._utils import cast_seqs, check_axes
7
8
 
8
9
 
9
10
  def test_align_axes():
@@ -143,3 +144,79 @@ def test_k_shuffle():
143
144
  shuffled_counts = _count_kmers(shuffled, k, length_axis)
144
145
 
145
146
  assert counts == shuffled_counts
147
+
148
+
149
+ # Test cases for reverse_complement
150
+ class ReverseComplementCases:
151
+ def case_single_string(self):
152
+ """Test single string sequence."""
153
+ seq = "ATCG"
154
+ # ATCG -> CGAT (reverse complement)
155
+ expected = cast_seqs("CGAT")
156
+ return seq, expected, None, None
157
+
158
+ def case_list_of_strings(self):
159
+ """Test list of string sequences."""
160
+ seqs = ["ATCG", "GCTA"]
161
+ # ATCG -> CGAT, GCTA -> TAGC
162
+ expected = cast_seqs(["CGAT", "TAGC"])
163
+ return seqs, expected, None, None
164
+
165
+ def case_byte_array_1d(self):
166
+ """Test 1D byte array."""
167
+ seqs = cast_seqs("ATCG")
168
+ # ATCG -> CGAT
169
+ expected = cast_seqs("CGAT")
170
+ return seqs, expected, -1, None
171
+
172
+ def case_byte_array_2d(self):
173
+ """Test 2D byte array."""
174
+ seqs = cast_seqs(["ATCG", "GCTA"])
175
+ # ATCG -> CGAT, GCTA -> TAGC
176
+ expected = cast_seqs(["CGAT", "TAGC"])
177
+ return seqs, expected, -1, None
178
+
179
+ def case_byte_array_3d(self):
180
+ """Test 3D byte array with last axis as length."""
181
+ seqs = cast_seqs([["AT", "CG"], ["GC", "TA"]])
182
+ # AT -> AT (palindrome), CG -> CG (palindrome)
183
+ # GC -> GC (palindrome), TA -> TA (palindrome)
184
+ expected = cast_seqs([["AT", "CG"], ["GC", "TA"]])
185
+ return seqs, expected, -1, None
186
+
187
+ def case_ohe_array_2d(self):
188
+ """Test 2D one-hot encoded array."""
189
+ # Create OHE sequence: "AC"
190
+ # Shape: (2, 4) - length axis x alphabet axis
191
+ seqs = sp.DNA.ohe("AC")
192
+ # Reverse complement: "AC" -> "GT"
193
+ expected = sp.DNA.ohe("GT")
194
+ return seqs, expected, 0, 1
195
+
196
+ def case_ohe_array_3d(self):
197
+ """Test 3D one-hot encoded array."""
198
+ # Create two sequences: "AC" and "GT"
199
+ # Shape: (2, 2, 4) - batch x length x alphabet
200
+ seqs = sp.DNA.ohe(["AC", "GT"])
201
+ # Reverse complement:
202
+ # "AC" -> "GT"
203
+ # "GT" -> "AC"
204
+ expected = sp.DNA.ohe(["GT", "AC"])
205
+ return seqs, expected, 1, 2
206
+
207
+ def case_palindrome(self):
208
+ """Test palindromic sequence (same as its reverse complement)."""
209
+ seq = "GAATTC" # EcoRI site - palindrome
210
+ expected = cast_seqs("GAATTC")
211
+ return seq, expected, None, None
212
+
213
+
214
+ @parametrize_with_cases(
215
+ "seqs,expected,length_axis,ohe_axis", cases=ReverseComplementCases
216
+ )
217
+ def test_reverse_complement(seqs, expected, length_axis, ohe_axis):
218
+ """Test reverse_complement with various input types and configurations."""
219
+ result = reverse_complement(
220
+ seqs, sp.DNA, length_axis=length_axis, ohe_axis=ohe_axis
221
+ )
222
+ np.testing.assert_array_equal(result, expected)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes