biolect 0.0.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolect/__init__.py +27 -0
- biolect/core/__init__.py +0 -0
- biolect/core/definitions/__init__.py +0 -0
- biolect/core/dtypes.py +62 -0
- biolect/core/errors/__init__.py +26 -0
- biolect/core/errors/etypes.py +55 -0
- biolect/core/errors/handlers.py +76 -0
- biolect/core/errors/monadic.py +0 -0
- biolect/core/errors/recovery.py +0 -0
- biolect/core/operations.py +0 -0
- biolect/core/pipeline.py +0 -0
- biolect/core/validation.py +0 -0
- biolect/formats/fasta.py +0 -0
- biolect/formats/fastq.py +0 -0
- biolect/formats/gff.py +0 -0
- biolect/formats/parsers/__init__.py +0 -0
- biolect/formats/parsers/safe_parsers.py +0 -0
- biolect/formats/parsers/validators.py +0 -0
- biolect/formats/phylip.py +0 -0
- biolect/formats/sambam.py +0 -0
- biolect/formats/vcf.py +0 -0
- biolect/integrations/__init__.py +0 -0
- biolect/integrations/blast.py +0 -0
- biolect/integrations/clustal.py +0 -0
- biolect/integrations/external.py +0 -0
- biolect/integrations/muscle.py +0 -0
- biolect/integrations/raxml.py +0 -0
- biolect/integrations/safe/__init__.py +0 -0
- biolect/integrations/safe/blast_safe.py +0 -0
- biolect/integrations/safe/tools_safe.py +0 -0
- biolect/operations/__init__.py +26 -0
- biolect/operations/alignment/__init__.py +0 -0
- biolect/operations/alignment/consensus.py +0 -0
- biolect/operations/alignment/multiple.py +0 -0
- biolect/operations/alignment/pairwise.py +0 -0
- biolect/operations/alignment/safe/__init__.py +0 -0
- biolect/operations/alignment/safe/monadic_ops.py +0 -0
- biolect/operations/analysis/__init__.py +0 -0
- biolect/operations/analysis/clustering.py +0 -0
- biolect/operations/analysis/quality.py +0 -0
- biolect/operations/analysis/safe/__init__.py +0 -0
- biolect/operations/analysis/safe/monadic_ops.py +0 -0
- biolect/operations/analysis/statistics.py +0 -0
- biolect/operations/analysis/visualization.py +0 -0
- biolect/operations/annotation/__init__.py +0 -0
- biolect/operations/annotation/comparative.py +0 -0
- biolect/operations/annotation/functional.py +0 -0
- biolect/operations/annotation/gene_calling.py +0 -0
- biolect/operations/annotation/safe/__init__.py +0 -0
- biolect/operations/annotation/safe/monadic_ops.py +0 -0
- biolect/operations/phylogenetics/__init__.py +0 -0
- biolect/operations/phylogenetics/distance.py +0 -0
- biolect/operations/phylogenetics/safe/__init__.py +0 -0
- biolect/operations/phylogenetics/safe/monadic_ops.py +0 -0
- biolect/operations/phylogenetics/trees.py +0 -0
- biolect/operations/phylogenetics/visualization.py +0 -0
- biolect/operations/sequencing/__init__.py +28 -0
- biolect/operations/sequencing/analysis.py +16 -0
- biolect/operations/sequencing/encoding.py +193 -0
- biolect/operations/sequencing/manipulation.py +0 -0
- biolect/operations/sequencing/safe/__init__.py +0 -0
- biolect/operations/sequencing/safe/monadic_ops.py +0 -0
- biolect/operations/sequencing/search.py +0 -0
- biolect/operations/sequencing/translation.py +0 -0
- biolect/utilities/__init__.py +0 -0
- biolect/utilities/caching.py +0 -0
- biolect/utilities/config.py +0 -0
- biolect/utilities/memory.py +0 -0
- biolect/utilities/parallel.py +0 -0
- biolect/utilities/progress.py +0 -0
- biolect/visualization/__init__.py +0 -0
- biolect/visualization/alignments.py +0 -0
- biolect/visualization/genomics.py +0 -0
- biolect/visualization/sequences.py +0 -0
- biolect/visualization/trees.py +0 -0
- biolect-0.0.0.dev1.dist-info/METADATA +40 -0
- biolect-0.0.0.dev1.dist-info/RECORD +79 -0
- biolect-0.0.0.dev1.dist-info/WHEEL +4 -0
- biolect-0.0.0.dev1.dist-info/licenses/LICENSE +661 -0
biolect/__init__.py
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
"""Biolect
|
2
|
+
|
3
|
+
This module initializes the Biolect DSL framework at its highest level.
|
4
|
+
"""
|
5
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
6
|
+
from . import core, formats, integrations, operations, utilities, visualization
|
7
|
+
|
8
|
+
from .operations import (
|
9
|
+
PackedSequence, SequenceBatch, SequenceStream,
|
10
|
+
pack_sequence, unpack_sequence, vectorized_decode
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
__all__ = [
|
15
|
+
# ─── module-level exports ────────────────────────────────────────────────
|
16
|
+
"core", "formats", "integrations", "operations", "utilities",
|
17
|
+
"visualization",
|
18
|
+
|
19
|
+
# ─── class-level exports ─────────────────────────────────────────────────
|
20
|
+
"PackedSequence", "SequenceBatch", "SequenceStream",
|
21
|
+
|
22
|
+
# ─── function-level exports ──────────────────────────────────────────────
|
23
|
+
"pack_sequence", "unpack_sequence", "vectorized_decode",
|
24
|
+
|
25
|
+
# ─── object-level exports ────────────────────────────────────────────────
|
26
|
+
# ...
|
27
|
+
]
|
biolect/core/__init__.py
ADDED
File without changes
|
File without changes
|
biolect/core/dtypes.py
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
"""Core Data Types
|
2
|
+
|
3
|
+
This module defines the framework's core data types.
|
4
|
+
"""
|
5
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
6
|
+
|
7
|
+
# standard library import
|
8
|
+
from __future__ import annotations
|
9
|
+
from abc import ABC, abstractmethod
|
10
|
+
from collections.abc import Callable, Iterable
|
11
|
+
from dataclasses import dataclass
|
12
|
+
from enum import Enum, auto
|
13
|
+
from typing import (
|
14
|
+
# types
|
15
|
+
Any, Final, Iterator, NamedTuple, Protocol, TypeAlias, TypeVar,
|
16
|
+
|
17
|
+
# descriptors
|
18
|
+
final, overload, runtime_checkable
|
19
|
+
)
|
20
|
+
|
21
|
+
# third-party imports
|
22
|
+
import numpy as np
|
23
|
+
|
24
|
+
from Bio import SeqIO
|
25
|
+
from Bio.Seq import Seq
|
26
|
+
# The containers are exposed by sub-modules; importing them from the top-level
# ``returns`` package is not the documented import path.
from returns.maybe import Maybe
from returns.result import Result
|
27
|
+
|
28
|
+
# local imports
|
29
|
+
# ...
|
30
|
+
|
31
|
+
|
32
|
+
# ─── enums ────────────────────────────────────────────────────────────── ✦✦ ─
|
33
|
+
|
34
|
+
|
35
|
+
class Alphabet(Enum):
    """Enumerate the sequence alphabets known to the framework.

    Member values are assigned automatically in declaration order, so only
    the member identity (not its integer value) should be relied upon.
    """

    AA = auto()   # presumably amino acids (protein sequences) - TODO confirm
    DNA = auto()
    RNA = auto()
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
# ─── typing ───────────────────────────────────────────────────────────── ✦✦ ─
|
44
|
+
|
45
|
+
T: TypeVar = TypeVar("T")
|
46
|
+
E: TypeVar = TypeVar("E")
|
47
|
+
S: TypeVar = TypeVar("S")
|
48
|
+
U: TypeVar = TypeVar("U")
|
49
|
+
|
50
|
+
|
51
|
+
@runtime_checkable
class Parseable(Protocol):
    """Structural type placeholder for parseable objects.

    The protocol currently declares no members; it is only marked as usable
    with ``isinstance`` / ``issubclass`` checks via ``runtime_checkable``.
    """
|
54
|
+
|
55
|
+
|
56
|
+
class Domain(Enum):
    """Placeholder enumeration; no members are defined yet.

    NOTE: this is deliberately *not* decorated with ``@dataclass``. Applying
    the dataclass decorator to an ``Enum`` subclass is unsupported and leads
    to surprising runtime behaviour (for example members comparing equal).
    """
    # NOTE(review): presumably the taxonomic domain of a genome - confirm
    # before adding members.
|
59
|
+
|
60
|
+
|
61
|
+
class Genome(ABC):
    """Abstract base for genome objects.

    Only an annotated ``domain`` attribute is declared here; no abstract
    methods are defined at this point.
    """

    # Declared for type checkers only; no class-level value is assigned.
    domain: Domain
|
@@ -0,0 +1,26 @@
|
|
1
|
+
"""Core Error Framework
|
2
|
+
|
3
|
+
This module defines the Biolect DSL's core error-handling mechanisms.
|
4
|
+
"""
|
5
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
6
|
+
# The error-type module is named ``etypes`` (not ``types``).
from . import etypes, handlers, monadic, recovery

from .etypes import (
    BiolectError, DataFormatError, FileFormatError, InvalidSequenceError
)
from .handlers import ErrorHandler, FileFormatErrorHandler

__all__ = [
    # ─── module-level exports ────────────────────────────────────────────────
    "etypes", "handlers", "monadic", "recovery",

    # ─── class-level exports ─────────────────────────────────────────────────

    # error handlers
    "ErrorHandler", "FileFormatErrorHandler",

    # monadic error handling
    # ...

    # error recovery
    # ...

    # error typedefs
    "BiolectError", "DataFormatError", "FileFormatError",
    "InvalidSequenceError",

    # NOTE: every entry must be followed by a comma; adjacent string literals
    # are silently concatenated into a single (wrong) name otherwise.
    # NOTE(review): "DefaultErrorHandler", "Recovery" and "SequencingError"
    # were listed previously but are not defined by any module visible in
    # this package version; re-add them once they exist.
]
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Error Types
|
2
|
+
|
3
|
+
This module defines the Biolect DSL framework's error types.
|
4
|
+
"""
|
5
|
+
|
6
|
+
# standard-library imports
from __future__ import annotations

from typing import TYPE_CHECKING

# third-party imports
# ...

# local imports
if TYPE_CHECKING:
    # Needed for annotations only; guarding the import keeps the error types
    # loadable without importing the data-type module at runtime.
    from ..dtypes import Alphabet
|
13
|
+
|
14
|
+
|
15
|
+
# ─── error type hierarchy ─────────────────────────────────────────────── ✦✦ ─
|
16
|
+
class BiolectError(Exception):
|
17
|
+
"""Base exception class for the Biolect DSL framework."""
|
18
|
+
pass
|
19
|
+
|
20
|
+
|
21
|
+
class DataFormatError(BiolectError):
|
22
|
+
"""Base class for all sequence format and structure violations."""
|
23
|
+
def __init__(
|
24
|
+
self,
|
25
|
+
sequence,
|
26
|
+
position: int | None = None,
|
27
|
+
expected_alphabet: Alphabet | None = None
|
28
|
+
) -> None:
|
29
|
+
self.sequence = sequence
|
30
|
+
self.position = position
|
31
|
+
self.expected_alphabet = expected_alphabet
|
32
|
+
super().__init__(f"Formatting error at position {position}.")
|
33
|
+
|
34
|
+
|
35
|
+
class InvalidSequenceError(DataFormatError):
|
36
|
+
"""Raised when invalid characters are detected in a sequence."""
|
37
|
+
def __init__(
|
38
|
+
self,
|
39
|
+
sequence,
|
40
|
+
position: int | None = None,
|
41
|
+
expected_alphabet: Alphabet | None = None
|
42
|
+
) -> None:
|
43
|
+
self.sequence = sequence
|
44
|
+
self.position = position
|
45
|
+
self.expected_alphabet = expected_alphabet
|
46
|
+
super().__init__(f"Invalid sequence at position {position}.")
|
47
|
+
|
48
|
+
|
49
|
+
class FileFormatError(DataFormatError):
|
50
|
+
"""Raised when the file format does not match the expected standard."""
|
51
|
+
def __init__(self, filepath, expected_format, detected_format=None):
|
52
|
+
self.filepath = filepath
|
53
|
+
self.expected_format = expected_format
|
54
|
+
self.detected_format = detected_format
|
55
|
+
|
@@ -0,0 +1,76 @@
|
|
1
|
+
"""Core Error Handlers
|
2
|
+
|
3
|
+
This module implements the core error-handling logic for
|
4
|
+
Biolect workflows.
|
5
|
+
"""
|
6
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
7
|
+
|
8
|
+
# standard library imports
|
9
|
+
from abc import ABC, abstractmethod
|
10
|
+
|
11
|
+
# third-party imports
|
12
|
+
# ...
|
13
|
+
|
14
|
+
# local imports
|
15
|
+
from .etypes import BiolectError, FileFormatError
|
16
|
+
|
17
|
+
|
18
|
+
class ErrorHandler(ABC):
    """Abstract interface that every Biolect error handler implements.

    Both methods are abstract, so the class cannot be instantiated directly.
    """

    @abstractmethod
    def attempt_recovery(self, error: BiolectError, context: dict) -> bool:
        """Try to recover from ``error`` using the supplied ``context``."""

    @abstractmethod
    def should_retry(self, error: BiolectError, attempt_count: int) -> bool:
        """Decide whether recovery should be tried again.

        ``attempt_count`` is presumably the number of attempts made so far
        (TODO confirm against callers).
        """
|
36
|
+
|
37
|
+
|
38
|
+
class FileFormatErrorHandler(ErrorHandler):
    """Error handler for ``FileFormatError``.

    No recovery strategy is implemented yet: both methods report failure
    explicitly instead of falling through and returning ``None``.
    """

    def attempt_recovery(
        self,
        error: FileFormatError,
        context: dict
    ) -> bool:
        """Attempt to recover from a file formatting error.

        Args:
            error (FileFormatError):
                The exception object raised by the caller.
            context (dict):
                A dictionary object that contains details about the context
                surrounding the error.

        Returns:
            `True` if successful, otherwise `False`. Currently always
            `False` because no recovery strategy exists.
        """
        return False

    def should_retry(
        self,
        error: FileFormatError,
        attempt_count: int
    ) -> bool:
        """Determine if the recovery operation should be reattempted.

        The second parameter is named ``attempt_count`` to match the
        abstract ``ErrorHandler.should_retry`` signature, so keyword calls
        made through the base interface work.

        Args:
            error (FileFormatError):
                The exception object raised by the caller.
            attempt_count (int):
                Attempt counter supplied by the caller.

        Returns:
            `True` if another attempt should be made, otherwise `False`.
            Currently always `False` because recovery is not implemented.
        """
        return False
|
File without changes
|
File without changes
|
File without changes
|
biolect/core/pipeline.py
ADDED
File without changes
|
File without changes
|
biolect/formats/fasta.py
ADDED
File without changes
|
biolect/formats/fastq.py
ADDED
File without changes
|
biolect/formats/gff.py
ADDED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
biolect/formats/vcf.py
ADDED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,26 @@
|
|
1
|
+
"""Operations Framework Initialization
|
2
|
+
|
3
|
+
This module provides the initialization logic for the Biolect DSL's
|
4
|
+
operations framework at a high level.
|
5
|
+
"""
|
6
|
+
# Only sub-packages that live under ``biolect.operations`` are imported here.
# ``utilities`` and ``visualization`` are sibling packages of ``operations``
# (``biolect.utilities`` / ``biolect.visualization``), so importing them
# relative to this package fails.
from . import alignment, analysis, annotation, phylogenetics, sequencing

from .sequencing import (
    PackedSequence, SequenceStream, SequenceBatch,
    pack_sequence, unpack_sequence, vectorized_decode
)

__all__ = [
    # ─── module-level exports ────────────────────────────────────────────────
    "alignment", "analysis", "annotation", "phylogenetics", "sequencing",

    # ─── class-level exports ─────────────────────────────────────────────────
    "PackedSequence", "SequenceStream", "SequenceBatch",

    # ─── function-level exports ──────────────────────────────────────────────
    "pack_sequence", "unpack_sequence", "vectorized_decode",
]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -0,0 +1,28 @@
|
|
1
|
+
"""Sequencing Framework Initialization
|
2
|
+
|
3
|
+
This module provides the initialization logic for the Biolect DSL
|
4
|
+
sequencing framework.
|
5
|
+
"""
|
6
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
7
|
+
from . import analysis, encoding, manipulation, search, translation
|
8
|
+
|
9
|
+
from .encoding import (
|
10
|
+
DECODE_TABLE, ENCODE_TABLE,
|
11
|
+
PackedSequence, SequenceBatch, SequenceStream,
|
12
|
+
iter_kmers, pack_sequence, unpack_sequence, vectorized_decode
|
13
|
+
)
|
14
|
+
|
15
|
+
|
16
|
+
__all__ = [
|
17
|
+
# ─── constants ───────────────────────────────────────────────────────────
|
18
|
+
"DECODE_TABLE", "ENCODE_TABLE",
|
19
|
+
|
20
|
+
# ─── modules ─────────────────────────────────────────────────────────────
|
21
|
+
"analysis", "encoding", "manipulation", "search", "translation",
|
22
|
+
|
23
|
+
# ─── classes ─────────────────────────────────────────────────────────────
|
24
|
+
"PackedSequence", "SequenceBatch", "SequenceStream",
|
25
|
+
|
26
|
+
# ─── functions ───────────────────────────────────────────────────────────
|
27
|
+
"iter_kmers", "pack_sequence", "unpack_sequence", "vectorized_decode"
|
28
|
+
]
|
@@ -0,0 +1,16 @@
|
|
1
|
+
"""Sequence Analysis
|
2
|
+
|
3
|
+
This module defines the Biolect DSL sequence analysis engine.
|
4
|
+
"""
|
5
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
6
|
+
|
7
|
+
# standard library imports
|
8
|
+
from __future__ import annotations
|
9
|
+
from collections.abc import Generator
|
10
|
+
|
11
|
+
|
12
|
+
# third-party imports
|
13
|
+
# ...
|
14
|
+
|
15
|
+
# local imports
|
16
|
+
# ...
|
@@ -0,0 +1,193 @@
|
|
1
|
+
"""Core Encodings
|
2
|
+
|
3
|
+
This module defines the encodings that Biolect uses in its
|
4
|
+
approach to sequence compression.
|
5
|
+
"""
|
6
|
+
# ─── import statements ────────────────────────────────────────────────── ✦✦ ─
|
7
|
+
|
8
|
+
# standard library imports
|
9
|
+
from collections.abc import Generator
|
10
|
+
from typing import Final, Iterator, NamedTuple
|
11
|
+
|
12
|
+
# third-party imports
|
13
|
+
import Bio
|
14
|
+
import numpy as np
|
15
|
+
|
16
|
+
# local imports
|
17
|
+
|
18
|
+
|
19
|
+
# ─── interface ────────────────────────────────────────────────────────── ✦✦ ─
|
20
|
+
|
21
|
+
class PackedSequence(NamedTuple):
|
22
|
+
data: np.ndarray
|
23
|
+
length: int
|
24
|
+
ambiguous_positions: dict[int, str] | None
|
25
|
+
|
26
|
+
|
27
|
+
# ─── bit-packing ──────────────────────────────────────────────────────── ✦✦ ─
|
28
|
+
|
29
|
+
ENCODE_TABLE = str.maketrans("ATGC", "\x00\x01\x02\x03")
|
30
|
+
DECODE_TABLE = ["A", "T", "G", "C"]
|
31
|
+
|
32
|
+
def pack_sequence(seq: str | Bio.Seq) -> (
|
33
|
+
tuple[np.ndarray, dict[int, str] | None]
|
34
|
+
):
|
35
|
+
"""Pack a sequence of DNA nucleobases with ambiguity extraction."""
|
36
|
+
cleaned = []
|
37
|
+
ambiguous = {}
|
38
|
+
|
39
|
+
for i, base in enumerate(seq.upper()):
|
40
|
+
if base in "ATGC":
|
41
|
+
cleaned.append(ord(base.translate(ENCODE_TABLE)))
|
42
|
+
else:
|
43
|
+
# Use "A" as a plaecholder, and track the base.
|
44
|
+
cleaned.append(0)
|
45
|
+
ambiguous[i] = base
|
46
|
+
|
47
|
+
# ─── pack 4 bases per byte ───────────────────────────────────────────────
|
48
|
+
packed_length = (len(cleaned) + 3) // 4
|
49
|
+
packed = np.zeros(packed_length, dtype=np.uint8)
|
50
|
+
|
51
|
+
for i, base_code in enumerate(cleaned):
|
52
|
+
byte_idx = i // 4
|
53
|
+
bit_offset = (i % 4) * 2
|
54
|
+
packed[byte_idx] |= base_code << bit_offset
|
55
|
+
|
56
|
+
return packed, ambiguous if ambiguous else None
|
57
|
+
|
58
|
+
|
59
|
+
def unpack_sequence(packed_seq: PackedSequence, start: int, end: int) -> str:
|
60
|
+
"""Unpack a bit-packed subsequence into string representation."""
|
61
|
+
result = []
|
62
|
+
|
63
|
+
for pos in range(start, min(end, packed_seq.length)):
|
64
|
+
# Check for ambiguous base.
|
65
|
+
if packed_seq.ambiguous_positions and (
|
66
|
+
pos in packed_seq.ambiguous_positions
|
67
|
+
):
|
68
|
+
result.append(packed_seq.ambiguous_positions[pos])
|
69
|
+
continue
|
70
|
+
|
71
|
+
# Extract from packed data
|
72
|
+
byte_idx = pos // 4
|
73
|
+
bit_offset = (pos % 4) * 2
|
74
|
+
base_code = (packed_seq.data[byte_idx] >> bit_offset) & 0b11
|
75
|
+
result.append(DECODE_TABLE[base_code])
|
76
|
+
|
77
|
+
return "".join(result)
|
78
|
+
|
79
|
+
|
80
|
+
def vectorized_decode(packed_seq: PackedSequence) -> str:
|
81
|
+
"""High-performance full-sequence decode using NumPy."""
|
82
|
+
if packed_seq.length == 0:
|
83
|
+
return ""
|
84
|
+
|
85
|
+
# Expand packed bytes to individual bases.
|
86
|
+
expanded = np.zeros(packed_seq.length, dtype=np.uint8)
|
87
|
+
|
88
|
+
for i in range(packed_seq.length):
|
89
|
+
byte_idx = i // 4
|
90
|
+
bit_offset = (i % 4) * 2
|
91
|
+
expanded[i] = (packed_seq.data[byte_idx] >> bit_offset) & 0b11
|
92
|
+
|
93
|
+
base_chars = np.array(DECODE_TABLE, dtype="U1")[expanded]
|
94
|
+
result = "".join(base_chars)
|
95
|
+
|
96
|
+
# Apply ambiguous base substitutions.
|
97
|
+
if packed_seq.ambiguous_positions:
|
98
|
+
result_list = list(result)
|
99
|
+
for pos, ambiguous_base in packed_seq.ambiguous_positions.items():
|
100
|
+
if pos < len(result_list):
|
101
|
+
result_list[pos] = ambiguous_base
|
102
|
+
result = "".join(result_list)
|
103
|
+
|
104
|
+
return result
|
105
|
+
|
106
|
+
|
107
|
+
def iter_kmers(packed_seq: "PackedSequence", k: int) -> (
    Generator[str, None, None]
):
    """Memory-efficient k-mer generation.

    Only the first window is decoded as a whole; every following k-mer is
    produced by dropping the first base of the previous window and decoding
    the single base that enters it.

    Args:
        packed_seq (PackedSequence):
            The packed sequence to walk over.
        k (int):
            Window size; must be positive.

    Yields:
        Every k-mer of the sequence in order. Nothing is yielded when the
        sequence is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is not positive (raised on first iteration,
            since this is a generator).
    """
    # Without this check ``k == 0`` would yield an empty string followed by
    # 1-mers, and a negative ``k`` would decode out-of-range positions.
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}.")

    length = packed_seq.length
    if length < k:
        return

    # Use a "sliding window" approach.
    window = unpack_sequence(packed_seq, 0, k)
    yield window

    # ``new_pos`` is the position of the base entering the window.
    for new_pos in range(k, length):
        ambiguous = packed_seq.ambiguous_positions
        if ambiguous and new_pos in ambiguous:
            new_base = ambiguous[new_pos]
        else:
            shift = (new_pos % 4) * 2
            base_code = (packed_seq.data[new_pos // 4] >> shift) & 0b11
            new_base = DECODE_TABLE[base_code]

        # "Slide" the window by one position.
        window = window[1:] + new_base
        yield window
|
134
|
+
|
135
|
+
class SequenceStream:
    """Lazily read packed sequences from a FASTA-formatted line stream."""

    def __init__(self, file_handle):
        """Store the stream to read from.

        Args:
            file_handle:
                Any iterable that yields text lines (for example an open
                text file).
        """
        self.file_handle = file_handle
        # NOTE(review): this buffer is not used by any method visible here.
        self._buffer = bytearray(8192)

    def iter_sequences(self) -> "Generator[PackedSequence, None, None]":
        """Yield packed sequences from a FASTA stream.

        A record starts at a line beginning with ``>``; the following lines
        up to the next header are joined into one sequence. Blank lines are
        ignored, records without any sequence data are skipped, and lines
        that appear before the first header are discarded. The header text
        itself is not part of the yielded value.

        Yields:
            One ``PackedSequence`` per record that has sequence data.
        """
        current_header = None
        sequence_parts = []

        for raw_line in self.file_handle:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no bases and must not create a record.
                continue

            if not line.startswith(">"):
                sequence_parts.append(line)
                continue

            # A new header closes the record collected so far. Compare with
            # ``None`` so a record whose header text is empty is still kept.
            if current_header is not None and sequence_parts:
                yield self._pack_record(sequence_parts)

            current_header = line[1:]
            sequence_parts = []

        # Handle final sequence
        if current_header is not None and sequence_parts:
            yield self._pack_record(sequence_parts)

    @staticmethod
    def _pack_record(sequence_parts):
        """Join the collected lines of one record and pack them."""
        full_seq = "".join(sequence_parts)
        packed_data, ambiguous = pack_sequence(full_seq)

        return PackedSequence(
            data=packed_data,
            length=len(full_seq),
            ambiguous_positions=ambiguous
        )
|
175
|
+
|
176
|
+
|
177
|
+
class SequenceBatch:
    """Decode groups of packed sequences."""

    def __init__(self, batch_size: int = 1000) -> None:
        """Remember the batch size and start with an empty buffer.

        NOTE(review): neither attribute is read by ``process_batch``.
        """
        self.batch_size = batch_size
        self._sequence_buffer = []

    def process_batch(self, sequences: list[PackedSequence]) -> Iterator[str]:
        """Decode every sequence of the batch, shortest first.

        The input list is left untouched; a sorted copy (stable, ascending
        by ``length``) determines the order of the yielded strings, so the
        output order can differ from the input order.
        """
        by_length = list(sequences)
        by_length.sort(key=lambda item: item.length)

        yield from (vectorized_decode(item) for item in by_length)
|
192
|
+
|
193
|
+
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|