biolect 0.0.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. biolect/__init__.py +27 -0
  2. biolect/core/__init__.py +0 -0
  3. biolect/core/definitions/__init__.py +0 -0
  4. biolect/core/dtypes.py +62 -0
  5. biolect/core/errors/__init__.py +26 -0
  6. biolect/core/errors/etypes.py +55 -0
  7. biolect/core/errors/handlers.py +76 -0
  8. biolect/core/errors/monadic.py +0 -0
  9. biolect/core/errors/recovery.py +0 -0
  10. biolect/core/operations.py +0 -0
  11. biolect/core/pipeline.py +0 -0
  12. biolect/core/validation.py +0 -0
  13. biolect/formats/fasta.py +0 -0
  14. biolect/formats/fastq.py +0 -0
  15. biolect/formats/gff.py +0 -0
  16. biolect/formats/parsers/__init__.py +0 -0
  17. biolect/formats/parsers/safe_parsers.py +0 -0
  18. biolect/formats/parsers/validators.py +0 -0
  19. biolect/formats/phylip.py +0 -0
  20. biolect/formats/sambam.py +0 -0
  21. biolect/formats/vcf.py +0 -0
  22. biolect/integrations/__init__.py +0 -0
  23. biolect/integrations/blast.py +0 -0
  24. biolect/integrations/clustal.py +0 -0
  25. biolect/integrations/external.py +0 -0
  26. biolect/integrations/muscle.py +0 -0
  27. biolect/integrations/raxml.py +0 -0
  28. biolect/integrations/safe/__init__.py +0 -0
  29. biolect/integrations/safe/blast_safe.py +0 -0
  30. biolect/integrations/safe/tools_safe.py +0 -0
  31. biolect/operations/__init__.py +26 -0
  32. biolect/operations/alignment/__init__.py +0 -0
  33. biolect/operations/alignment/consensus.py +0 -0
  34. biolect/operations/alignment/multiple.py +0 -0
  35. biolect/operations/alignment/pairwise.py +0 -0
  36. biolect/operations/alignment/safe/__init__.py +0 -0
  37. biolect/operations/alignment/safe/monadic_ops.py +0 -0
  38. biolect/operations/analysis/__init__.py +0 -0
  39. biolect/operations/analysis/clustering.py +0 -0
  40. biolect/operations/analysis/quality.py +0 -0
  41. biolect/operations/analysis/safe/__init__.py +0 -0
  42. biolect/operations/analysis/safe/monadic_ops.py +0 -0
  43. biolect/operations/analysis/statistics.py +0 -0
  44. biolect/operations/analysis/visualization.py +0 -0
  45. biolect/operations/annotation/__init__.py +0 -0
  46. biolect/operations/annotation/comparative.py +0 -0
  47. biolect/operations/annotation/functional.py +0 -0
  48. biolect/operations/annotation/gene_calling.py +0 -0
  49. biolect/operations/annotation/safe/__init__.py +0 -0
  50. biolect/operations/annotation/safe/monadic_ops.py +0 -0
  51. biolect/operations/phylogenetics/__init__.py +0 -0
  52. biolect/operations/phylogenetics/distance.py +0 -0
  53. biolect/operations/phylogenetics/safe/__init__.py +0 -0
  54. biolect/operations/phylogenetics/safe/monadic_ops.py +0 -0
  55. biolect/operations/phylogenetics/trees.py +0 -0
  56. biolect/operations/phylogenetics/visualization.py +0 -0
  57. biolect/operations/sequencing/__init__.py +28 -0
  58. biolect/operations/sequencing/analysis.py +16 -0
  59. biolect/operations/sequencing/encoding.py +193 -0
  60. biolect/operations/sequencing/manipulation.py +0 -0
  61. biolect/operations/sequencing/safe/__init__.py +0 -0
  62. biolect/operations/sequencing/safe/monadic_ops.py +0 -0
  63. biolect/operations/sequencing/search.py +0 -0
  64. biolect/operations/sequencing/translation.py +0 -0
  65. biolect/utilities/__init__.py +0 -0
  66. biolect/utilities/caching.py +0 -0
  67. biolect/utilities/config.py +0 -0
  68. biolect/utilities/memory.py +0 -0
  69. biolect/utilities/parallel.py +0 -0
  70. biolect/utilities/progress.py +0 -0
  71. biolect/visualization/__init__.py +0 -0
  72. biolect/visualization/alignments.py +0 -0
  73. biolect/visualization/genomics.py +0 -0
  74. biolect/visualization/sequences.py +0 -0
  75. biolect/visualization/trees.py +0 -0
  76. biolect-0.0.0.dev1.dist-info/METADATA +40 -0
  77. biolect-0.0.0.dev1.dist-info/RECORD +79 -0
  78. biolect-0.0.0.dev1.dist-info/WHEEL +4 -0
  79. biolect-0.0.0.dev1.dist-info/licenses/LICENSE +661 -0
biolect/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """Biolect
2
+
3
+ This module initializes the Biolect DSL framework at its highest level.
4
+ """
5
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
6
+ from . import core, formats, integrations, operations, utilities, visualization
7
+
8
+ from .operations import (
9
+ PackedSequence, SequenceBatch, SequenceStream,
10
+ pack_sequence, unpack_sequence, vectorized_decode
11
+ )
12
+
13
+
14
# Names re-exported by the top-level ``biolect`` package.
__all__ = [
    # subpackages
    "core",
    "formats",
    "integrations",
    "operations",
    "utilities",
    "visualization",
    # classes
    "PackedSequence",
    "SequenceBatch",
    "SequenceStream",
    # functions
    "pack_sequence",
    "unpack_sequence",
    "vectorized_decode",
    # objects: none exported yet
]
File without changes
File without changes
biolect/core/dtypes.py ADDED
@@ -0,0 +1,62 @@
1
+ """Core Data Types
2
+
3
+ This module defines the framework's core data types.
4
+ """
5
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
6
+
7
+ # standard library import
8
+ from __future__ import annotations
9
+ from abc import ABC, abstractmethods
10
+ from collections.abc import Callable, Iterable
11
+ from dataclasses import dataclass
12
+ from enum import Enum, auto
13
+ from typing import (
14
+ # types
15
+ Any, Final, Iterator, NamedTuple, Protocol, TypeAlias, TypeVar,
16
+
17
+ # descriptors
18
+ final, overload, runtime_checkable
19
+ )
20
+
21
+ # third-party imports
22
+ import numpy as np
23
+
24
+ from Bio import SeqIO
25
+ from Bio.Seq import Seq
26
+ from returns import Maybe, Result
27
+
28
+ # local imports
29
+ # ...
30
+
31
+
32
+ # ─── enums ────────────────────────────────────────────────────────────── ✦✦ ─
33
+
34
+
35
class Alphabet(Enum):
    """Residue alphabets recognised for sequence data."""

    # Values are spelled out; they equal what ``auto()`` assigns in
    # declaration order (1, 2, 3).
    AA = 1
    DNA = 2
    RNA = 3
40
+
41
+
42
+
43
+ # ─── typing ───────────────────────────────────────────────────────────── ✦✦ ─
44
+
45
# Unconstrained, invariant type variables for generic annotations.
# NOTE(review): none of them is referenced by the definitions visible in this
# module; the intended roles of E, S and U are not established here.
T: TypeVar = TypeVar("T")
E: TypeVar = TypeVar("E")
S: TypeVar = TypeVar("S")
U: TypeVar = TypeVar("U")
49
+
50
+
51
@runtime_checkable
class Parseable(Protocol):
    """Runtime-checkable placeholder protocol.

    No members are declared yet, so every object currently satisfies it.
    """
54
+
55
+
56
class Domain(Enum):
    """Enumeration of genome domains (no members are declared yet).

    This is deliberately a plain ``Enum``. Decorating an ``Enum`` with
    ``@dataclass`` is unsupported: the generated ``__eq__`` makes the
    decorator set ``__hash__`` to ``None``, which would leave future members
    unhashable and unusable as dict keys or set elements.
    """
59
+
60
+
61
class Genome(ABC):
    """Abstract base class for genome types.

    No abstract methods are declared yet; the class only announces a
    ``domain`` attribute.
    """

    # Annotation only: no class-level value is assigned here.
    domain: Domain
@@ -0,0 +1,26 @@
1
+ """Core Error Framework
2
+
3
+ This module defines the Biolect DSL's core error-handling mechanisms.
4
+ """
5
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
6
+ from . import handlers, monadic, recovery, types
7
+
8
+ from .handlers import ErrorHandler, DefaultErrorHandler
9
+
10
# Every entry ends with a comma. Without one, Python concatenates adjacent
# string literals, silently producing bogus names such as
# "typesErrorHandler" and "DefaultErrorHandlerRecovery".
#
# NOTE(review): "types", "DefaultErrorHandler", "Recovery" and
# "SequencingError" are not defined by any module visible in this package
# (the error types live in ``etypes.py``) -- confirm the intended names.
__all__ = [
    # ─── module-level exports ────────────────────────────────────────────────
    "handlers", "monadic", "recovery", "types",

    # ─── class-level exports ─────────────────────────────────────────────────

    # error handlers
    "ErrorHandler", "DefaultErrorHandler",

    # monadic error handling

    # error recovery
    "Recovery",

    # error typedefs
    "SequencingError",
]
@@ -0,0 +1,55 @@
1
+ """Error Types
2
+
3
+ This module defines the Biolect DSL framework's error types.
4
+ """
5
+
6
+ # standard-library imports
7
+
8
+ # third-party imports
9
+ # ...
10
+
11
+ # local imports
12
+ # ...
13
+
14
+
15
+ # ─── error type hierarchy ─────────────────────────────────────────────── ✦✦ ─
16
+ class BiolectError(Exception):
17
+ """Base exception class for the Biolect DSL framework."""
18
+ pass
19
+
20
+
21
+ class DataFormatError(BiolectError):
22
+ """Base class for all sequence format and structure violations."""
23
+ def __init__(
24
+ self,
25
+ sequence,
26
+ position: int | None = None,
27
+ expected_alphabet: Alphabet | None = None
28
+ ) -> None:
29
+ self.sequence = sequence
30
+ self.position = position
31
+ self.expected_alphabet = expected_alphabet
32
+ super().__init__(f"Formatting error at position {position}.")
33
+
34
+
35
+ class InvalidSequenceError(DataFormatError):
36
+ """Raised when invalid characters are detected in a sequence."""
37
+ def __init__(
38
+ self,
39
+ sequence,
40
+ position: int | None = None,
41
+ expected_alphabet: Alphabet | None = None
42
+ ) -> None:
43
+ self.sequence = sequence
44
+ self.position = position
45
+ self.expected_alphabet = expected_alphabet
46
+ super().__init__(f"Invalid sequence at position {position}.")
47
+
48
+
49
+ class FileFormatError(DataFormatError):
50
+ """Raised when the file format does not match the expected standard."""
51
+ def __init__(self, filepath, expected_format, detected_format=None):
52
+ self.filepath = filepath
53
+ self.expected_format = expected_format
54
+ self.detected_format = detected_format
55
+
@@ -0,0 +1,76 @@
1
+ """Core Error Handlers
2
+
3
+ This module implements the core error-handling logic for
4
+ Biolect workflows.
5
+ """
6
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
7
+
8
+ # standard library imports
9
+ from abc import ABC, abstractmethod
10
+
11
+ # third-party imports
12
+ # ...
13
+
14
+ # local imports
15
+ from .etypes import BiolectError, FileFormatError
16
+
17
+
18
class ErrorHandler(ABC):
    """Abstract interface that concrete Biolect error handlers implement."""

    @abstractmethod
    def attempt_recovery(self, error: BiolectError, context: dict) -> bool:
        """Try to recover from ``error`` using the supplied ``context``."""

    @abstractmethod
    def should_retry(self, error: BiolectError, attempt_count: int) -> bool:
        """Decide whether recovery should be attempted again."""
36
+
37
+
38
class FileFormatErrorHandler(ErrorHandler):
    """Error handler for ``FileFormatError``.

    No recovery strategy is implemented yet, so both hooks report failure
    explicitly instead of falling through and returning ``None``.
    """

    def attempt_recovery(
        self,
        error: FileFormatError,
        context: dict
    ) -> bool:
        """Attempt to recover from a file formatting error.

        Args:
            error (FileFormatError):
                The exception object raised by the caller.
            context (dict):
                A dictionary object that contains details about the context
                surrounding the error.

        Returns:
            `True` if recovery succeeded, otherwise `False`. Currently always
            `False` because no recovery strategy is implemented.
        """
        return False

    def should_retry(
        self,
        error: FileFormatError,
        attempt_count: int
    ) -> bool:
        """Determine if the recovery operation should be reattempted.

        The second parameter is named ``attempt_count`` to match the
        abstract ``ErrorHandler.should_retry`` signature.

        Args:
            error (FileFormatError):
                The exception object raised by the caller.
            attempt_count (int):
                The number of recovery attempts made so far.

        Returns:
            `True` if another attempt should be made, otherwise `False`.
            Currently always `False` because no recovery strategy is
            implemented.
        """
        return False
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
biolect/formats/gff.py ADDED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
biolect/formats/vcf.py ADDED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,26 @@
1
+ """Operations Framework Initialization
2
+
3
+ This module provides the initialization logic for the Biolect DSL's
4
+ operations framework at a high level.
5
+ """
6
+ from . import (
7
+ alignment, analysis, annotation, phylogenetics,
8
+ sequencing, utilities, visualization
9
+ )
10
+
11
+ from .sequencing import (
12
+ PackedSequence, SequenceStream, SequenceBatch,
13
+ pack_sequence, unpack_sequence, vectorized_decode
14
+ )
15
+
16
# Names re-exported by ``biolect.operations``.
# NOTE(review): the package file listing shows no ``utilities`` or
# ``visualization`` entry under ``biolect/operations`` -- confirm that these
# two names resolve.
__all__ = [
    # subpackages
    "alignment",
    "analysis",
    "annotation",
    "phylogenetics",
    "sequencing",
    "utilities",
    "visualization",
    # classes
    "PackedSequence",
    "SequenceStream",
    "SequenceBatch",
    # functions
    "pack_sequence",
    "unpack_sequence",
    "vectorized_decode",
]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,28 @@
1
+ """Sequencing Framework Initialization
2
+
3
+ This module provides the initialization logic for the Biolect DSL
4
+ sequencing framework.
5
+ """
6
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
7
+ from . import analysis, encoding, manipulation, search, translation
8
+
9
+ from .encoding import (
10
+ DECODE_TABLE, ENCODE_TABLE,
11
+ PackedSequence, SequenceBatch, SequenceStream,
12
+ iter_kmers, pack_sequence, unpack_sequence, vectorized_decode
13
+ )
14
+
15
+
16
# Names re-exported by ``biolect.operations.sequencing``.
__all__ = [
    # constants
    "DECODE_TABLE",
    "ENCODE_TABLE",
    # modules
    "analysis",
    "encoding",
    "manipulation",
    "search",
    "translation",
    # classes
    "PackedSequence",
    "SequenceBatch",
    "SequenceStream",
    # functions
    "iter_kmers",
    "pack_sequence",
    "unpack_sequence",
    "vectorized_decode",
]
@@ -0,0 +1,16 @@
1
+ """Sequence Analysis
2
+
3
+ This module defines the Biolect DSL sequence analysis engine.
4
+ """
5
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
6
+
7
+ # standard library imports
8
+ from __future__ import annotations
9
+ from collections.abc import Generator
10
+
11
+
12
+ # third-party imports
13
+ # ...
14
+
15
+ # local imports
16
+ # ...
@@ -0,0 +1,193 @@
1
+ """Core Encodings
2
+
3
+ This module defines the encodings that Biolect uses in its
4
+ approach to sequence compression.
5
+ """
6
+ # ─── import statements ────────────────────────────────────────────────── ✦✦ ─
7
+
8
+ # standard library imports
9
+ from collections.abc import Generator
10
+ from typing import Final, Iterator, NamedTuple
11
+
12
+ # third-party imports
13
+ import Bio
14
+ import numpy as np
15
+
16
+ # local imports
17
+
18
+
19
+ # ─── interface ────────────────────────────────────────────────────────── ✦✦ ─
20
+
21
class PackedSequence(NamedTuple):
    """A nucleotide sequence stored as 2-bit codes plus ambiguity overrides."""

    # Packed base codes, four bases per byte; base ``i`` occupies bits
    # ``(i % 4) * 2`` and up of byte ``i // 4``.
    data: np.ndarray
    # Number of bases in the sequence (not the number of packed bytes).
    length: int
    # Maps a base position to its original non-ATGC character; ``None`` when
    # the sequence has no such positions.
    ambiguous_positions: dict[int, str] | None
25
+
26
+
27
+ # ─── bit-packing ──────────────────────────────────────────────────────── ✦✦ ─
28
+
29
# Translation table mapping the ordinals of "A", "T", "G", "C" to 2-bit codes.
ENCODE_TABLE = str.maketrans("ATGC", "\x00\x01\x02\x03")
# Inverse lookup: 2-bit code -> base character.
DECODE_TABLE = ["A", "T", "G", "C"]


def pack_sequence(seq: "str | Bio.Seq.Seq") -> (
    "tuple[np.ndarray, dict[int, str] | None]"
):
    """Pack a sequence of DNA nucleobases with ambiguity extraction.

    The annotations are quoted because ``Bio.Seq`` is a module rather than a
    class, so evaluating ``str | Bio.Seq`` at definition time fails.

    Args:
        seq: The sequence to pack; it is upper-cased before encoding.

    Returns:
        A ``(packed, ambiguous)`` tuple. ``packed`` is a ``uint8`` array
        holding four 2-bit base codes per byte, lowest-order bits first.
        ``ambiguous`` maps the position of every non-ATGC character to that
        character, or is ``None`` when there are none.
    """
    bases = str(seq).upper()

    codes = []
    ambiguous = {}
    for i, base in enumerate(bases):
        code = ENCODE_TABLE.get(ord(base))
        if code is None:
            # Use "A" (code 0) as a placeholder, and track the real base.
            ambiguous[i] = base
            code = 0
        codes.append(code)

    # ─── pack 4 bases per byte ───────────────────────────────────────────────
    packed_length = (len(codes) + 3) // 4

    # Zero-pad to whole bytes so the codes reshape into rows of four.
    padded = np.zeros(packed_length * 4, dtype=np.uint8)
    padded[:len(codes)] = codes

    # Shift each column to its bit offset, then OR the four columns together.
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8)
    packed = np.bitwise_or.reduce(
        padded.reshape(packed_length, 4) << shifts, axis=1
    )

    return packed, ambiguous if ambiguous else None
57
+
58
+
59
def unpack_sequence(packed_seq: PackedSequence, start: int, end: int) -> str:
    """Decode positions ``start`` (inclusive) to ``end`` (exclusive).

    ``end`` is capped at the sequence length. Positions recorded as
    ambiguous yield their stored character instead of the packed code.
    """
    overrides = packed_seq.ambiguous_positions
    stop = min(end, packed_seq.length)
    bases = []

    for index in range(start, stop):
        if overrides and index in overrides:
            bases.append(overrides[index])
        else:
            # Four bases per byte, two bits each, lowest-order bits first.
            byte_index, slot = divmod(index, 4)
            code = (packed_seq.data[byte_index] >> (slot * 2)) & 0b11
            bases.append(DECODE_TABLE[code])

    return "".join(bases)
78
+
79
+
80
def vectorized_decode(packed_seq: PackedSequence) -> str:
    """High-performance full-sequence decode using NumPy.

    All 2-bit codes are extracted with one broadcast shift-and-mask instead
    of a Python loop over every base.

    Args:
        packed_seq: The packed sequence to decode.

    Returns:
        The decoded sequence, with ambiguous positions restored.

    Raises:
        IndexError: If ``packed_seq.data`` holds fewer bytes than
            ``packed_seq.length`` requires.
    """
    length = packed_seq.length
    if length == 0:
        return ""

    data = np.asarray(packed_seq.data)
    n_bytes = (length + 3) // 4
    if data.shape[0] < n_bytes:
        # Slicing would silently truncate; fail as per-element indexing would.
        raise IndexError(
            f"packed data holds {data.shape[0]} bytes, "
            f"but {n_bytes} are needed for {length} bases"
        )

    # Expand packed bytes to individual bases: each row is one byte shifted
    # to its four bit offsets, flattened row by row and trimmed to length.
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8)
    expanded = ((data[:n_bytes, np.newaxis] >> shifts) & 0b11).reshape(-1)
    expanded = expanded[:length]

    # Convert to a list so substitutions are not truncated to a fixed width.
    result_list = np.array(DECODE_TABLE, dtype="U1")[expanded].tolist()

    # Apply ambiguous base substitutions.
    if packed_seq.ambiguous_positions:
        for pos, ambiguous_base in packed_seq.ambiguous_positions.items():
            if pos < len(result_list):
                result_list[pos] = ambiguous_base

    return "".join(result_list)
105
+
106
+
107
def iter_kmers(packed_seq: PackedSequence, k: int) -> (
    Generator[str, None, None]
):
    """Memory-efficient k-mer generation.

    Yields every length-``k`` window of the sequence in order, decoding one
    new base per step. Nothing is yielded when the sequence is shorter than
    ``k``.

    Args:
        packed_seq: The packed sequence to scan.
        k: The window length; must be at least 1.

    Raises:
        ValueError: If ``k`` is less than 1. This is a generator, so the
            error surfaces on the first iteration rather than at call time.
    """
    if k < 1:
        # A non-positive k would make the sliding window yield strings that
        # are not k-mers at all.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    if packed_seq.length < k:
        return

    # Use a "sliding window" approach.
    window = unpack_sequence(packed_seq, 0, k)
    yield window

    for new_pos in range(k, packed_seq.length):
        # "Slide" the window by one position. unpack_sequence handles both
        # the ambiguity lookup and the bit extraction for the incoming base.
        window = window[1:] + unpack_sequence(packed_seq, new_pos, new_pos + 1)
        yield window
134
+
135
class SequenceStream:
    """Read FASTA records from a text handle and yield them bit-packed."""

    def __init__(self, file_handle):
        """Wrap ``file_handle``, an iterable of text lines."""
        self.file_handle = file_handle
        # NOTE(review): no method visible in this class reads this buffer.
        self._buffer = bytearray(8192)

    def iter_sequences(self) -> Generator[PackedSequence, None, None]:
        """Yield packed sequences from a FASTA stream.

        Lines before the first header are discarded, and records without
        sequence lines are skipped. Header text is not carried into the
        yielded ``PackedSequence``.
        """
        current_header = None
        sequence_parts = []

        for line in self.file_handle:
            line = line.strip()
            if line.startswith(">"):
                # Compare against None: a bare ">" gives an empty header,
                # which a truthiness test would treat as "no record".
                if current_header is not None and sequence_parts:
                    yield self._pack_record(sequence_parts)

                current_header = line[1:]
                sequence_parts = []
            else:
                sequence_parts.append(line)

        # Handle final sequence
        if current_header is not None and sequence_parts:
            yield self._pack_record(sequence_parts)

    @staticmethod
    def _pack_record(sequence_parts) -> PackedSequence:
        """Join the lines of one record and bit-pack the result."""
        full_seq = "".join(sequence_parts)
        packed_data, ambiguous = pack_sequence(full_seq)

        return PackedSequence(
            data=packed_data,
            length=len(full_seq),
            ambiguous_positions=ambiguous
        )
175
+
176
+
177
class SequenceBatch:
    """Decode groups of packed sequences."""

    def __init__(self, batch_size: int = 1000) -> None:
        """Store the batch size and start with an empty buffer."""
        self._sequence_buffer = []
        self.batch_size = batch_size

    def process_batch(self, sequences: list[PackedSequence]) -> Iterator[str]:
        """Lazily decode ``sequences``, shortest first.

        The input list is not modified; results come out in ascending
        length order, not in input order.
        """
        by_length = sorted(sequences, key=lambda item: item.length)
        yield from (vectorized_decode(item) for item in by_length)
192
+
193
+
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes