geney-1.4.40-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geney/__init__.py +25 -0
- geney/engines.py +307 -0
- geney/oncosplice.py +411 -0
- geney/pipelines.py +97 -0
- geney/samples.py +3 -0
- geney/splice_graph.py +371 -0
- geney/splicing_table.py +142 -0
- geney/transcripts.py +68 -0
- geney/utils.py +254 -0
- geney/variants.py +389 -0
- geney-1.4.40.dist-info/METADATA +32 -0
- geney-1.4.40.dist-info/RECORD +14 -0
- geney-1.4.40.dist-info/WHEEL +5 -0
- geney-1.4.40.dist-info/top_level.txt +1 -0
geney/utils.py
ADDED
@@ -0,0 +1,254 @@
__all__ = ['is_monotonic', 'contains', 'unload_json', 'unload_pickle', 'dump_json', 'dump_pickle', 'generate_random_nucleotide_sequences', 'generate_random_sequence', 'short_hash_of_list']

import pickle
import json
from pathlib import Path
from bisect import bisect_left
import hashlib
import random
from typing import Any, List, Sequence, Union

# def is_monotonic(A):
#     x, y = [], []
#     x.extend(A)
#     y.extend(A)
#     x.sort()
#     y.sort(reverse=True)
#     if (x == A or y == A):
#         return True
#     return False


# def available_genes(organism='hg38'):
#     from geney import config
#     annotation_path = config[organism]['MRNA_PATH'] / 'protein_coding'
#     return sorted(list(set([m.stem.split('_')[-1] for m in annotation_path.glob('*')])))


def contains(a: Sequence[Any], x: Any) -> bool:
    """Check if sorted sequence contains value using binary search.

    Args:
        a: Sorted sequence to search in
        x: Value to search for

    Returns:
        True if value is found, False otherwise

    Raises:
        TypeError: If sequence is not sortable
    """
    if not hasattr(a, '__len__') or not hasattr(a, '__getitem__'):
        raise TypeError("First argument must be a sequence")

    try:
        i = bisect_left(a, x)
        return i != len(a) and a[i] == x
    except TypeError as e:
        raise TypeError(f"Cannot compare types in sequence: {e}") from e


def unload_json(file_path: Union[str, Path]) -> Any:
    """Load data from JSON file.

    Args:
        file_path: Path to JSON file

    Returns:
        Loaded data structure

    Raises:
        FileNotFoundError: If file doesn't exist
        JSONDecodeError: If file contains invalid JSON
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"JSON file not found: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            return data
    except json.JSONDecodeError as e:
        raise json.JSONDecodeError(f"Invalid JSON in file {file_path}: {e.msg}", e.doc, e.pos) from e


def dump_json(file_path: Union[str, Path], payload: Any, indent: int = 2) -> None:
    """Save data to JSON file.

    Args:
        file_path: Path to output JSON file
        payload: Data to save
        indent: JSON indentation level

    Raises:
        TypeError: If payload is not JSON serializable
        PermissionError: If cannot write to file
    """
    file_path = Path(file_path)

    # Create parent directory if it doesn't exist
    file_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=indent, ensure_ascii=False)
    except TypeError as e:
        raise TypeError(f"Cannot serialize data to JSON: {e}") from e


def unload_pickle(file_path: Union[str, Path]) -> Any:
    """Load data from pickle file.

    Args:
        file_path: Path to pickle file

    Returns:
        Loaded data structure

    Raises:
        FileNotFoundError: If file doesn't exist
        pickle.UnpicklingError: If file contains invalid pickle data
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"Pickle file not found: {file_path}")

    try:
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
            return data
    except pickle.UnpicklingError as e:
        raise pickle.UnpicklingError(f"Invalid pickle data in file {file_path}: {e}") from e


def dump_pickle(file_path: Union[str, Path], payload: Any) -> None:
    """Save data to pickle file.

    Args:
        file_path: Path to output pickle file
        payload: Data to save

    Raises:
        PermissionError: If cannot write to file
    """
    file_path = Path(file_path)

    # Create parent directory if it doesn't exist
    file_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        with open(file_path, 'wb') as f:
            pickle.dump(payload, f)
    except Exception as e:
        raise RuntimeError(f"Failed to save pickle file {file_path}: {e}") from e


def is_monotonic(A: Sequence[Any]) -> bool:
    """Check if sequence is monotonic (non-decreasing or non-increasing).

    Args:
        A: Sequence to check

    Returns:
        True if sequence is monotonic, False otherwise

    Raises:
        TypeError: If sequence elements are not comparable
    """
    if not hasattr(A, '__len__') or len(A) < 2:
        return True

    try:
        return (all(x <= y for x, y in zip(A, A[1:])) or
                all(x >= y for x, y in zip(A, A[1:])))
    except TypeError as e:
        raise TypeError(f"Cannot compare sequence elements: {e}") from e


def generate_random_sequence(length: int) -> str:
    """Generate a random DNA sequence of given length.

    Args:
        length: Length of sequence to generate

    Returns:
        Random DNA sequence containing only A, C, G, T

    Raises:
        ValueError: If length is not positive
    """
    if not isinstance(length, int):
        raise TypeError(f"Length must be integer, got {type(length).__name__}")

    if length <= 0:
        raise ValueError(f"Length must be positive, got {length}")

    return ''.join(random.choices('ACGT', k=length))


def generate_random_nucleotide_sequences(num_sequences: int, min_len: int = 3, max_len: int = 10) -> List[str]:
    """
    Generate random DNA sequences of variable lengths.

    Args:
        num_sequences: Number of sequences to generate
        min_len: Minimum sequence length
        max_len: Maximum sequence length

    Returns:
        List of random nucleotide sequences

    Raises:
        ValueError: If parameters are invalid
    """
    if not isinstance(num_sequences, int) or num_sequences <= 0:
        raise ValueError(f"num_sequences must be positive integer, got {num_sequences}")

    if not isinstance(min_len, int) or min_len <= 0:
        raise ValueError(f"min_len must be positive integer, got {min_len}")

    if not isinstance(max_len, int) or max_len <= 0:
        raise ValueError(f"max_len must be positive integer, got {max_len}")

    if min_len > max_len:
        raise ValueError(f"min_len ({min_len}) cannot be greater than max_len ({max_len})")

    nucleotides = ['A', 'C', 'G', 'T']
    lengths = list(range(min_len, max_len + 1))

    sequences = [
        ''.join(random.choices(nucleotides, k=random.choice(lengths)))
        for _ in range(num_sequences)
    ]
    return sequences


def short_hash_of_list(numbers: List[Any], length: int = 5) -> str:
    """Generate a short hash string from a list of numbers.

    Args:
        numbers: List of values to hash
        length: Length of output hash string

    Returns:
        Short hash string

    Raises:
        ValueError: If length is not positive
    """
    if not isinstance(length, int) or length <= 0:
        raise ValueError(f"Length must be positive integer, got {length}")

    if length > 64:  # SHA256 hex digest is 64 characters
        raise ValueError(f"Length cannot exceed 64, got {length}")

    try:
        encoded = repr(numbers).encode('utf-8')
        full_hash = hashlib.sha256(encoded).hexdigest()
        return full_hash[:length]
    except Exception as e:
        raise RuntimeError(f"Failed to generate hash: {e}") from e
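Taken together, these helpers cover binary-search membership, monotonicity checks, JSON/pickle round-trips, random nucleotide generation, and short hashing. A minimal usage sketch follows, assuming the wheel is installed and these names are importable from geney.utils as the __all__ list declares; the values and file path are illustrative only.

# Hypothetical usage sketch for the utilities above; assumes `geney` is installed
# and these names are exported from geney.utils as listed in __all__.
from geney.utils import (
    contains, is_monotonic, generate_random_sequence,
    dump_json, unload_json, short_hash_of_list,
)

exon_starts = [100, 250, 400, 900]          # must be sorted for contains()
print(contains(exon_starts, 400))           # True (binary search via bisect_left)
print(is_monotonic(exon_starts))            # True (non-decreasing)

seq = generate_random_sequence(12)          # e.g. 'ACGTTGCAAGTC'
key = short_hash_of_list(exon_starts)       # first 5 hex chars of a SHA-256 digest

dump_json("out/example.json", {"seq": seq, "key": key})  # creates out/ if missing
print(unload_json("out/example.json"))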
geney/variants.py
ADDED
@@ -0,0 +1,389 @@
# oncosplice/variants.py
from __future__ import annotations

import re
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

__all__ = ["Mutation", "MutationalEvent", "MutationLibrary"]

# GENE:CHR:POS:REF:ALT
_MUTATION_ID_RE = re.compile(
    r"^([^:]+):([^:]+):(\d+):([ACGTN\-]+):([ACGTN\-]+)$",
    re.IGNORECASE,
)


class Mutation:
    """Represents a single mutation with genomic coordinates and alleles."""

    def __init__(self, gene: str, chrom: str, pos: Union[int, str], ref: str, alt: str):
        if not gene:
            raise ValueError("Gene name cannot be empty")
        if not chrom:
            raise ValueError("Chromosome cannot be empty")
        if not ref or not alt:
            raise ValueError("Reference and alternate alleles cannot be empty")

        self.gene = gene
        self.chrom = chrom

        try:
            self.pos = int(pos)
        except (ValueError, TypeError) as e:
            raise ValueError(f"Position must be numeric, got '{pos}'") from e

        if self.pos < 0:
            raise ValueError(f"Position must be non-negative, got {self.pos}")

        valid_chars = set("ACGTN-")
        ref_u = ref.upper()
        alt_u = alt.upper()
        if not all(c in valid_chars for c in ref_u):
            raise ValueError(f"Invalid characters in reference allele: {ref}")
        if not all(c in valid_chars for c in alt_u):
            raise ValueError(f"Invalid characters in alternate allele: {alt}")

        self.ref = ref_u
        self.alt = alt_u
        self.mut_type = self._infer_type()

    @classmethod
    def from_id(cls, mut_id: str) -> "Mutation":
        m = _MUTATION_ID_RE.match(mut_id.strip())
        if not m:
            raise ValueError(
                f"Invalid mutation ID '{mut_id}'. Expected GENE:CHROM:POS:REF:ALT"
            )
        return cls(*m.groups())

    def _infer_type(self) -> str:
        if self.ref == "-" or self.alt == "-":
            return "indel"
        if len(self.ref) == len(self.alt) == 1:
            return "snp"
        return "indel"

    @property
    def span(self) -> int:
        ref_len = 0 if self.ref == "-" else len(self.ref)
        alt_len = 0 if self.alt == "-" else len(self.alt)
        return max(ref_len, alt_len, 1)

    @property
    def end(self) -> int:
        return self.pos + self.span

    def overlaps_with(self, other: "Mutation") -> bool:
        if not isinstance(other, Mutation):
            raise TypeError(f"Expected Mutation, got {type(other).__name__}")
        if self.chrom != other.chrom:
            return False
        return not (self.end <= other.pos or other.end <= self.pos)

    def to_dict(self) -> Dict[str, Union[str, int]]:
        return {
            "gene": self.gene,
            "chrom": self.chrom,
            "pos": self.pos,
            "ref": self.ref,
            "alt": self.alt,
            "type": self.mut_type,
        }

    def __repr__(self) -> str:
        return f"{self.gene}:{self.chrom}:{self.pos}:{self.ref}:{self.alt}"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Mutation):
            return False
        return (
            self.gene == other.gene
            and self.chrom == other.chrom
            and self.pos == other.pos
            and self.ref == other.ref
            and self.alt == other.alt
        )

    def __hash__(self) -> int:
        return hash((self.gene, self.chrom, self.pos, self.ref, self.alt))


class MutationalEvent:
    """Represents a compound mutational event (one or more mutations)."""

    def __init__(self, mut_id: str):
        if not mut_id:
            raise ValueError("Mutation ID cannot be empty")

        self.raw = mut_id.strip()
        try:
            self.mutations: List[Mutation] = self._parse_mutations(self.raw)
        except Exception as e:
            raise ValueError(f"Failed to parse mutation ID '{mut_id}': {e}") from e

        if not self.mutations:
            raise ValueError(f"No valid mutations found in '{mut_id}'")

        self.gene = self._verify_same_gene()

    def __len__(self) -> int:
        return len(self.mutations)

    @staticmethod
    def _parse_mutations(mut_id: str) -> List[Mutation]:
        parts = re.split(r"[|,]", mut_id)
        out: List[Mutation] = []
        for i, part in enumerate(parts):
            part = part.strip()
            if not part:
                continue

            m = _MUTATION_ID_RE.match(part)
            if not m:
                raise ValueError(
                    f"Invalid format for mutation #{i+1}: '{part}'. "
                    "Expected GENE:CHROM:POS:REF:ALT"
                )
            out.append(Mutation(*m.groups()))
        return out

    def _verify_same_gene(self) -> str:
        genes = {m.gene for m in self.mutations}
        if len(genes) != 1:
            raise ValueError(
                f"All mutations must be in the same gene, found: {', '.join(sorted(genes))}"
            )
        return genes.pop()

    @property
    def chrom(self) -> str:
        chroms = {m.chrom for m in self.mutations}
        if len(chroms) != 1:
            raise ValueError(
                f"Mutations span multiple chromosomes: {', '.join(sorted(chroms))}"
            )
        return chroms.pop()

    @property
    def positions(self) -> List[int]:
        return [m.pos for m in self.mutations]

    @property
    def central_position(self) -> int:
        return int(np.mean(self.positions))

    @property
    def position(self) -> int:
        return self.central_position

    @property
    def types(self) -> List[str]:
        return [m.mut_type for m in self.mutations]

    def compatible(self) -> bool:
        for i, m1 in enumerate(self.mutations):
            for m2 in self.mutations[i + 1:]:
                if m1.overlaps_with(m2):
                    return False
        return True

    def validate(self) -> None:
        if not self.mutations:
            raise ValueError("Event contains no mutations")

        if not self.compatible():
            overlapping = []
            for i, m1 in enumerate(self.mutations):
                for m2 in self.mutations[i + 1:]:
                    if m1.overlaps_with(m2):
                        overlapping.append(f"{m1} overlaps with {m2}")
            raise ValueError(
                f"Mutations are not compatible: {'; '.join(overlapping)}"
            )

        chroms = {m.chrom for m in self.mutations}
        if len(chroms) > 1:
            raise ValueError(
                f"Mutations span multiple chromosomes: {', '.join(sorted(chroms))}"
            )

    def to_dataframe(self) -> pd.DataFrame:
        if not self.mutations:
            return pd.DataFrame(
                columns=["gene", "chrom", "pos", "ref", "alt", "type"]
            )
        return pd.DataFrame([m.to_dict() for m in self.mutations])

    def mutation_args(self) -> List[Tuple[int, str, str]]:
        return [(m.pos, m.ref, m.alt) for m in self.mutations]

    def __iter__(self):
        return iter(self.mutation_args())

    def __repr__(self) -> str:
        muts = ", ".join(f"{m.pos}:{m.ref}>{m.alt}" for m in self.mutations)
        return f"MutationalEvent({self.gene} -> [{muts}])"

    def __str__(self) -> str:
        return f"{self.gene}: {len(self.mutations)} mutation(s)"


class MutationLibrary:
    """
    Collection of mutational events built from IDs, text files, or VCF files.

    events: mapping from raw mutation ID string -> MutationalEvent (validated)
    """

    def __init__(self, events: Optional[Dict[str, MutationalEvent]] = None):
        self.events: Dict[str, MutationalEvent] = events or {}
        self._errors: Dict[str, str] = {}

    @classmethod
    def from_mutation_ids(
        cls,
        mut_ids: Iterable[str],
        *,
        validate: bool = True,
        skip_invalid: bool = True,
    ) -> "MutationLibrary":
        events: Dict[str, MutationalEvent] = {}
        errors: Dict[str, str] = {}

        for mid in mut_ids:
            mid = mid.strip()
            if not mid:
                continue

            try:
                ev = MutationalEvent(mid)
                if validate:
                    ev.validate()
                events[mid] = ev
            except Exception as e:
                if skip_invalid:
                    errors[mid] = str(e)
                    continue
                raise

        lib = cls(events)
        lib._errors = errors
        return lib

    @classmethod
    def from_text_file(
        cls,
        path: Union[str, Path],
        comment_char: str = "#",
        *,
        validate: bool = True,
        skip_invalid: bool = True,
    ) -> "MutationLibrary":
        p = Path(path)
        mut_ids: List[str] = []
        with p.open() as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith(comment_char):
                    continue
                mut_ids.append(line)
        return cls.from_mutation_ids(
            mut_ids,
            validate=validate,
            skip_invalid=skip_invalid,
        )

    @classmethod
    def from_vcf(
        cls,
        path: Union[str, Path],
        gene_field: Optional[str] = "GENE",
        *,
        validate: bool = True,
        skip_invalid: bool = True,
    ) -> "MutationLibrary":
        p = Path(path)

        df = pd.read_csv(
            p,
            sep="\t",
            comment="#",
            dtype={"CHROM": str},
        )

        required = {"CHROM", "POS", "REF", "ALT"}
        missing = required - set(df.columns)
        if missing:
            raise ValueError(
                f"VCF is missing required columns: {', '.join(sorted(missing))}"
            )

        mut_ids: List[str] = []
        for _, row in df.iterrows():
            chrom = str(row["CHROM"])
            pos = int(row["POS"])
            ref = str(row["REF"])
            alts = str(row["ALT"]).split(",")

            if gene_field is not None and gene_field in row and pd.notna(row[gene_field]):
                gene = str(row[gene_field])
            else:
                gene = "."

            for alt in alts:
                mut_ids.append(f"{gene}:{chrom}:{pos}:{ref}:{alt}")

        return cls.from_mutation_ids(
            mut_ids,
            validate=validate,
            skip_invalid=skip_invalid,
        )

    def __len__(self) -> int:
        return len(self.events)

    def __iter__(self):
        return iter(self.events.items())

    def __contains__(self, mut_id: str) -> bool:
        return mut_id in self.events

    def get(self, mut_id: str) -> Optional[MutationalEvent]:
        return self.events.get(mut_id)

    def add(self, mut_id: str, *, validate: bool = True) -> Optional[MutationalEvent]:
        mut_id = mut_id.strip()
        if not mut_id:
            return None

        try:
            ev = MutationalEvent(mut_id)
            if validate:
                ev.validate()
            self.events[ev.raw] = ev
            return ev
        except Exception as e:
            self._errors[mut_id] = str(e)
            return None

    def to_dataframe(self) -> pd.DataFrame:
        records: List[Dict[str, Union[str, int]]] = []
        for eid, event in self.events.items():
            df = event.to_dataframe()
            if not df.empty:
                df = df.copy()
                df["event_id"] = eid
                records.append(df)
        if not records:
            return pd.DataFrame(
                columns=["event_id", "gene", "chrom", "pos", "ref", "alt", "type"]
            )
        return pd.concat(records, ignore_index=True)

    @property
    def errors(self) -> Dict[str, str]:
        return dict(self._errors)
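The module exposes three layers: Mutation (a single GENE:CHROM:POS:REF:ALT record), MutationalEvent (one or more co-occurring mutations in the same gene, separated by '|' or ','), and MutationLibrary (a validated collection that tracks parse errors). A minimal usage sketch follows, assuming the wheel is installed; the gene name, chromosome, and coordinates are illustrative placeholders, not data from this package.

# Hypothetical usage sketch for the variants API above; assumes `geney` is
# installed and these classes are importable from geney.variants.
from geney.variants import Mutation, MutationalEvent, MutationLibrary

# Single mutation from a GENE:CHROM:POS:REF:ALT identifier (illustrative values)
m = Mutation.from_id("KRAS:12:25245350:C:T")
print(m.mut_type, m.span, m.end)            # 'snp', 1, 25245351

# Compound event: two mutations in the same gene, separated by '|' or ','
ev = MutationalEvent("KRAS:12:25245350:C:T|KRAS:12:25245400:G:-")
ev.validate()                                # raises if overlapping or multi-chromosome
print(ev.positions, ev.central_position)

# Library built from raw IDs; invalid entries are collected in .errors
lib = MutationLibrary.from_mutation_ids([
    "KRAS:12:25245350:C:T",
    "not-a-valid-id",
])
print(len(lib), lib.errors)                  # 1, {'not-a-valid-id': '...'}
print(lib.to_dataframe().columns.tolist())   # gene, chrom, pos, ref, alt, type, event_id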
geney-1.4.40.dist-info/METADATA
ADDED
@@ -0,0 +1,32 @@
Metadata-Version: 2.4
Name: geney
Version: 1.4.40
Summary: A Python package for gene expression modeling.
Home-page: https://github.com/nicolaslynn/geney
Author: Nicolas Lynn
Author-email: nicolasalynn@gmail.com
License: Free for non-commercial use
Classifier: Development Status :: 1 - Planning
Classifier: Intended Audience :: Science/Research
Classifier: License :: Free for non-commercial use
Classifier: Operating System :: POSIX :: Linux
Classifier: Operating System :: MacOS
Classifier: Programming Language :: Python :: 3.9
Requires-Python: >3.10
Requires-Dist: numpy<2.0
Requires-Dist: pandas==2.1.4
Requires-Dist: biopython>=1.81
Requires-Dist: matplotlib
Requires-Dist: seaborn
Requires-Dist: tensorflow>=2.8.0
Requires-Dist: keras>=2.8.0
Requires-Dist: torch
Requires-Dist: seqmat
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: home-page
Dynamic: license
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
geney-1.4.40.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
geney/__init__.py,sha256=1V1SxqcLFPxRJOqr4VmGillv1r4_azJtbmNtf0pZ18I,684
geney/engines.py,sha256=ZK6x0YdY8_yPRTUmhwL8GWcuS3U5OotqMJBKPE-z7cE,10548
geney/oncosplice.py,sha256=eGQQl9ftmoFENMYBWoJtenKWmzyxR9N1of5cZst_bHQ,18014
geney/pipelines.py,sha256=zK1zDFFAxElnxgXWeM_xZqEZtwxyF7CwmtQLCkKOq2w,3356
geney/samples.py,sha256=3KrWNILHYql-vPC_TidkzqDuFaLx3JSJZbUoVW2RTlo,92
geney/splice_graph.py,sha256=wCStApnnrwbej_yhk_s39p5sQatRtqg9Ve8GqH2ZfGA,14849
geney/splicing_table.py,sha256=mXDXUr4h_q7grYQpmXO5Ex15Mt7BchieWF9lawd6src,5412
geney/transcripts.py,sha256=I6NmBcW9QG5XtRumn6i0TeT8tKECHQycsbSSZ7e8LZo,2601
geney/utils.py,sha256=pv4_LPIzjYAxwUgmufZJL6UhVVq2SllpF90ix_uH_-Q,7627
geney/variants.py,sha256=vjbiBH-duZ4TJZyXwXbQ_VmJxCFafjeDwLNTZg3ubSc,11832
geney-1.4.40.dist-info/METADATA,sha256=BiZJ2yQaYrHybVewBIQ2Cdw_qKNENiHoIEiFPp29xs8,952
geney-1.4.40.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
geney-1.4.40.dist-info/top_level.txt,sha256=O-FuNUMb5fn9dhZ-dYCgF0aZtfi1EslMstnzhc5IIVo,6
geney-1.4.40.dist-info/RECORD,,
geney-1.4.40.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
geney