evoseer-utils 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evoseer_utils-0.1.4/PKG-INFO +140 -0
- evoseer_utils-0.1.4/README.md +124 -0
- evoseer_utils-0.1.4/evoseer_utils/__init__.py +5 -0
- evoseer_utils-0.1.4/evoseer_utils/data/schema.sql +55 -0
- evoseer_utils-0.1.4/evoseer_utils/db_connection.py +80 -0
- evoseer_utils-0.1.4/evoseer_utils/mutations.py +406 -0
- evoseer_utils-0.1.4/evoseer_utils/output_description.py +131 -0
- evoseer_utils-0.1.4/evoseer_utils/test_utils.py +269 -0
- evoseer_utils-0.1.4/pyproject.toml +52 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: evoseer-utils
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Shared library for mutation management across modules
|
|
5
|
+
Author: benoît de Witte
|
|
6
|
+
Requires-Python: >=3.9,<4.0
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: pydantic (>=2.0,<3.0)
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# Mutation Library
|
|
17
|
+
|
|
18
|
+
Shared library for mutation management across modules.
|
|
19
|
+
|
|
20
|
+
## Components
|
|
21
|
+
|
|
22
|
+
### `DbConnection` - Singleton DB connection
|
|
23
|
+
```python
|
|
24
|
+
from evoseer_utils.db_connection import DbConnection
|
|
25
|
+
|
|
26
|
+
DbConnection.set_db_path("mutations.db")
|
|
27
|
+
conn = DbConnection.get_connection()
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### `Mutation` - Pydantic model with DB integration
|
|
31
|
+
|
|
32
|
+
#### States
|
|
33
|
+
- `"full"`: Has both id and (chrom, pos, ref, alt)
|
|
34
|
+
- `"miss_id"`: Has coordinates, missing id
|
|
35
|
+
- `"miss_attributes"`: Has id, missing coordinates
|
|
36
|
+
|
|
37
|
+
#### Creation patterns
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
# With coordinates (lazy load id)
|
|
41
|
+
mut = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
|
|
42
|
+
|
|
43
|
+
# With id (lazy load attributes)
|
|
44
|
+
mut = Mutation(id=123)
|
|
45
|
+
|
|
46
|
+
# With both
|
|
47
|
+
mut = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
#### Methods
|
|
51
|
+
|
|
52
|
+
**Instance methods:**
|
|
53
|
+
```python
|
|
54
|
+
mut.fetch_id_from_db() # Get id from coordinates
|
|
55
|
+
mut.fetch_attributes_from_db() # Get coordinates from id
|
|
56
|
+
mut.ensure_in_db() # Create if missing, return id
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
**Class methods (batch):**
|
|
60
|
+
```python
|
|
61
|
+
Mutation.fetch_ids_from_db_batch(mutations)
|
|
62
|
+
Mutation.fetch_attributes_from_db_batch(mutations)
|
|
63
|
+
Mutation.ensure_in_db_batch(mutations)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Usage in modules with OutputDescription (fully automatic)
|
|
67
|
+
|
|
68
|
+
`OutputDescription` is a base class that provides automatic DB insertion for module outputs.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from pydantic import Field
|
|
72
|
+
from typing import ClassVar, List
|
|
73
|
+
from libs import OutputDescription, DbConnection, Mutation
|
|
74
|
+
|
|
75
|
+
class MyModuleOutput(OutputDescription):
|
|
76
|
+
table_name: ClassVar[str] = "tool_mymodule"
|
|
77
|
+
db_fields: ClassVar[List[str]] = ["my_score", "my_prediction"]
|
|
78
|
+
|
|
79
|
+
my_score: float = Field(..., description="Module score")
|
|
80
|
+
my_prediction: str = Field(..., description="Prediction")
|
|
81
|
+
|
|
82
|
+
# Setup
|
|
83
|
+
DbConnection.set_db_path("mutations.db")
|
|
84
|
+
|
|
85
|
+
# Single insertion (automatic table creation + mutation insertion)
|
|
86
|
+
output = MyModuleOutput(
|
|
87
|
+
mutation=Mutation(chrom=17, pos=7577548, ref="C", alt="T"),
|
|
88
|
+
version="1.0.0", # Required field (free text)
|
|
89
|
+
my_score=0.85,
|
|
90
|
+
my_prediction="pathogenic"
|
|
91
|
+
)
|
|
92
|
+
output.insert_to_db() # Creates table if needed, ensures mutation exists, inserts
|
|
93
|
+
|
|
94
|
+
# Batch insertion
|
|
95
|
+
outputs = [...]
|
|
96
|
+
MyModuleOutput.insert_batch_to_db(outputs)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**What happens automatically:**
|
|
100
|
+
- Table creation with correct SQL types (inferred from Python types)
|
|
101
|
+
- Mutation insertion/lookup
|
|
102
|
+
- Index creation on mutation_id
|
|
103
|
+
- `version` field automatically added to table and insertion
|
|
104
|
+
- INSERT OR REPLACE (idempotent)
|
|
105
|
+
|
|
106
|
+
**Note:** `version` field is required in all OutputDescription subclasses. Format is free text.
|
|
107
|
+
|
|
108
|
+
## Chromosome encoding
|
|
109
|
+
|
|
110
|
+
- Autosomes: `1-22`
|
|
111
|
+
- X: `23`
|
|
112
|
+
- Y: `24`
|
|
113
|
+
|
|
114
|
+
Helper functions:
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from evoseer_utils.mutations import chrom_to_int, int_to_chrom
|
|
118
|
+
|
|
119
|
+
chrom_to_int("chr17") # 17
|
|
120
|
+
chrom_to_int("chrX") # 23
|
|
121
|
+
int_to_chrom(23) # "chrX"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Tests
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# From project root
|
|
128
|
+
.venv/bin/python3 libs/tests/test_mutations_lib.py
|
|
129
|
+
|
|
130
|
+
# Or use the test runner
|
|
131
|
+
libs/tests/run_tests.sh
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Examples
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
python3 example_mutations_lib.py
|
|
138
|
+
python3 modules/boostdm/output_description_example.py
|
|
139
|
+
```
|
|
140
|
+
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Mutation Library
|
|
2
|
+
|
|
3
|
+
Shared library for mutation management across modules.
|
|
4
|
+
|
|
5
|
+
## Components
|
|
6
|
+
|
|
7
|
+
### `DbConnection` - Singleton DB connection
|
|
8
|
+
```python
|
|
9
|
+
from evoseer_utils.db_connection import DbConnection
|
|
10
|
+
|
|
11
|
+
DbConnection.set_db_path("mutations.db")
|
|
12
|
+
conn = DbConnection.get_connection()
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### `Mutation` - Pydantic model with DB integration
|
|
16
|
+
|
|
17
|
+
#### States
|
|
18
|
+
- `"full"`: Has both id and (chrom, pos, ref, alt)
|
|
19
|
+
- `"miss_id"`: Has coordinates, missing id
|
|
20
|
+
- `"miss_attributes"`: Has id, missing coordinates
|
|
21
|
+
|
|
22
|
+
#### Creation patterns
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
# With coordinates (lazy load id)
|
|
26
|
+
mut = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
|
|
27
|
+
|
|
28
|
+
# With id (lazy load attributes)
|
|
29
|
+
mut = Mutation(id=123)
|
|
30
|
+
|
|
31
|
+
# With both
|
|
32
|
+
mut = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
#### Methods
|
|
36
|
+
|
|
37
|
+
**Instance methods:**
|
|
38
|
+
```python
|
|
39
|
+
mut.fetch_id_from_db() # Get id from coordinates
|
|
40
|
+
mut.fetch_attributes_from_db() # Get coordinates from id
|
|
41
|
+
mut.ensure_in_db() # Create if missing, return id
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
**Class methods (batch):**
|
|
45
|
+
```python
|
|
46
|
+
Mutation.fetch_ids_from_db_batch(mutations)
|
|
47
|
+
Mutation.fetch_attributes_from_db_batch(mutations)
|
|
48
|
+
Mutation.ensure_in_db_batch(mutations)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Usage in modules with OutputDescription (fully automatic)
|
|
52
|
+
|
|
53
|
+
`OutputDescription` is a base class that provides automatic DB insertion for module outputs.
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from pydantic import Field
|
|
57
|
+
from typing import ClassVar, List
|
|
58
|
+
from libs import OutputDescription, DbConnection, Mutation
|
|
59
|
+
|
|
60
|
+
class MyModuleOutput(OutputDescription):
|
|
61
|
+
table_name: ClassVar[str] = "tool_mymodule"
|
|
62
|
+
db_fields: ClassVar[List[str]] = ["my_score", "my_prediction"]
|
|
63
|
+
|
|
64
|
+
my_score: float = Field(..., description="Module score")
|
|
65
|
+
my_prediction: str = Field(..., description="Prediction")
|
|
66
|
+
|
|
67
|
+
# Setup
|
|
68
|
+
DbConnection.set_db_path("mutations.db")
|
|
69
|
+
|
|
70
|
+
# Single insertion (automatic table creation + mutation insertion)
|
|
71
|
+
output = MyModuleOutput(
|
|
72
|
+
mutation=Mutation(chrom=17, pos=7577548, ref="C", alt="T"),
|
|
73
|
+
version="1.0.0", # Required field (free text)
|
|
74
|
+
my_score=0.85,
|
|
75
|
+
my_prediction="pathogenic"
|
|
76
|
+
)
|
|
77
|
+
output.insert_to_db() # Creates table if needed, ensures mutation exists, inserts
|
|
78
|
+
|
|
79
|
+
# Batch insertion
|
|
80
|
+
outputs = [...]
|
|
81
|
+
MyModuleOutput.insert_batch_to_db(outputs)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**What happens automatically:**
|
|
85
|
+
- Table creation with correct SQL types (inferred from Python types)
|
|
86
|
+
- Mutation insertion/lookup
|
|
87
|
+
- Index creation on mutation_id
|
|
88
|
+
- `version` field automatically added to table and insertion
|
|
89
|
+
- INSERT OR REPLACE (idempotent)
|
|
90
|
+
|
|
91
|
+
**Note:** `version` field is required in all OutputDescription subclasses. Format is free text.
|
|
92
|
+
|
|
93
|
+
## Chromosome encoding
|
|
94
|
+
|
|
95
|
+
- Autosomes: `1-22`
|
|
96
|
+
- X: `23`
|
|
97
|
+
- Y: `24`
|
|
98
|
+
|
|
99
|
+
Helper functions:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from evoseer_utils.mutations import chrom_to_int, int_to_chrom
|
|
103
|
+
|
|
104
|
+
chrom_to_int("chr17") # 17
|
|
105
|
+
chrom_to_int("chrX") # 23
|
|
106
|
+
int_to_chrom(23) # "chrX"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Tests
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# From project root
|
|
113
|
+
.venv/bin/python3 libs/tests/test_mutations_lib.py
|
|
114
|
+
|
|
115
|
+
# Or use the test runner
|
|
116
|
+
libs/tests/run_tests.sh
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Examples
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
python3 example_mutations_lib.py
|
|
123
|
+
python3 modules/boostdm/output_description_example.py
|
|
124
|
+
```
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
-- Schéma SQLite pour annotations de mutations génomiques
|
|
2
|
+
-- Simple, modulaire, optimisé pour ML
|
|
3
|
+
-- Convention chromosomes: 1-22 (autosomes), 23=X, 24=Y
|
|
4
|
+
|
|
5
|
+
-- Central table of mutations.
-- One row per distinct (chrom, pos, ref, alt) combination.
CREATE TABLE mutations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    chrom INTEGER NOT NULL, -- 1-22, 23=X, 24=Y
    pos INTEGER NOT NULL,
    ref TEXT NOT NULL,
    alt TEXT NOT NULL,
    UNIQUE(chrom, pos, ref, alt)
);
CREATE INDEX idx_mutations_location ON mutations(chrom, pos);
|
|
15
|
+
|
|
16
|
+
-- Gene table (from the canonical GTF)
CREATE TABLE genes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    gene_id TEXT NOT NULL UNIQUE, -- ENSG...
    gene_name TEXT, -- Symbol (e.g. TP53)
    chrom INTEGER NOT NULL, -- 1-22, 23=X, 24=Y
    start INTEGER NOT NULL, -- Gene start
    end INTEGER NOT NULL, -- Gene end
    strand TEXT NOT NULL, -- + or -
    tss INTEGER NOT NULL -- Transcription Start Site (used for the promoter)
);
CREATE INDEX idx_genes_location ON genes(chrom, start, end);
CREATE INDEX idx_genes_name ON genes(gene_name);
|
|
29
|
+
|
|
30
|
+
-- Genomic feature table (from the GTF)
-- Stores the permanent coordinates of exons, introns, UTRs and promoters
CREATE TABLE genomic_features (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    gene_id INTEGER NOT NULL,
    annotation_type TEXT NOT NULL, -- 'exon', 'intron', 'UTR5', 'UTR3', 'promoter'
    feature_start INTEGER NOT NULL,
    feature_end INTEGER NOT NULL,
    transcript_id TEXT, -- ID of the canonical transcript
    FOREIGN KEY (gene_id) REFERENCES genes(id) ON DELETE CASCADE
);
CREATE INDEX idx_genomic_features_gene ON genomic_features(gene_id);
CREATE INDEX idx_genomic_features_location ON genomic_features(feature_start, feature_end);
|
|
43
|
+
|
|
44
|
+
-- Link table: mutation <-> genomic_feature
-- A mutation can hit several features (e.g. an exon of 2 overlapping genes)
CREATE TABLE mutation_annotations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    mutation_id INTEGER NOT NULL,
    feature_id INTEGER, -- NULL if intergenic
    FOREIGN KEY (mutation_id) REFERENCES mutations(id) ON DELETE CASCADE,
    FOREIGN KEY (feature_id) REFERENCES genomic_features(id) ON DELETE CASCADE,
    UNIQUE(mutation_id, feature_id) -- Avoid duplicates
);
CREATE INDEX idx_mutation_annotations_mutation ON mutation_annotations(mutation_id);
CREATE INDEX idx_mutation_annotations_feature ON mutation_annotations(feature_id);
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gestion de la connexion à la base de données SQLite (singleton)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DbConnection:
    """Class-level holder for a single shared SQLite connection.

    The database path and the open connection are stored as class
    attributes, so every caller shares the same connection.

    Usage:
        DbConnection.set_db_path("mutations.db")
        conn = DbConnection.get_connection()
    """

    _instance: Optional["DbConnection"] = None
    _connection: Optional[sqlite3.Connection] = None
    _db_path: Optional[str] = None

    def __new__(cls):
        # Hand back the cached instance, creating it on first use.
        instance = cls._instance
        if instance is None:
            instance = super().__new__(cls)
            cls._instance = instance
        return instance

    @classmethod
    def set_db_path(cls, db_path: str) -> None:
        """Set the path of the SQLite database file.

        If the path differs from the one currently configured, any open
        connection is closed so the next ``get_connection()`` reopens on
        the new file.

        Args:
            db_path: Path to the SQLite file.
        """
        path_changed = cls._db_path != db_path
        if path_changed:
            # Drop the connection that belongs to the previous path.
            cls.close()
        cls._db_path = db_path

    @classmethod
    def get_connection(cls) -> sqlite3.Connection:
        """Return the shared SQLite connection, opening it if needed.

        Returns:
            The SQLite connection, configured with ``sqlite3.Row`` as row
            factory so columns can be read by name.

        Raises:
            RuntimeError: If no database path has been configured.
            FileNotFoundError: If the configured file does not exist.
        """
        configured_path = cls._db_path
        if configured_path is None:
            raise RuntimeError("Database path not set. Call DbConnection.set_db_path() first.")

        if not Path(configured_path).exists():
            raise FileNotFoundError(
                f"Database file not found: {configured_path}. Run init_database.py first."
            )

        current = cls._connection
        if current is not None:
            return current

        opened = sqlite3.connect(configured_path)
        opened.row_factory = sqlite3.Row  # access columns by name
        cls._connection = opened
        return opened

    @classmethod
    def close(cls) -> None:
        """Close the shared connection, if one is open."""
        if cls._connection is None:
            return
        cls._connection.close()
        cls._connection = None

    @classmethod
    def is_configured(cls) -> bool:
        """Return True once a database path has been set."""
        return cls._db_path is not None
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Modèle Pydantic pour les mutations génomiques
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Literal, Optional
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, field_validator, model_validator
|
|
8
|
+
|
|
9
|
+
from .db_connection import DbConnection
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def chrom_to_int(chrom_str: str) -> Optional[int]:
    """Convert a chromosome label to its integer code.

    Accepts labels with or without a ``chr`` prefix, in any letter case
    (``"chr17"``, ``"17"``, ``"chrX"``, ``"x"``), as well as plain integers.

    Args:
        chrom_str: Chromosome label; it is converted with ``str()`` first.

    Returns:
        1-22 for autosomes, 23 for X, 24 for Y. ``None`` for the
        mitochondrial chromosome (``M``/``MT``) and for any label that is
        not an integer after the prefix is removed.
    """
    label = str(chrom_str).strip().upper()
    # The label is already upper-cased here, so the prefix to drop is "CHR";
    # looking for the lower-case "chr" at this point would never match.
    label = label.removeprefix("CHR")

    if label == "X":
        return 23
    if label == "Y":
        return 24
    if label in ("M", "MT"):
        return None

    try:
        return int(label)
    except ValueError:
        return None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def int_to_chrom(chrom_int: int) -> str:
    """Convert an integer chromosome code to a ``chr``-prefixed label.

    Args:
        chrom_int: Chromosome code; 23 stands for X and 24 for Y.

    Returns:
        ``"chrX"`` for 23, ``"chrY"`` for 24, otherwise ``"chr<code>"``.
    """
    # Only the two sex chromosomes need a letter; everything else keeps
    # its number as the suffix.
    suffix = "X" if chrom_int == 23 else "Y" if chrom_int == 24 else chrom_int
    return f"chr{suffix}"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Mutation(BaseModel):
    """Pydantic model of a genomic mutation stored in the ``mutations`` table.

    Attributes:
        id: Row id of the mutation in the database (auto-generated on insert).
        chrom: Chromosome code (1-22, 23=X, 24=Y).
        pos: Genomic position.
        ref: Reference allele.
        alt: Alternate allele.

    Validation accepts:
        - an ``id`` alone,
        - a complete (chrom, pos, ref, alt) set alone,
        - or both together.

    Usage:
        # With the id only (attributes loaded on demand)
        mutation = Mutation(id=123)
        mutation.fetch_attributes_from_db()

        # With coordinates (id loaded on demand)
        mutation = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
        mutation.fetch_id_from_db()

        # With everything
        mutation = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
    """

    id: Optional[int] = None
    chrom: Optional[int] = None
    pos: Optional[int] = None
    ref: Optional[str] = None
    alt: Optional[str] = None

    @field_validator("chrom", mode="before")
    @classmethod
    def normalize_chrom(cls, v):
        """Normalize the raw ``chrom`` input through ``chrom_to_int``."""
        return chrom_to_int(v)

    @model_validator(mode="after")
    def validate_mutation(self):
        """Check that the model has an id, complete coordinates, or both.

        Raises:
            ValueError: If neither an id nor complete coordinates are given,
                or if only some of the coordinate fields are given.
        """
        coord_fields = [self.chrom, self.pos, self.ref, self.alt]
        has_id = self.id is not None
        has_coords = all(f is not None for f in coord_fields)

        if not has_id and not has_coords:
            raise ValueError("Must provide either 'id' or all of (chrom, pos, ref, alt)")

        # Coordinates are all-or-nothing: reject a partially filled set.
        partial_coords = any(f is not None for f in coord_fields)
        if partial_coords and not has_coords:
            raise ValueError("If providing coordinates, must provide all of (chrom, pos, ref, alt)")

        return self

    @property
    def state(self) -> Literal["full", "miss_id", "miss_attributes"]:
        """Describe which parts of the mutation are currently populated.

        Returns:
            - "full": id and coordinates are present
            - "miss_id": coordinates are present, id is missing
            - "miss_attributes": coordinates are not complete
        """
        has_id = self.id is not None
        has_coords = all(
            f is not None for f in (self.chrom, self.pos, self.ref, self.alt)
        )

        if has_id and has_coords:
            return "full"
        if has_coords:
            return "miss_id"
        return "miss_attributes"

    def fetch_id_from_db(self) -> Optional[int]:
        """Look up the mutation id in the DB by (chrom, pos, ref, alt).

        Updates ``self.id`` when a row is found.

        Returns:
            The mutation id, or None if no matching row exists.

        Raises:
            ValueError: If the coordinates are not complete.
        """
        if self.state == "miss_attributes":
            raise ValueError("Cannot fetch ID: coordinates (chrom, pos, ref, alt) are required")

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            SELECT id FROM mutations
            WHERE chrom=? AND pos=? AND ref=? AND alt=?
            """,
            (self.chrom, self.pos, self.ref, self.alt),
        )

        result = cursor.fetchone()
        if result:
            self.id = result["id"]
            return self.id

        return None

    def fetch_attributes_from_db(self) -> bool:
        """Load (chrom, pos, ref, alt) from the DB using the id.

        Updates the four coordinate fields when a row is found.

        Returns:
            True if a row was found, False otherwise.

        Raises:
            ValueError: If the id is not set.
        """
        if self.id is None:
            raise ValueError("Cannot fetch attributes: id is required")

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            SELECT chrom, pos, ref, alt FROM mutations
            WHERE id=?
            """,
            (self.id,),
        )

        result = cursor.fetchone()
        if result:
            self.chrom = result["chrom"]
            self.pos = result["pos"]
            self.ref = result["ref"]
            self.alt = result["alt"]
            return True

        return False

    def ensure_in_db(self, annotate: bool = True) -> int:
        """Make sure the mutation exists in the DB, creating it if needed.

        Updates ``self.id``.

        Args:
            annotate: If True, a newly created mutation is annotated with
                its overlapping genomic features.

        Returns:
            The mutation id.

        Raises:
            ValueError: If the coordinates are not complete.
        """
        if self.state == "miss_attributes":
            raise ValueError("Cannot ensure in DB: coordinates (chrom, pos, ref, alt) are required")

        # If an id is already set, check that it exists in the table.
        if self.id is not None:
            conn = DbConnection.get_connection()
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM mutations WHERE id=?", (self.id,))
            if cursor.fetchone():
                return self.id

        # Otherwise look the mutation up by its coordinates.
        existing_id = self.fetch_id_from_db()
        if existing_id is not None:
            return existing_id

        # Not found: create the row.
        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO mutations (chrom, pos, ref, alt)
            VALUES (?, ?, ?, ?)
            """,
            (self.chrom, self.pos, self.ref, self.alt),
        )

        self.id = cursor.lastrowid

        if annotate:
            self._annotate_mutation()

        conn.commit()
        return self.id

    def _annotate_mutation(self) -> None:
        """Link the mutation to the genomic features that overlap it.

        Internal helper called by ``ensure_in_db`` and
        ``ensure_in_db_batch``. Does nothing when id, chrom or pos is
        missing. Does not commit; the caller commits.
        """
        if self.id is None or self.chrom is None or self.pos is None:
            return

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        # Remove any previous annotations for this mutation.
        cursor.execute("DELETE FROM mutation_annotations WHERE mutation_id = ?", (self.id,))

        # Find the features whose interval contains the position.
        cursor.execute(
            """
            SELECT gf.id
            FROM genomic_features gf
            JOIN genes g ON gf.gene_id = g.id
            WHERE g.chrom = ? AND gf.feature_start <= ? AND gf.feature_end >= ?
            """,
            (self.chrom, self.pos, self.pos),
        )

        feature_ids = [row["id"] for row in cursor.fetchall()]

        if feature_ids:
            cursor.executemany(
                """
                INSERT INTO mutation_annotations (mutation_id, feature_id)
                VALUES (?, ?)
                """,
                [(self.id, feature_id) for feature_id in feature_ids],
            )
        else:
            # Intergenic: record a single row with a NULL feature.
            cursor.execute(
                """
                INSERT INTO mutation_annotations (mutation_id, feature_id)
                VALUES (?, NULL)
                """,
                (self.id,),
            )

    @classmethod
    def fetch_ids_from_db_batch(cls, mutations: list["Mutation"]) -> None:
        """Look up the ids of a batch of mutations (modifies them in place).

        Args:
            mutations: Mutations that must all have chrom, pos, ref and alt.

        Raises:
            ValueError: If a mutation does not have complete coordinates.
        """
        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        for mutation in mutations:
            if mutation.state == "miss_attributes":
                raise ValueError(f"Mutation {mutation} missing coordinates")

            cursor.execute(
                """
                SELECT id FROM mutations
                WHERE chrom=? AND pos=? AND ref=? AND alt=?
                """,
                (mutation.chrom, mutation.pos, mutation.ref, mutation.alt),
            )

            result = cursor.fetchone()
            if result:
                mutation.id = result["id"]

    @classmethod
    def fetch_attributes_from_db_batch(cls, mutations: list["Mutation"]) -> None:
        """Load the coordinates of a batch of mutations (modifies them in place).

        Args:
            mutations: Mutations that must all have an id.

        Raises:
            ValueError: If a mutation has no id.
        """
        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        for mutation in mutations:
            if mutation.id is None:
                raise ValueError(f"Mutation {mutation} missing id")

            cursor.execute(
                """
                SELECT chrom, pos, ref, alt FROM mutations
                WHERE id=?
                """,
                (mutation.id,),
            )

            result = cursor.fetchone()
            if result:
                mutation.chrom = result["chrom"]
                mutation.pos = result["pos"]
                mutation.ref = result["ref"]
                mutation.alt = result["alt"]

    @classmethod
    def ensure_in_db_batch(cls, mutations: list["Mutation"], annotate: bool = True) -> None:
        """Make sure every mutation of the batch exists in the DB.

        Modifies the mutations in place to set their ids. Several objects
        that carry the same new coordinates share a single inserted row.

        Args:
            mutations: Mutations that must all have chrom, pos, ref and alt.
            annotate: If True, newly created mutations are annotated with
                their overlapping genomic features.

        Raises:
            ValueError: If a mutation does not have complete coordinates.
        """
        # First pick up the ids of the mutations that already exist.
        cls.fetch_ids_from_db_batch(mutations)

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        # Ids of rows created during this call, keyed by coordinates. The
        # lookup above ran before any insert, so without this map a second
        # object with the same coordinates would be inserted again and hit
        # the UNIQUE(chrom, pos, ref, alt) constraint.
        created_ids: dict[tuple, int] = {}

        for mutation in mutations:
            if mutation.id is not None:
                continue

            key = (mutation.chrom, mutation.pos, mutation.ref, mutation.alt)
            already_created = created_ids.get(key)
            if already_created is not None:
                mutation.id = already_created
                continue

            cursor.execute(
                """
                INSERT INTO mutations (chrom, pos, ref, alt)
                VALUES (?, ?, ?, ?)
                """,
                key,
            )

            mutation.id = cursor.lastrowid
            created_ids[key] = mutation.id

            if annotate:
                mutation._annotate_mutation()

        conn.commit()

    def __repr__(self) -> str:
        # Evaluate the state once; it decides which fields can be shown.
        current_state = self.state
        if current_state == "full":
            chrom_str = int_to_chrom(self.chrom)
            return f"Mutation(id={self.id}, {chrom_str}:{self.pos} {self.ref}>{self.alt})"
        if current_state == "miss_id":
            chrom_str = int_to_chrom(self.chrom)
            return f"Mutation({chrom_str}:{self.pos} {self.ref}>{self.alt}, id=?)"
        return f"Mutation(id={self.id}, coords=?)"
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from typing import ClassVar, Optional, get_type_hints
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
from evoseer_utils.db_connection import DbConnection
|
|
6
|
+
from evoseer_utils.mutations import Mutation
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OutputDescription(BaseModel):
|
|
10
|
+
mutation: Mutation
|
|
11
|
+
version: ClassVar[str]
|
|
12
|
+
|
|
13
|
+
_table_name: ClassVar[str]
|
|
14
|
+
db_fields: ClassVar[list[str]]
|
|
15
|
+
|
|
16
|
+
@property
|
|
17
|
+
def table_name(self) -> str:
|
|
18
|
+
# we do that because it allows automatic views based on "annot_" prefix
|
|
19
|
+
return "annot_" + self._table_name
|
|
20
|
+
|
|
21
|
+
@classmethod
|
|
22
|
+
def _get_all_db_fields(cls) -> list[str]:
|
|
23
|
+
# Always include version automatically
|
|
24
|
+
return ["version"] + cls.db_fields
|
|
25
|
+
|
|
26
|
+
@classmethod
|
|
27
|
+
def _python_type_to_sql(cls, python_type: type) -> str:
|
|
28
|
+
type_map = {
|
|
29
|
+
int: "INTEGER",
|
|
30
|
+
float: "REAL",
|
|
31
|
+
str: "TEXT",
|
|
32
|
+
bool: "INTEGER",
|
|
33
|
+
}
|
|
34
|
+
# Handle Optional types
|
|
35
|
+
origin = getattr(python_type, "__origin__", None)
|
|
36
|
+
if origin is type(None) or str(python_type).startswith("typing.Union"):
|
|
37
|
+
args = getattr(python_type, "__args__", ())
|
|
38
|
+
if args:
|
|
39
|
+
python_type = args[0] if args[0] is not type(None) else args[1]
|
|
40
|
+
|
|
41
|
+
return type_map.get(python_type, "TEXT")
|
|
42
|
+
|
|
43
|
+
@classmethod
def _ensure_table_exists(cls, table_name: str) -> None:
    """Create the output table and its ``mutation_id`` index if missing.

    The table gets an autoincrement ``id``, a unique ``mutation_id``
    referencing ``mutations(id)``, and one column per name returned by
    ``_get_all_db_fields()``. Each column's SQL type is derived from the
    class type hints; a field without a hint is treated as ``str``.

    Args:
        table_name: Name of the table to create.
    """
    connection = DbConnection.get_connection()
    cur = connection.cursor()

    hints = get_type_hints(cls)

    column_defs = [
        "id INTEGER PRIMARY KEY AUTOINCREMENT",
        "mutation_id INTEGER NOT NULL UNIQUE",
    ]
    column_defs.extend(
        f"{name} {cls._python_type_to_sql(hints.get(name, str))}"
        for name in cls._get_all_db_fields()
    )
    # Table constraints come after all column definitions.
    column_defs.append("FOREIGN KEY (mutation_id) REFERENCES mutations(id) ON DELETE CASCADE")

    create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            {', '.join(column_defs)}
        )
    """
    create_index_sql = f"""
        CREATE INDEX IF NOT EXISTS idx_{table_name}_mutation
        ON {table_name}(mutation_id)
    """

    cur.execute(create_table_sql)
    cur.execute(create_index_sql)

    connection.commit()
|
|
70
|
+
|
|
71
|
+
def insert_to_db(self, table_name: Optional[str] = None) -> None:
    """Insert (or replace) this output's row in its annotation table.

    The table is created if needed and the linked mutation is ensured to be
    in the database before the row is written. The write is committed
    before returning.

    Args:
        table_name: Target table. Defaults to ``self.table_name``.
    """
    target_table = self.table_name if table_name is None else table_name

    self._ensure_table_exists(target_table)
    self.mutation.ensure_in_db()

    connection = DbConnection.get_connection()
    db_cursor = connection.cursor()

    data_fields = self._get_all_db_fields()
    column_names = ["mutation_id", *data_fields]
    row = [self.mutation.id]
    row.extend(getattr(self, name) for name in data_fields)

    # One "?" per value; only identifiers are interpolated into the SQL text.
    placeholder_sql = ", ".join("?" for _ in row)
    statement = f"""
        INSERT OR REPLACE INTO {target_table}
        ({', '.join(column_names)})
        VALUES ({placeholder_sql})
    """

    db_cursor.execute(statement, row)
    connection.commit()
|
|
97
|
+
|
|
98
|
+
@classmethod
def insert_batch_to_db(
    cls, outputs: list["OutputDescription"], table_name: Optional[str] = None
) -> None:
    """Insert (or replace) many outputs in one ``executemany`` call.

    The table is created if needed and all linked mutations are ensured to be
    in the database via ``Mutation.ensure_in_db_batch`` before the rows are
    written. The write is committed before returning.

    Args:
        outputs: Outputs to persist; each must expose ``mutation`` and every
            field returned by ``_get_all_db_fields()``.
        table_name: Target table. Defaults to ``cls.table_name``.
    """
    # NOTE(review): the default is read from the class here, whereas
    # insert_to_db reads it from an instance -- confirm that `table_name`
    # resolves to a string at class level.
    target_table = cls.table_name if table_name is None else table_name

    cls._ensure_table_exists(target_table)
    Mutation.ensure_in_db_batch([item.mutation for item in outputs])

    connection = DbConnection.get_connection()
    db_cursor = connection.cursor()

    data_fields = cls._get_all_db_fields()
    column_names = ["mutation_id", *data_fields]
    placeholder_sql = ", ".join("?" for _ in column_names)

    # One parameter row per output, in the same column order as `column_names`.
    rows = [
        [item.mutation.id] + [getattr(item, name) for name in data_fields]
        for item in outputs
    ]

    statement = f"""
        INSERT OR REPLACE INTO {target_table}
        ({', '.join(column_names)})
        VALUES ({placeholder_sql})
    """

    db_cursor.executemany(statement, rows)
    connection.commit()
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test utilities for database testing
|
|
3
|
+
|
|
4
|
+
Provides helpers for creating and managing test databases for unit tests.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sqlite3
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from evoseer_utils.db_connection import DbConnection
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DatabaseFixture:
    """
    Context manager for creating and managing test databases.

    On entry a temporary SQLite file is created and the schema is executed
    against it. On exit the fixture's own connection and the DbConnection
    singleton are closed and the temporary file is removed.

    Usage:
        with DatabaseFixture() as db:
            # Use db.path for database operations
            DbConnection.set_db_path(db.path)
            # Run tests...
        # Database is automatically cleaned up
    """

    def __init__(self, schema_file: Optional[str] = None):
        """
        Initialize test database context manager.

        Args:
            schema_file: Optional path to schema.sql file. If not provided,
                         the ``data/schema.sql`` file located next to this
                         module is used.
        """
        self.schema_file = schema_file
        self.temp_file = None
        self.path = None
        self._conn = None

    def __enter__(self):
        """Create temporary database and initialize schema"""
        # Only the file name is needed; sqlite3 opens the file itself later.
        self.temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".db", delete=False)
        self.path = self.temp_file.name
        self.temp_file.close()

        if self.schema_file is None:
            # Use default schema from data directory
            self.schema_file = str(Path(__file__).parent / "data" / "schema.sql")

        try:
            self._init_schema()
        except BaseException:
            # __exit__ is not run when __enter__ raises, so the temporary
            # file has to be removed here to avoid leaking it.
            self._remove_db_file()
            raise

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Clean up temporary database"""
        try:
            # Close any active connections
            if self._conn is not None:
                self._conn.close()
                # Drop the closed handle so get_connection() never returns it.
                self._conn = None

            # Close DbConnection singleton
            DbConnection.close()
        finally:
            # Remove the temporary file even if closing a connection failed.
            self._remove_db_file()

    def _remove_db_file(self):
        """Delete the temporary database file if it exists."""
        if self.path and Path(self.path).exists():
            Path(self.path).unlink()

    def _init_schema(self):
        """Initialize database with schema"""
        # Read the schema first so a missing file fails before a connection is opened.
        with open(self.schema_file, encoding="utf-8") as f:
            schema = f.read()

        conn = sqlite3.connect(self.path)
        try:
            conn.executescript(schema)
            conn.commit()
        finally:
            # Always release the handle, including when the script is invalid.
            conn.close()

    def get_connection(self) -> sqlite3.Connection:
        """
        Get a connection to the test database.

        The connection is created on first use, configured with
        ``sqlite3.Row`` as row factory, and reused by later calls.

        Returns:
            sqlite3.Connection instance
        """
        if self._conn is None:
            self._conn = sqlite3.connect(self.path)
            self._conn.row_factory = sqlite3.Row

        return self._conn

    def add_test_gene(
        self,
        gene_id: str,
        gene_name: str,
        chrom: int,
        start: int,
        end: int,
        strand: str = "+",
        tss: Optional[int] = None,
    ) -> int:
        """
        Add a test gene to the database.

        Args:
            gene_id: Ensembl gene ID (e.g., ENSG00000141510)
            gene_name: Gene symbol (e.g., TP53)
            chrom: Chromosome (1-24)
            start: Gene start position
            end: Gene end position
            strand: Strand (+ or -)
            tss: Transcription start site (defaults to start)

        Returns:
            Database ID of inserted gene
        """
        if tss is None:
            tss = start

        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO genes (gene_id, gene_name, chrom, start, end, strand, tss)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (gene_id, gene_name, chrom, start, end, strand, tss),
        )

        conn.commit()
        return cursor.lastrowid

    def add_test_feature(
        self,
        gene_id: int,
        annotation_type: str,
        feature_start: int,
        feature_end: int,
        transcript_id: Optional[str] = None,
    ) -> int:
        """
        Add a test genomic feature to the database.

        Args:
            gene_id: Database ID of the gene
            annotation_type: Type (exon, intron, UTR5, UTR3, promoter)
            feature_start: Feature start position
            feature_end: Feature end position
            transcript_id: Optional transcript ID

        Returns:
            Database ID of inserted feature
        """
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO genomic_features
            (gene_id, annotation_type, feature_start, feature_end, transcript_id)
            VALUES (?, ?, ?, ?, ?)
            """,
            (gene_id, annotation_type, feature_start, feature_end, transcript_id),
        )

        conn.commit()
        return cursor.lastrowid

    def add_test_mutation(self, chrom: int, pos: int, ref: str, alt: str) -> int:
        """
        Add a test mutation to the database.

        Args:
            chrom: Chromosome (1-24)
            pos: Genomic position
            ref: Reference allele
            alt: Alternate allele

        Returns:
            Database ID of inserted mutation
        """
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO mutations (chrom, pos, ref, alt)
            VALUES (?, ?, ?, ?)
            """,
            (chrom, pos, ref, alt),
        )

        conn.commit()
        return cursor.lastrowid
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class DatabaseFixtureWithData(DatabaseFixture):
    """DatabaseFixture that pre-populates with sample data"""

    def __enter__(self):
        """Create temporary database, initialize schema, and add sample data"""
        super().__enter__()

        try:
            self._add_sample_data()
        except BaseException as exc:
            # __exit__ is not run when __enter__ raises, so run the normal
            # cleanup here to avoid leaking the connection and temporary file.
            self.__exit__(type(exc), exc, exc.__traceback__)
            raise

        return self

    def _add_sample_data(self):
        """Insert the sample genes (TP53, KRAS) and one exon for each."""
        # Add sample TP53 gene
        tp53_id = self.add_test_gene(
            gene_id="ENSG00000141510",
            gene_name="TP53",
            chrom=17,
            start=7661779,
            end=7687550,
            strand="-",
            tss=7687550,
        )

        # Add sample exon with canonical transcript
        # NOTE(review): this exon range (7577100-7577600) lies outside the gene
        # span given above (7661779-7687550) -- confirm whether that is intended.
        self.add_test_feature(
            gene_id=tp53_id,
            annotation_type="exon",
            feature_start=7577100,
            feature_end=7577600,
            transcript_id="ENST00000269305",
        )

        # Add sample KRAS gene
        kras_id = self.add_test_gene(
            gene_id="ENSG00000133703",
            gene_name="KRAS",
            chrom=12,
            start=25205246,
            end=25250936,
            strand="-",
            tss=25250936,
        )

        # Add sample exon
        self.add_test_feature(
            gene_id=kras_id,
            annotation_type="exon",
            feature_start=25227000,
            feature_end=25227500,
            transcript_id="ENST00000311936",
        )
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def create_test_db(with_sample_data: bool = False) -> DatabaseFixture:
    """
    Build a test-database context manager.

    Args:
        with_sample_data: When truthy, return a DatabaseFixtureWithData,
            otherwise a plain DatabaseFixture.

    Returns:
        A new fixture instance, not yet entered; use it in a ``with`` block.

    Usage:
        with create_test_db(with_sample_data=True) as db:
            DbConnection.set_db_path(db.path)
            # Run tests...
    """
    fixture_cls = DatabaseFixtureWithData if with_sample_data else DatabaseFixture
    return fixture_cls()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "evoseer-utils"
|
|
3
|
+
version = "0.1.4"
|
|
4
|
+
description = "Shared library for mutation management across modules"
|
|
5
|
+
authors = ["benoît de Witte"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
packages = [{include = "evoseer_utils"}]
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.9"
|
|
11
|
+
pydantic = "^2.0"
|
|
12
|
+
|
|
13
|
+
[tool.poetry.group.dev.dependencies]
|
|
14
|
+
pytest = "^8.0"
|
|
15
|
+
ruff = "^0.8"
|
|
16
|
+
pre-commit = "^4.0"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["poetry-core"]
|
|
20
|
+
build-backend = "poetry.core.masonry.api"
|
|
21
|
+
|
|
22
|
+
[tool.pytest.ini_options]
|
|
23
|
+
testpaths = ["tests"]
|
|
24
|
+
python_files = ["test_*.py"]
|
|
25
|
+
python_classes = ["Test*"]
|
|
26
|
+
python_functions = ["test_*"]
|
|
27
|
+
addopts = [
|
|
28
|
+
"-v",
|
|
29
|
+
"--strict-markers",
|
|
30
|
+
"--tb=short",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.ruff]
|
|
34
|
+
line-length = 120
|
|
35
|
+
target-version = "py39"
|
|
36
|
+
|
|
37
|
+
[tool.ruff.lint]
|
|
38
|
+
select = [
|
|
39
|
+
"E", # pycodestyle errors
|
|
40
|
+
"W", # pycodestyle warnings
|
|
41
|
+
"F", # pyflakes
|
|
42
|
+
"I", # isort
|
|
43
|
+
"N", # pep8-naming
|
|
44
|
+
"UP", # pyupgrade
|
|
45
|
+
"B", # flake8-bugbear
|
|
46
|
+
"C4", # flake8-comprehensions
|
|
47
|
+
]
|
|
48
|
+
ignore = []
|
|
49
|
+
|
|
50
|
+
[tool.ruff.lint.per-file-ignores]
|
|
51
|
+
"__init__.py" = ["F401"] # Allow unused imports in __init__.py
|
|
52
|
+
"tests/*" = ["D"] # Disable docstring requirements in tests
|