evoseer-utils 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ Metadata-Version: 2.1
2
+ Name: evoseer-utils
3
+ Version: 0.1.4
4
+ Summary: Shared library for mutation management across modules
5
+ Author: benoît de Witte
6
+ Requires-Python: >=3.9,<4.0
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.9
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: pydantic (>=2.0,<3.0)
14
+ Description-Content-Type: text/markdown
15
+
16
+ # Mutation Library
17
+
18
+ Shared library for mutation management across modules.
19
+
20
+ ## Components
21
+
22
+ ### `DbConnection` - Singleton DB connection
23
+ ```python
24
+ from evoseer_utils import DbConnection
25
+
26
+ DbConnection.set_db_path("mutations.db")
27
+ conn = DbConnection.get_connection()
28
+ ```
29
+
30
+ ### `Mutation` - Pydantic model with DB integration
31
+
32
+ #### States
33
+ - `"full"`: Has both id and (chrom, pos, ref, alt)
34
+ - `"miss_id"`: Has coordinates, missing id
35
+ - `"miss_attributes"`: Has id, missing coordinates
36
+
37
+ #### Creation patterns
38
+
39
+ ```python
40
+ # With coordinates (lazy load id)
41
+ mut = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
42
+
43
+ # With id (lazy load attributes)
44
+ mut = Mutation(id=123)
45
+
46
+ # With both
47
+ mut = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
48
+ ```
49
+
50
+ #### Methods
51
+
52
+ **Instance methods:**
53
+ ```python
54
+ mut.fetch_id_from_db() # Get id from coordinates
55
+ mut.fetch_attributes_from_db() # Get coordinates from id
56
+ mut.ensure_in_db() # Create if missing, return id
57
+ ```
58
+
59
+ **Class methods (batch):**
60
+ ```python
61
+ Mutation.fetch_ids_from_db_batch(mutations)
62
+ Mutation.fetch_attributes_from_db_batch(mutations)
63
+ Mutation.ensure_in_db_batch(mutations)
64
+ ```
65
+
66
+ ## Usage in modules with OutputDescription (fully automatic)
67
+
68
+ `OutputDescription` is a base class that provides automatic DB insertion for module outputs.
69
+
70
+ ```python
71
+ from pydantic import Field
72
+ from typing import ClassVar, List
73
+ from evoseer_utils import OutputDescription, DbConnection, Mutation
74
+
75
+ class MyModuleOutput(OutputDescription):
76
+ table_name: ClassVar[str] = "tool_mymodule"
77
+ db_fields: ClassVar[List[str]] = ["my_score", "my_prediction"]
78
+
79
+ my_score: float = Field(..., description="Module score")
80
+ my_prediction: str = Field(..., description="Prediction")
81
+
82
+ # Setup
83
+ DbConnection.set_db_path("mutations.db")
84
+
85
+ # Single insertion (automatic table creation + mutation insertion)
86
+ output = MyModuleOutput(
87
+ mutation=Mutation(chrom=17, pos=7577548, ref="C", alt="T"),
88
+ version="1.0.0", # Required field (free text)
89
+ my_score=0.85,
90
+ my_prediction="pathogenic"
91
+ )
92
+ output.insert_to_db() # Creates table if needed, ensures mutation exists, inserts
93
+
94
+ # Batch insertion
95
+ outputs = [...]
96
+ MyModuleOutput.insert_batch_to_db(outputs)
97
+ ```
98
+
99
+ **What happens automatically:**
100
+ - Table creation with correct SQL types (inferred from Python types)
101
+ - Mutation insertion/lookup
102
+ - Index creation on mutation_id
103
+ - `version` field automatically added to table and insertion
104
+ - INSERT OR REPLACE (idempotent)
105
+
106
+ **Note:** `version` field is required in all OutputDescription subclasses. Format is free text.
107
+
108
+ ## Chromosome encoding
109
+
110
+ - Autosomes: `1-22`
111
+ - X: `23`
112
+ - Y: `24`
113
+
114
+ Helper functions:
115
+
116
+ ```python
117
+ from evoseer_utils.mutations import chrom_to_int, int_to_chrom
118
+
119
+ chrom_to_int("chr17") # 17
120
+ chrom_to_int("chrX") # 23
121
+ int_to_chrom(23) # "chrX"
122
+ ```
123
+
124
+ ## Tests
125
+
126
+ ```bash
127
+ # From project root
128
+ .venv/bin/python3 libs/tests/test_mutations_lib.py
129
+
130
+ # Or use the test runner
131
+ libs/tests/run_tests.sh
132
+ ```
133
+
134
+ ## Examples
135
+
136
+ ```bash
137
+ python3 example_mutations_lib.py
138
+ python3 modules/boostdm/output_description_example.py
139
+ ```
140
+
@@ -0,0 +1,124 @@
1
+ # Mutation Library
2
+
3
+ Shared library for mutation management across modules.
4
+
5
+ ## Components
6
+
7
+ ### `DbConnection` - Singleton DB connection
8
+ ```python
9
+ from evoseer_utils import DbConnection
10
+
11
+ DbConnection.set_db_path("mutations.db")
12
+ conn = DbConnection.get_connection()
13
+ ```
14
+
15
+ ### `Mutation` - Pydantic model with DB integration
16
+
17
+ #### States
18
+ - `"full"`: Has both id and (chrom, pos, ref, alt)
19
+ - `"miss_id"`: Has coordinates, missing id
20
+ - `"miss_attributes"`: Has id, missing coordinates
21
+
22
+ #### Creation patterns
23
+
24
+ ```python
25
+ # With coordinates (lazy load id)
26
+ mut = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
27
+
28
+ # With id (lazy load attributes)
29
+ mut = Mutation(id=123)
30
+
31
+ # With both
32
+ mut = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
33
+ ```
34
+
35
+ #### Methods
36
+
37
+ **Instance methods:**
38
+ ```python
39
+ mut.fetch_id_from_db() # Get id from coordinates
40
+ mut.fetch_attributes_from_db() # Get coordinates from id
41
+ mut.ensure_in_db() # Create if missing, return id
42
+ ```
43
+
44
+ **Class methods (batch):**
45
+ ```python
46
+ Mutation.fetch_ids_from_db_batch(mutations)
47
+ Mutation.fetch_attributes_from_db_batch(mutations)
48
+ Mutation.ensure_in_db_batch(mutations)
49
+ ```
50
+
51
+ ## Usage in modules with OutputDescription (fully automatic)
52
+
53
+ `OutputDescription` is a base class that provides automatic DB insertion for module outputs.
54
+
55
+ ```python
56
+ from pydantic import Field
57
+ from typing import ClassVar, List
58
+ from evoseer_utils import OutputDescription, DbConnection, Mutation
59
+
60
+ class MyModuleOutput(OutputDescription):
61
+ table_name: ClassVar[str] = "tool_mymodule"
62
+ db_fields: ClassVar[List[str]] = ["my_score", "my_prediction"]
63
+
64
+ my_score: float = Field(..., description="Module score")
65
+ my_prediction: str = Field(..., description="Prediction")
66
+
67
+ # Setup
68
+ DbConnection.set_db_path("mutations.db")
69
+
70
+ # Single insertion (automatic table creation + mutation insertion)
71
+ output = MyModuleOutput(
72
+ mutation=Mutation(chrom=17, pos=7577548, ref="C", alt="T"),
73
+ version="1.0.0", # Required field (free text)
74
+ my_score=0.85,
75
+ my_prediction="pathogenic"
76
+ )
77
+ output.insert_to_db() # Creates table if needed, ensures mutation exists, inserts
78
+
79
+ # Batch insertion
80
+ outputs = [...]
81
+ MyModuleOutput.insert_batch_to_db(outputs)
82
+ ```
83
+
84
+ **What happens automatically:**
85
+ - Table creation with correct SQL types (inferred from Python types)
86
+ - Mutation insertion/lookup
87
+ - Index creation on mutation_id
88
+ - `version` field automatically added to table and insertion
89
+ - INSERT OR REPLACE (idempotent)
90
+
91
+ **Note:** `version` field is required in all OutputDescription subclasses. Format is free text.
92
+
93
+ ## Chromosome encoding
94
+
95
+ - Autosomes: `1-22`
96
+ - X: `23`
97
+ - Y: `24`
98
+
99
+ Helper functions:
100
+
101
+ ```python
102
+ from evoseer_utils.mutations import chrom_to_int, int_to_chrom
103
+
104
+ chrom_to_int("chr17") # 17
105
+ chrom_to_int("chrX") # 23
106
+ int_to_chrom(23) # "chrX"
107
+ ```
108
+
109
+ ## Tests
110
+
111
+ ```bash
112
+ # From project root
113
+ .venv/bin/python3 libs/tests/test_mutations_lib.py
114
+
115
+ # Or use the test runner
116
+ libs/tests/run_tests.sh
117
+ ```
118
+
119
+ ## Examples
120
+
121
+ ```bash
122
+ python3 example_mutations_lib.py
123
+ python3 modules/boostdm/output_description_example.py
124
+ ```
@@ -0,0 +1,5 @@
1
+ from evoseer_utils.db_connection import DbConnection
2
+ from evoseer_utils.mutations import Mutation
3
+ from evoseer_utils.output_description import OutputDescription
4
+
5
+ __all__ = ["DbConnection", "Mutation", "OutputDescription"]
@@ -0,0 +1,55 @@
1
+ -- SQLite schema for genomic mutation annotations
2
+ -- Simple, modular, optimized for ML
3
+ -- Chromosome convention: 1-22 (autosomes), 23=X, 24=Y
4
+
5
+ -- Central mutations table
6
+ CREATE TABLE mutations (
7
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
8
+ chrom INTEGER NOT NULL, -- 1-22, 23=X, 24=Y
9
+ pos INTEGER NOT NULL,
10
+ ref TEXT NOT NULL,
11
+ alt TEXT NOT NULL,
12
+ UNIQUE(chrom, pos, ref, alt)
13
+ );
14
+ CREATE INDEX idx_mutations_location ON mutations(chrom, pos);
15
+
16
+ -- Genes table (from the canonical GTF)
17
+ CREATE TABLE genes (
18
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
19
+ gene_id TEXT NOT NULL UNIQUE, -- ENSG...
20
+ gene_name TEXT, -- Symbol (e.g. TP53)
21
+ chrom INTEGER NOT NULL, -- 1-22, 23=X, 24=Y
22
+ start INTEGER NOT NULL, -- Gene start
23
+ end INTEGER NOT NULL, -- Gene end
24
+ strand TEXT NOT NULL, -- + or -
25
+ tss INTEGER NOT NULL -- Transcription Start Site (used for the promoter)
26
+ );
27
+ CREATE INDEX idx_genes_location ON genes(chrom, start, end);
28
+ CREATE INDEX idx_genes_name ON genes(gene_name);
29
+
30
+ -- Genomic features table (from the GTF)
31
+ -- Stores the permanent coordinates of exons, introns, UTRs and promoters
32
+ CREATE TABLE genomic_features (
33
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
34
+ gene_id INTEGER NOT NULL,
35
+ annotation_type TEXT NOT NULL, -- 'exon', 'intron', 'UTR5', 'UTR3', 'promoter'
36
+ feature_start INTEGER NOT NULL,
37
+ feature_end INTEGER NOT NULL,
38
+ transcript_id TEXT, -- ID of the canonical transcript
39
+ FOREIGN KEY (gene_id) REFERENCES genes(id) ON DELETE CASCADE
40
+ );
41
+ CREATE INDEX idx_genomic_features_gene ON genomic_features(gene_id);
42
+ CREATE INDEX idx_genomic_features_location ON genomic_features(feature_start, feature_end);
43
+
44
+ -- Link table: mutation <-> genomic_feature
45
+ -- A mutation can hit several features (e.g. an exon of 2 overlapping genes)
46
+ CREATE TABLE mutation_annotations (
47
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
48
+ mutation_id INTEGER NOT NULL,
49
+ feature_id INTEGER, -- NULL if intergenic
50
+ FOREIGN KEY (mutation_id) REFERENCES mutations(id) ON DELETE CASCADE,
51
+ FOREIGN KEY (feature_id) REFERENCES genomic_features(id) ON DELETE CASCADE,
52
+ UNIQUE(mutation_id, feature_id) -- Avoid duplicates
53
+ );
54
+ CREATE INDEX idx_mutation_annotations_mutation ON mutation_annotations(mutation_id);
55
+ CREATE INDEX idx_mutation_annotations_feature ON mutation_annotations(feature_id);
@@ -0,0 +1,80 @@
1
+ """
2
+ Gestion de la connexion à la base de données SQLite (singleton)
3
+ """
4
+
5
+ import sqlite3
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+
10
class DbConnection:
    """
    Holder for a single shared SQLite connection.

    All state is stored on the class, so the classmethods are called directly
    and no instance is needed.

    Usage:
        DbConnection.set_db_path("mutations.db")
        conn = DbConnection.get_connection()
    """

    _instance: Optional["DbConnection"] = None
    _connection: Optional[sqlite3.Connection] = None
    _db_path: Optional[str] = None

    def __new__(cls):
        # Instantiating the class always hands back the same object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @classmethod
    def set_db_path(cls, db_path: str) -> None:
        """
        Configure the path of the SQLite database file.

        Setting a path different from the current one closes any open
        connection so the next get_connection() call opens the new file.

        Args:
            db_path: Path to the SQLite file
        """
        if cls._db_path == db_path:
            return

        cls.close()
        cls._db_path = db_path

    @classmethod
    def get_connection(cls) -> sqlite3.Connection:
        """
        Return the shared SQLite connection, opening it on first use.

        Returns:
            SQLite connection whose rows are sqlite3.Row objects

        Raises:
            RuntimeError: If no database path has been configured
            FileNotFoundError: If the configured database file does not exist
        """
        if cls._db_path is None:
            raise RuntimeError("Database path not set. Call DbConnection.set_db_path() first.")

        if not Path(cls._db_path).exists():
            raise FileNotFoundError(
                f"Database file not found: {cls._db_path}. " f"Run init_database.py first."
            )

        if cls._connection is None:
            connection = sqlite3.connect(cls._db_path)
            connection.row_factory = sqlite3.Row  # Access columns by name
            # SQLite leaves foreign-key enforcement off on every new
            # connection; without this the ON DELETE CASCADE clauses declared
            # on the tables are silently ignored.
            connection.execute("PRAGMA foreign_keys = ON")
            cls._connection = connection

        return cls._connection

    @classmethod
    def close(cls) -> None:
        """Close the shared connection if one is open."""
        if cls._connection is not None:
            cls._connection.close()
            cls._connection = None

    @classmethod
    def is_configured(cls) -> bool:
        """Return True once a database path has been set."""
        return cls._db_path is not None
@@ -0,0 +1,406 @@
1
+ """
2
+ Modèle Pydantic pour les mutations génomiques
3
+ """
4
+
5
+ from typing import Literal, Optional
6
+
7
+ from pydantic import BaseModel, field_validator, model_validator
8
+
9
+ from .db_connection import DbConnection
10
+
11
+
12
def chrom_to_int(chrom_str: str) -> Optional[int]:
    """Convert a chromosome label to its integer code.

    The label may carry a ``chr`` prefix in any letter case or none at all
    (``"chr17"``, ``"CHR17"``, ``"17"``, ``"chrX"``, ``"x"``). Non-string
    values are converted with ``str()`` first.

    Args:
        chrom_str: Chromosome label.

    Returns:
        23 for X, 24 for Y, the parsed integer for numeric labels, and None
        for mitochondrial labels (M / MT) or anything that is not an integer.
    """
    label = str(chrom_str).upper()
    # The label is upper-cased at this point, so the prefix to drop is "CHR".
    if label.startswith("CHR"):
        label = label[3:]

    if label == "X":
        return 23
    if label == "Y":
        return 24
    if label in ("M", "MT"):
        return None

    try:
        return int(label)
    except ValueError:
        return None
26
+
27
+
28
def int_to_chrom(chrom_int: int) -> str:
    """Render an integer chromosome code as a ``chr``-prefixed label.

    23 becomes "chrX" and 24 becomes "chrY"; any other value is formatted as
    ``chr<value>`` without further checks.
    """
    if chrom_int == 23:
        suffix = "X"
    elif chrom_int == 24:
        suffix = "Y"
    else:
        suffix = chrom_int
    return f"chr{suffix}"
36
+
37
+
38
class Mutation(BaseModel):
    """
    Genomic mutation identified by a database id, by coordinates, or by both.

    Attributes:
        id: Row id of the mutation in the ``mutations`` table
        chrom: Chromosome code (1-22, 23=X, 24=Y), normalized via chrom_to_int
        pos: Genomic position
        ref: Reference allele
        alt: Alternate allele

    Validation:
        - Either id is provided
        - Or all of (chrom, pos, ref, alt) are provided
        - Or both

    Usage:
        # Id only (attributes loaded on demand)
        mutation = Mutation(id=123)
        mutation.fetch_attributes_from_db()

        # Coordinates only (id loaded on demand)
        mutation = Mutation(chrom=17, pos=7577548, ref="C", alt="T")
        mutation.fetch_id_from_db()

        # Everything
        mutation = Mutation(id=123, chrom=17, pos=7577548, ref="C", alt="T")
    """

    id: Optional[int] = None
    chrom: Optional[int] = None
    pos: Optional[int] = None
    ref: Optional[str] = None
    alt: Optional[str] = None

    @field_validator("chrom", mode="before")
    @classmethod
    def normalize_chrom(cls, v):
        """Convert the raw chromosome value to its integer code before validation."""
        return chrom_to_int(v)

    def _coords(self) -> tuple:
        """Return (chrom, pos, ref, alt) as a tuple."""
        return (self.chrom, self.pos, self.ref, self.alt)

    def _has_coords(self) -> bool:
        """Return True when chrom, pos, ref and alt are all set."""
        return all(value is not None for value in self._coords())

    @model_validator(mode="after")
    def validate_mutation(self):
        """
        Check that the model has an id, a complete set of coordinates, or both.

        Raises:
            ValueError: If neither id nor complete coordinates are given, or
                if only some of the coordinates are given
        """
        has_coords = self._has_coords()

        if self.id is None and not has_coords:
            raise ValueError("Must provide either 'id' or all of (chrom, pos, ref, alt)")

        # An id combined with a partial set of coordinates is rejected too.
        partial_coords = any(value is not None for value in self._coords())
        if partial_coords and not has_coords:
            raise ValueError("If providing coordinates, must provide all of (chrom, pos, ref, alt)")

        return self

    @property
    def state(self) -> Literal["full", "miss_id", "miss_attributes"]:
        """
        Describe which parts of the mutation are populated.

        Returns:
            - "full": id and coordinates present
            - "miss_id": coordinates present, id missing
            - "miss_attributes": coordinates incomplete
        """
        if not self._has_coords():
            return "miss_attributes"
        return "miss_id" if self.id is None else "full"

    def fetch_id_from_db(self) -> Optional[int]:
        """
        Look up the mutation id in the DB by (chrom, pos, ref, alt).

        Updates self.id when a matching row is found.

        Returns:
            Id of the mutation, or None if no row matches

        Raises:
            ValueError: If the coordinates are incomplete
        """
        if self.state == "miss_attributes":
            raise ValueError("Cannot fetch ID: coordinates (chrom, pos, ref, alt) are required")

        cursor = DbConnection.get_connection().cursor()
        cursor.execute(
            """
            SELECT id FROM mutations
            WHERE chrom=? AND pos=? AND ref=? AND alt=?
            """,
            self._coords(),
        )

        row = cursor.fetchone()
        if row:
            self.id = row["id"]
            return self.id

        return None

    def fetch_attributes_from_db(self) -> bool:
        """
        Load (chrom, pos, ref, alt) from the DB using the id.

        Returns:
            True if a row was found and the attributes were updated, False otherwise

        Raises:
            ValueError: If the id is not set
        """
        if self.id is None:
            raise ValueError("Cannot fetch attributes: id is required")

        cursor = DbConnection.get_connection().cursor()
        cursor.execute(
            """
            SELECT chrom, pos, ref, alt FROM mutations
            WHERE id=?
            """,
            (self.id,),
        )

        row = cursor.fetchone()
        if row:
            self.chrom = row["chrom"]
            self.pos = row["pos"]
            self.ref = row["ref"]
            self.alt = row["alt"]
            return True

        return False

    def ensure_in_db(self, annotate: bool = True) -> int:
        """
        Make sure the mutation exists in the DB, inserting it if necessary.

        Updates self.id.

        Args:
            annotate: If True, a newly inserted mutation is annotated with
                the genomic features overlapping its position

        Returns:
            Id of the mutation

        Raises:
            ValueError: If the coordinates are incomplete
        """
        if self.state == "miss_attributes":
            raise ValueError("Cannot ensure in DB: coordinates (chrom, pos, ref, alt) are required")

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        # An id supplied by the caller is trusted only if a row with it exists.
        if self.id is not None:
            cursor.execute("SELECT id FROM mutations WHERE id=?", (self.id,))
            if cursor.fetchone():
                return self.id

        # Otherwise look the mutation up by coordinates.
        existing_id = self.fetch_id_from_db()
        if existing_id is not None:
            return existing_id

        # Not found: insert it.
        cursor.execute(
            """
            INSERT INTO mutations (chrom, pos, ref, alt)
            VALUES (?, ?, ?, ?)
            """,
            self._coords(),
        )
        self.id = cursor.lastrowid

        if annotate:
            self._annotate_mutation()

        conn.commit()
        return self.id

    def _annotate_mutation(self) -> None:
        """
        Rebuild the mutation_annotations rows for this mutation.

        Internal helper called after an insert; it does not commit. Does
        nothing when id, chrom or pos is missing.
        """
        if self.id is None or self.chrom is None or self.pos is None:
            return

        cursor = DbConnection.get_connection().cursor()

        # Drop previous annotations before recomputing them.
        cursor.execute("DELETE FROM mutation_annotations WHERE mutation_id = ?", (self.id,))

        # Features on the same chromosome whose interval contains the position.
        cursor.execute(
            """
            SELECT gf.id
            FROM genomic_features gf
            JOIN genes g ON gf.gene_id = g.id
            WHERE g.chrom = ? AND gf.feature_start <= ? AND gf.feature_end >= ?
            """,
            (self.chrom, self.pos, self.pos),
        )
        feature_ids = [row["id"] for row in cursor.fetchall()]

        if feature_ids:
            for feature_id in feature_ids:
                cursor.execute(
                    """
                    INSERT INTO mutation_annotations (mutation_id, feature_id)
                    VALUES (?, ?)
                    """,
                    (self.id, feature_id),
                )
        else:
            # No overlapping feature: record the mutation as intergenic.
            cursor.execute(
                """
                INSERT INTO mutation_annotations (mutation_id, feature_id)
                VALUES (?, NULL)
                """,
                (self.id,),
            )

    @classmethod
    def fetch_ids_from_db_batch(cls, mutations: list["Mutation"]) -> None:
        """
        Look up the ids of a batch of mutations (modified in place).

        Args:
            mutations: Mutations with complete (chrom, pos, ref, alt)

        Raises:
            ValueError: If a mutation has incomplete coordinates
        """
        cursor = DbConnection.get_connection().cursor()

        for mutation in mutations:
            if mutation.state == "miss_attributes":
                raise ValueError(f"Mutation {mutation} missing coordinates")

            cursor.execute(
                """
                SELECT id FROM mutations
                WHERE chrom=? AND pos=? AND ref=? AND alt=?
                """,
                mutation._coords(),
            )

            row = cursor.fetchone()
            if row:
                mutation.id = row["id"]

    @classmethod
    def fetch_attributes_from_db_batch(cls, mutations: list["Mutation"]) -> None:
        """
        Load the coordinates of a batch of mutations (modified in place).

        Args:
            mutations: Mutations with an id

        Raises:
            ValueError: If a mutation has no id
        """
        cursor = DbConnection.get_connection().cursor()

        for mutation in mutations:
            if mutation.id is None:
                raise ValueError(f"Mutation {mutation} missing id")

            cursor.execute(
                """
                SELECT chrom, pos, ref, alt FROM mutations
                WHERE id=?
                """,
                (mutation.id,),
            )

            row = cursor.fetchone()
            if row:
                mutation.chrom = row["chrom"]
                mutation.pos = row["pos"]
                mutation.ref = row["ref"]
                mutation.alt = row["alt"]

    @classmethod
    def ensure_in_db_batch(cls, mutations: list["Mutation"], annotate: bool = True) -> None:
        """
        Make sure every mutation of the batch exists in the DB.

        Mutations are modified in place to receive their ids. Several entries
        with the same coordinates share a single inserted row.

        Args:
            mutations: Mutations with complete (chrom, pos, ref, alt)
            annotate: If True, newly inserted mutations are annotated with
                the genomic features overlapping their position

        Raises:
            ValueError: If a mutation has incomplete coordinates
        """
        # First pick up the ids of mutations already stored.
        cls.fetch_ids_from_db_batch(mutations)

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        # Ids inserted during this call, keyed by coordinates. Without this a
        # second entry with the same coordinates would be inserted again and
        # violate the UNIQUE(chrom, pos, ref, alt) constraint.
        created: dict[tuple, int] = {}

        for mutation in mutations:
            if mutation.id is not None:
                continue

            key = mutation._coords()
            if key in created:
                mutation.id = created[key]
                continue

            cursor.execute(
                """
                INSERT INTO mutations (chrom, pos, ref, alt)
                VALUES (?, ?, ?, ?)
                """,
                key,
            )
            mutation.id = cursor.lastrowid
            created[key] = mutation.id

            if annotate:
                mutation._annotate_mutation()

        conn.commit()

    def __repr__(self) -> str:
        current_state = self.state
        if current_state == "miss_attributes":
            return f"Mutation(id={self.id}, coords=?)"

        chrom_str = int_to_chrom(self.chrom)
        if current_state == "full":
            return f"Mutation(id={self.id}, {chrom_str}:{self.pos} {self.ref}>{self.alt})"
        return f"Mutation({chrom_str}:{self.pos} {self.ref}>{self.alt}, id=?)"
@@ -0,0 +1,131 @@
1
+ from typing import ClassVar, Optional, get_type_hints
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from evoseer_utils.db_connection import DbConnection
6
+ from evoseer_utils.mutations import Mutation
7
+
8
+
9
class OutputDescription(BaseModel):
    """
    Base model for a module result attached to one mutation, with SQLite persistence.

    Class-level declarations read by this class:
        _table_name: Suffix of the target table; the default table is
            "annot_" + _table_name
        db_fields: Names of the attributes stored as columns
        version: Stored in a "version" column in addition to db_fields
    """

    mutation: Mutation
    version: ClassVar[str]

    _table_name: ClassVar[str]
    db_fields: ClassVar[list[str]]

    @property
    def table_name(self) -> str:
        # The "annot_" prefix allows automatic views based on that prefix.
        return "annot_" + self._table_name

    @classmethod
    def _default_table_name(cls) -> str:
        """
        Resolve the default table name without an instance.

        On the class, ``table_name`` evaluates to the property object rather
        than a string unless a subclass overrides it with a plain string, so
        in the property case the prefixed name is rebuilt from ``_table_name``.
        """
        declared = cls.table_name
        if isinstance(declared, str):
            return declared
        return "annot_" + cls._table_name

    @classmethod
    def _get_all_db_fields(cls) -> list[str]:
        """Return the stored column names: "version" followed by db_fields."""
        return ["version"] + cls.db_fields

    @classmethod
    def _python_type_to_sql(cls, python_type: type) -> str:
        """
        Map a Python annotation to a SQLite column type.

        int and bool map to INTEGER, float to REAL, str to TEXT. A union such
        as Optional[X] is mapped through its first non-None member. Anything
        else falls back to TEXT.
        """
        import types
        from typing import Union, get_args, get_origin

        type_map = {
            int: "INTEGER",
            float: "REAL",
            str: "TEXT",
            bool: "INTEGER",
        }

        origin = get_origin(python_type)
        # types.UnionType (the "X | None" syntax) only exists on Python 3.10+.
        pep604_union = getattr(types, "UnionType", None)
        if origin is Union or (pep604_union is not None and origin is pep604_union):
            members = [arg for arg in get_args(python_type) if arg is not type(None)]
            if members:
                python_type = members[0]

        return type_map.get(python_type, "TEXT")

    @classmethod
    def _ensure_table_exists(cls, table_name: str) -> None:
        """
        Create the output table and its mutation_id index if they do not exist.

        Column types are derived from the class type hints; a field without a
        hint is stored as TEXT.
        """
        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        type_hints = get_type_hints(cls)
        columns = ["id INTEGER PRIMARY KEY AUTOINCREMENT", "mutation_id INTEGER NOT NULL UNIQUE"]

        for field_name in cls._get_all_db_fields():
            sql_type = cls._python_type_to_sql(type_hints.get(field_name, str))
            columns.append(f"{field_name} {sql_type}")

        columns.append("FOREIGN KEY (mutation_id) REFERENCES mutations(id) ON DELETE CASCADE")

        # Identifiers cannot be bound as SQL parameters, so the table and
        # column names are interpolated into the statements.
        cursor.execute(f"""
            CREATE TABLE IF NOT EXISTS {table_name} (
                {', '.join(columns)}
            )
        """)

        cursor.execute(f"""
            CREATE INDEX IF NOT EXISTS idx_{table_name}_mutation
            ON {table_name}(mutation_id)
        """)

        conn.commit()

    def insert_to_db(self, table_name: Optional[str] = None) -> None:
        """
        Insert or replace this output's row.

        Creates the table if needed and makes sure the mutation has a row in
        the mutations table before inserting.

        Args:
            table_name: Target table; defaults to self.table_name
        """
        if table_name is None:
            table_name = self.table_name

        self._ensure_table_exists(table_name)
        self.mutation.ensure_in_db()

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        all_fields = self._get_all_db_fields()
        fields = ["mutation_id"] + all_fields
        values = [self.mutation.id] + [getattr(self, field) for field in all_fields]
        placeholders = ", ".join(["?"] * len(values))

        cursor.execute(
            f"""
            INSERT OR REPLACE INTO {table_name}
            ({', '.join(fields)})
            VALUES ({placeholders})
            """,
            values,
        )

        conn.commit()

    @classmethod
    def insert_batch_to_db(
        cls, outputs: list["OutputDescription"], table_name: Optional[str] = None
    ) -> None:
        """
        Insert or replace the rows of several outputs in one statement batch.

        Creates the table if needed and makes sure every mutation has a row
        in the mutations table before inserting.

        Args:
            outputs: Outputs to store
            table_name: Target table; defaults to the class's default table
        """
        if table_name is None:
            table_name = cls._default_table_name()

        cls._ensure_table_exists(table_name)

        Mutation.ensure_in_db_batch([output.mutation for output in outputs])

        conn = DbConnection.get_connection()
        cursor = conn.cursor()

        all_fields = cls._get_all_db_fields()
        fields = ["mutation_id"] + all_fields
        placeholders = ", ".join(["?"] * len(fields))

        values_list = [
            [output.mutation.id] + [getattr(output, field) for field in all_fields]
            for output in outputs
        ]

        cursor.executemany(
            f"""
            INSERT OR REPLACE INTO {table_name}
            ({', '.join(fields)})
            VALUES ({placeholders})
            """,
            values_list,
        )

        conn.commit()
@@ -0,0 +1,269 @@
1
+ """
2
+ Test utilities for database testing
3
+
4
+ Provides helpers for creating and managing test databases for unit tests.
5
+ """
6
+
7
+ import sqlite3
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ from evoseer_utils.db_connection import DbConnection
13
+
14
+
15
class DatabaseFixture:
    """
    Context manager that creates a temporary SQLite database for tests.

    Usage:
        with DatabaseFixture() as db:
            # Use db.path for database operations
            DbConnection.set_db_path(db.path)
            # Run tests...
        # Database is automatically cleaned up
    """

    def __init__(self, schema_file: Optional[str] = None):
        """
        Initialize the fixture; nothing is created until the context is entered.

        Args:
            schema_file: Optional path to a schema.sql file. If not provided,
                data/schema.sql next to this module is used.
        """
        self.schema_file = schema_file
        self.temp_file = None
        self.path = None
        self._conn = None

    def __enter__(self):
        """Create the temporary database file and load the schema into it."""
        # Only the file name is needed; SQLite opens the file itself.
        self.temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".db", delete=False)
        self.path = self.temp_file.name
        self.temp_file.close()

        if self.schema_file is None:
            self.schema_file = str(Path(__file__).parent / "data" / "schema.sql")

        try:
            self._init_schema()
        except Exception:
            # __exit__ is not called when __enter__ raises, so remove the
            # temporary file here instead of leaving it behind.
            Path(self.path).unlink(missing_ok=True)
            raise

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close open connections and delete the temporary database file."""
        if self._conn is not None:
            self._conn.close()
            # Do not keep handing out a closed connection.
            self._conn = None

        # The DbConnection singleton may still hold the file open.
        DbConnection.close()

        if self.path and Path(self.path).exists():
            Path(self.path).unlink()

    def _init_schema(self):
        """Execute the schema file against the temporary database."""
        # Read as UTF-8 explicitly rather than relying on the platform
        # default encoding.
        with open(self.schema_file, encoding="utf-8") as f:
            schema = f.read()

        conn = sqlite3.connect(self.path)
        try:
            conn.executescript(schema)
            conn.commit()
        finally:
            # Release the file handle even when the schema fails to load.
            conn.close()

    def get_connection(self) -> sqlite3.Connection:
        """
        Get the fixture's own connection to the test database.

        The connection is opened on first use and then reused.

        Returns:
            sqlite3.Connection instance whose rows are sqlite3.Row objects
        """
        if self._conn is None:
            self._conn = sqlite3.connect(self.path)
            self._conn.row_factory = sqlite3.Row

        return self._conn

    def add_test_gene(
        self,
        gene_id: str,
        gene_name: str,
        chrom: int,
        start: int,
        end: int,
        strand: str = "+",
        tss: Optional[int] = None,
    ) -> int:
        """
        Add a test gene to the database.

        Args:
            gene_id: Ensembl gene ID (e.g., ENSG00000141510)
            gene_name: Gene symbol (e.g., TP53)
            chrom: Chromosome (1-24)
            start: Gene start position
            end: Gene end position
            strand: Strand (+ or -)
            tss: Transcription start site (defaults to start)

        Returns:
            Database ID of inserted gene
        """
        if tss is None:
            tss = start

        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO genes (gene_id, gene_name, chrom, start, end, strand, tss)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (gene_id, gene_name, chrom, start, end, strand, tss),
        )

        conn.commit()
        return cursor.lastrowid

    def add_test_feature(
        self,
        gene_id: int,
        annotation_type: str,
        feature_start: int,
        feature_end: int,
        transcript_id: Optional[str] = None,
    ) -> int:
        """
        Add a test genomic feature to the database.

        Args:
            gene_id: Database ID of the gene
            annotation_type: Type (exon, intron, UTR5, UTR3, promoter)
            feature_start: Feature start position
            feature_end: Feature end position
            transcript_id: Optional transcript ID

        Returns:
            Database ID of inserted feature
        """
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO genomic_features
            (gene_id, annotation_type, feature_start, feature_end, transcript_id)
            VALUES (?, ?, ?, ?, ?)
            """,
            (gene_id, annotation_type, feature_start, feature_end, transcript_id),
        )

        conn.commit()
        return cursor.lastrowid

    def add_test_mutation(self, chrom: int, pos: int, ref: str, alt: str) -> int:
        """
        Add a test mutation to the database.

        Args:
            chrom: Chromosome (1-24)
            pos: Genomic position
            ref: Reference allele
            alt: Alternate allele

        Returns:
            Database ID of inserted mutation
        """
        conn = self.get_connection()
        cursor = conn.cursor()

        cursor.execute(
            """
            INSERT INTO mutations (chrom, pos, ref, alt)
            VALUES (?, ?, ?, ?)
            """,
            (chrom, pos, ref, alt),
        )

        conn.commit()
        return cursor.lastrowid
199
+
200
+
201
class DatabaseFixtureWithData(DatabaseFixture):
    """DatabaseFixture variant that starts out with two sample genes and one exon each."""

    def __enter__(self):
        """Create the temporary database, then insert the TP53 and KRAS sample rows."""
        super().__enter__()

        # Each entry pairs a gene with the exon attached to it; the rows are
        # inserted in this order (gene, then its exon).
        samples = (
            (
                {
                    "gene_id": "ENSG00000141510",
                    "gene_name": "TP53",
                    "chrom": 17,
                    "start": 7661779,
                    "end": 7687550,
                    "strand": "-",
                    "tss": 7687550,
                },
                {
                    "annotation_type": "exon",
                    "feature_start": 7577100,
                    "feature_end": 7577600,
                    "transcript_id": "ENST00000269305",
                },
            ),
            (
                {
                    "gene_id": "ENSG00000133703",
                    "gene_name": "KRAS",
                    "chrom": 12,
                    "start": 25205246,
                    "end": 25250936,
                    "strand": "-",
                    "tss": 25250936,
                },
                {
                    "annotation_type": "exon",
                    "feature_start": 25227000,
                    "feature_end": 25227500,
                    "transcript_id": "ENST00000311936",
                },
            ),
        )

        for gene_fields, exon_fields in samples:
            gene_row_id = self.add_test_gene(**gene_fields)
            self.add_test_feature(gene_id=gene_row_id, **exon_fields)

        return self
249
+
250
+
251
def create_test_db(with_sample_data: bool = False) -> DatabaseFixture:
    """
    Build a test-database context manager.

    Args:
        with_sample_data: If True, the returned fixture inserts sample genes
            and features when entered

    Returns:
        DatabaseFixture context manager (a DatabaseFixtureWithData when
        sample data is requested)

    Usage:
        with create_test_db(with_sample_data=True) as db:
            DbConnection.set_db_path(db.path)
            # Run tests...
    """
    fixture_cls = DatabaseFixtureWithData if with_sample_data else DatabaseFixture
    return fixture_cls()
@@ -0,0 +1,52 @@
1
+ [tool.poetry]
2
+ name = "evoseer-utils"
3
+ version = "0.1.4"
4
+ description = "Shared library for mutation management across modules"
5
+ authors = ["benoît de Witte"]
6
+ readme = "README.md"
7
+ packages = [{include = "evoseer_utils"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.9"
11
+ pydantic = "^2.0"
12
+
13
+ [tool.poetry.group.dev.dependencies]
14
+ pytest = "^8.0"
15
+ ruff = "^0.8"
16
+ pre-commit = "^4.0"
17
+
18
+ [build-system]
19
+ requires = ["poetry-core"]
20
+ build-backend = "poetry.core.masonry.api"
21
+
22
+ [tool.pytest.ini_options]
23
+ testpaths = ["tests"]
24
+ python_files = ["test_*.py"]
25
+ python_classes = ["Test*"]
26
+ python_functions = ["test_*"]
27
+ addopts = [
28
+ "-v",
29
+ "--strict-markers",
30
+ "--tb=short",
31
+ ]
32
+
33
+ [tool.ruff]
34
+ line-length = 120
35
+ target-version = "py39"
36
+
37
+ [tool.ruff.lint]
38
+ select = [
39
+ "E", # pycodestyle errors
40
+ "W", # pycodestyle warnings
41
+ "F", # pyflakes
42
+ "I", # isort
43
+ "N", # pep8-naming
44
+ "UP", # pyupgrade
45
+ "B", # flake8-bugbear
46
+ "C4", # flake8-comprehensions
47
+ ]
48
+ ignore = []
49
+
50
+ [tool.ruff.lint.per-file-ignores]
51
+ "__init__.py" = ["F401"] # Allow unused imports in __init__.py
52
+ "tests/*" = ["D"] # Disable docstring requirements in tests