lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/db/models/experiment.py CHANGED
@@ -1,5 +1,7 @@
  from itertools import chain
  import joblib
+ import pandas as pd
+ import os

  from sqlalchemy import (
      Column,
@@ -14,18 +16,18 @@ from sqlalchemy import (
      TIMESTAMP,
      UniqueConstraint,
      func,
+     event,
  )
- from sqlalchemy.orm import relationship, aliased
+ from sqlalchemy.orm import relationship, aliased, mapper
  from sqlalchemy.ext.hybrid import hybrid_property
  from sqlalchemy import func
  from statistics import fmean as mean
  from lecrapaud.db.models.model_selection import ModelSelection
- from lecrapaud.db.models.model_training import ModelTraining
- from lecrapaud.db.models.score import Score
+ from lecrapaud.db.models.model_selection_score import ModelSelectionScore

  from lecrapaud.db.models.base import Base, with_db
  from lecrapaud.db.models.utils import create_association_table
- from lecrapaud.utils import logger, contains_best
+ from lecrapaud.utils import logger, contains_best, strip_timestamp_suffix

  # jointures
  lecrapaud_experiment_target_association = create_association_table(
@@ -51,105 +53,13 @@ class Experiment(Base):
      )
      name = Column(String(255), nullable=False)
      path = Column(String(255))  # we do not have this at creation time
-     type = Column(String(50), nullable=False)
      size = Column(Integer, nullable=False)
      train_size = Column(Integer)
      val_size = Column(Integer)
-
-     # Relationships
-     model_selections = relationship(
-         "ModelSelection",
-         back_populates="experiment",
-         cascade="all, delete-orphan",
-         lazy="selectin",
-     )
-
-     @hybrid_property
-     def best_rmse(self):
-         """Best RMSE score across all model selections and trainings."""
-         # Get the minimum RMSE for each model selection
-         min_scores = [
-             min(
-                 score.rmse
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.rmse is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.rmse is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return min(min_scores) if min_scores else None
-
-     @hybrid_property
-     def best_logloss(self):
-         """Best LogLoss score across all model selections and trainings."""
-         # Get the minimum LogLoss for each model selection
-         min_scores = [
-             min(
-                 score.logloss
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.logloss is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.logloss is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return min(min_scores) if min_scores else None
-
-     @hybrid_property
-     def avg_rmse(self):
-         """Average RMSE score across all model selections and trainings."""
-         # Get the minimum RMSE for each model selection
-         min_scores = [
-             min(
-                 score.rmse
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.rmse is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.rmse is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return mean(min_scores) if min_scores else None
-
-     @hybrid_property
-     def avg_logloss(self):
-         """Average LogLoss score across all model selections and trainings."""
-         # Get the minimum LogLoss for each model selection
-         min_scores = [
-             min(
-                 score.logloss
-                 for mt in ms.model_trainings
-                 for score in mt.score
-                 if score.logloss is not None
-             )
-             for ms in self.model_selections
-             if any(
-                 score.logloss is not None
-                 for mt in ms.model_trainings
-                 for score in mt.score
-             )
-         ]
-         return mean(min_scores) if min_scores else None
-
      test_size = Column(Integer)
-     corr_threshold = Column(Float, nullable=False)
-     max_features = Column(Integer, nullable=False)
-     percentile = Column(Float, nullable=False)
      number_of_groups = Column(Integer)
      list_of_groups = Column(JSON)
+     number_of_targets = Column(Integer)
      start_date = Column(DateTime)
      end_date = Column(DateTime)
      train_start_date = Column(DateTime)
@@ -181,6 +91,142 @@ class Experiment(Base):
          ),
      )

+     # Relationships
+     model_selections = relationship(
+         "ModelSelection",
+         back_populates="experiment",
+         cascade="all, delete-orphan",
+         lazy="selectin",
+     )
+
+     # Hooks
+     # @event.listens_to(Experiment, "after_commit")
+     # def set_score(mapper, connection, target):
+     #     target.score = target.score
+
+     # Properties
+     @hybrid_property
+     def rmse_scores(self):
+         """Best RMSE scores across all model selections, for each target."""
+         # Get the minimum RMSE for each model selection
+         min_scores = [
+             min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
+             for ms in self.model_selections
+             if any(mss.rmse is not None for mss in ms.model_selection_scores)
+         ]
+
+         if not min_scores:
+             # fallback to path if no model_selection_scores found
+             for target in self.targets:
+                 path = f"{self.path}/{target.name}/scores_tracking.csv"
+                 if not os.path.exists(path):
+                     continue
+                 score = pd.read_csv(path)
+                 if "RMSE" not in score.columns:
+                     continue
+                 min_scores.append(min(score["RMSE"]))
+
+         return min_scores
+
+     @hybrid_property
+     def logloss_scores(self):
+         """Best LogLoss scores across all model selections, for each target."""
+         # Get the minimum LogLoss for each model selection
+         min_scores = [
+             min(
+                 mss.logloss
+                 for mss in ms.model_selection_scores
+                 if mss.logloss is not None
+             )
+             for ms in self.model_selections
+             if any(mss.logloss is not None for mss in ms.model_selection_scores)
+         ]
+
+         if not min_scores:
+             # fallback to path if no model_selection_scores found
+             for target in self.targets:
+                 path = f"{self.path}/{target.name}/scores_tracking.csv"
+                 if not os.path.exists(path):
+                     continue
+                 score = pd.read_csv(path)
+                 if "LOGLOSS" not in score.columns:
+                     continue
+                 min_scores.append(min(score["LOGLOSS"]))
+
+         return min_scores
+
+     @hybrid_property
+     def best_rmse(self):
+         """Best RMSE score within targets, across all model selections."""
+         return min(self.rmse_scores) if self.rmse_scores else None
+
+     @hybrid_property
+     def best_logloss(self):
+         """Best LogLoss score within targets, across all model selections."""
+         return min(self.logloss_scores) if self.logloss_scores else None
+
+     @hybrid_property
+     def score(self):
+         # Calculate a combined score: average of normalized best RMSE and LogLoss per target
+         # This ensures we're comparing apples to apples by normalizing the scores
+
+         if not self.rmse_scores and not self.logloss_scores:
+             logger.error("No experiments found with RMSE or LogLoss scores.")
+             return None
+
+         # Normalize scores (subtract min and divide by range)
+         # Guard against division by zero when only one observation or all equal
+         # Gather all the data from similar experiments to calculate the range
+
+         similar_experiments = Experiment.get_all_by_name(name=self.name)
+         if not similar_experiments:
+             similar_experiments = [self]
+         rmse_scores = [
+             score for exp in similar_experiments for score in exp.rmse_scores or []
+         ]
+         logloss_scores = [
+             score for exp in similar_experiments for score in exp.logloss_scores or []
+         ]
+
+         min_rmse = min(rmse_scores)
+         max_rmse = max(rmse_scores)
+         range_rmse = max_rmse - min_rmse
+         min_logloss = min(logloss_scores)
+         max_logloss = max(logloss_scores)
+         range_logloss = max_logloss - min_logloss
+
+         # Calculate combined score for each experiment
+         normed_scores = []
+         for rmse_score in self.rmse_scores:
+             # Normalize both scores (safe when range == 0)
+             norm_rmse = 0.0 if range_rmse == 0 else (rmse_score - min_rmse) / range_rmse
+             normed_scores.append(norm_rmse)
+
+         for logloss_score in self.logloss_scores:
+             norm_logloss = (
+                 0.0
+                 if range_logloss == 0
+                 else (logloss_score - min_logloss) / range_logloss
+             )
+             normed_scores.append(norm_logloss)
+
+         # Calculate score (average of normalized scores)
+         score = sum(normed_scores) / len(normed_scores)
+
+         return score
+
+     @hybrid_property
+     def avg_rmse(self):
+         """Average within targets of best RMSE score across all model selections."""
+         return mean(self.rmse_scores) if self.rmse_scores else None
+
+     @hybrid_property
+     def avg_logloss(self):
+         """Average within targets of best LogLoss score across all model selections."""
+         return mean(self.logloss_scores) if self.logloss_scores else None
+
+     # Class methods
      @classmethod
      @with_db
      def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
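The new `score` property min-max normalizes each target's best RMSE and LogLoss against the range seen across similarly named experiments, then averages the normalized values (lower is better). A minimal standalone sketch of that normalization, using made-up numbers rather than real experiment data:

    # Toy illustration of the combined-score normalization (hypothetical values).
    rmse_scores = [0.42, 0.55]            # best RMSE per target for this experiment
    logloss_scores = [0.31]               # best LogLoss per target for this experiment
    all_rmse = [0.40, 0.42, 0.55, 0.60]   # pooled across similar experiments
    all_logloss = [0.25, 0.31, 0.50]

    def normalize(value, pool):
        lo, hi = min(pool), max(pool)
        return 0.0 if hi == lo else (value - lo) / (hi - lo)  # guard against range == 0

    normed = [normalize(v, all_rmse) for v in rmse_scores]
    normed += [normalize(v, all_logloss) for v in logloss_scores]
    score = sum(normed) / len(normed)     # lower is better
    print(round(score, 3))

When a pool contains a single value the range is zero, so the guard returns 0.0 instead of dividing by zero, mirroring the `range == 0` branches in the property above.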
@@ -194,10 +240,11 @@ class Experiment(Base):
          Returns:
              Experiment or None: The most recent matching experiment or None if not found
          """
+         base_name = strip_timestamp_suffix(name)
          if name is not None:
              return (
                  db.query(cls)
-                 .filter(cls.name.ilike(f"%{name}%"))
+                 .filter(cls.name.ilike(f"%{base_name}%"))
                  .order_by(cls.created_at.desc())
                  .limit(limit)
                  .all()
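Both name lookups now run the requested name through `strip_timestamp_suffix` before the ILIKE match, so differently timestamped runs of the same experiment resolve to one group. The helper lives in `lecrapaud/utils.py` and its body is not shown in this diff; a plausible sketch, assuming the suffix is a trailing `_YYYYMMDD_HHMMSS`-style timestamp:

    import re

    def strip_timestamp_suffix(name: str) -> str:
        # Hypothetical implementation: drop a trailing "_20251028_200637"-style suffix.
        return re.sub(r"_\d{8}_\d{6}$", "", name) if name else name

    print(strip_timestamp_suffix("my_experiment_20251028_200637"))  # -> "my_experiment"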
@@ -217,27 +264,24 @@
          Returns:
              Experiment or None: The most recent matching experiment or None if not found
          """
+         base_name = strip_timestamp_suffix(name)
          return (
              db.query(cls)
-             .filter(cls.name.ilike(f"%{name}%"))
+             .filter(cls.name.ilike(f"%{base_name}%"))
              .order_by(cls.created_at.desc())
              .first()
          )

      @classmethod
      @with_db
-     def get_best_by_score(cls, name: str, metric="both", db=None):
+     def get_best_by_score(cls, name: str, db=None):
          """
-         Find the experiment with the best score based on average RMSE, LogLoss, or both.
-
-         Args:
-             metric (str): 'rmse', 'logloss', or 'both' to determine which score to optimize
-             db: SQLAlchemy session
+         Find the experiment with the best normalized score across RMSE and LogLoss.

          Returns:
              Experiment or None: The experiment with the best score or None if not found
          """
-         experiments = db.query(cls).filter(cls.name.ilike(f"%{name}%")).all()
+         experiments = Experiment.get_all_by_name(name=name)
          if not experiments:
              logger.error(f"No experiments found with the given name: {name}")
              return None
@@ -255,66 +299,22 @@
             )
             return None

-         if metric == "both":
-             # Calculate a combined score: average of normalized RMSE and LogLoss
-             # This ensures we're comparing apples to apples by normalizing the scores
-
-             # Get all scores
-             rmse_scores = [e.avg_rmse for e in experiments if e.avg_rmse is not None]
-             logloss_scores = [
-                 e.avg_logloss for e in experiments if e.avg_logloss is not None
-             ]
-
-             if not rmse_scores or not logloss_scores:
-                 logger.error(
-                     "No experiments found with both RMSE and LogLoss scores. Maybe try with only one metric."
-                 )
-                 return None
-
-             # Normalize scores (subtract min and divide by range)
-             min_rmse = min(rmse_scores)
-             range_rmse = max(rmse_scores) - min_rmse
-             min_logloss = min(logloss_scores)
-             range_logloss = max(logloss_scores) - min_logloss
-
-             # Calculate combined score for each experiment
-             experiment_scores = []
-             for experiment in experiments:
-                 if experiment.avg_rmse is None or experiment.avg_logloss is None:
-                     continue
-
-                 # Normalize both scores
-                 norm_rmse = (experiment.avg_rmse - min_rmse) / range_rmse
-                 norm_logloss = (experiment.avg_logloss - min_logloss) / range_logloss
+         scored_experiments = []
+         for experiment in experiments:
+             score = experiment.score
+             if score is not None:
+                 scored_experiments.append((experiment, score))

-                 # Calculate combined score (average of normalized scores)
-                 combined_score = (norm_rmse + norm_logloss) / 2
-                 experiment_scores.append((experiment, combined_score))
-
-             # Sort by combined score (ascending since lower is better)
-             experiment_scores.sort(key=lambda x: x[1])
-
-             return experiment_scores[0][0] if experiment_scores else None
-
-         elif metric == "rmse" or metric == "logloss":
-             # For single metric case (rmse or logloss)
-
-             # Filter out experiments without scores and sort by the selected metric
-             filtered_experiments = []
-             for exp in experiments:
-                 score = exp.avg_rmse if metric == "rmse" else exp.avg_logloss
-                 if score is not None:
-                     filtered_experiments.append((exp, score))
+         if not scored_experiments:
+             logger.error(
+                 f"No experiments with calculable scores found with the given name: {name}"
+             )
+             return None

-             if not filtered_experiments:
-                 return None
-
-             # Sort by score (ascending since lower is better)
-             filtered_experiments.sort(key=lambda x: x[1])
-             return filtered_experiments[0][0]
-         else:
-             raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
+         scored_experiments.sort(key=lambda x: x[1])
+         return scored_experiments[0][0]

+     # Instance methods
      def best_score(self, target_number: int) -> dict:
          """
          Returns the scores for the best model of the specified target.
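With the `metric` argument removed, `get_best_by_score` now ranks experiments by the combined `score` property alone and returns the lowest one. A usage sketch (the experiment name is a placeholder; the session is injected by `@with_db`):

    from lecrapaud.db.models.experiment import Experiment

    best = Experiment.get_best_by_score(name="my_experiment")  # hypothetical name
    if best is not None:
        print(best.name, best.best_rmse, best.best_logloss)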
@@ -342,7 +342,7 @@ class Experiment(Base):
              (ms for ms in self.model_selections if ms.target_id == target.id), None
          )

-         if not best_model_selection or not best_model_selection.model_trainings:
+         if not best_model_selection or not best_model_selection.model_selection_scores:
              return {
                  "experiment_name": self.name,
                  "target_number": target_number,
@@ -350,22 +350,31 @@ class Experiment(Base):
                  "scores": {},
              }

-         # Get the best model training (assuming the first one is the best)
-         best_training = best_model_selection.model_trainings[0]
-
-         # Get the validation score for this training
-         validation_scores = [s for s in best_training.score if s.type == "validation"]
+         # Get the best model score based on lowest logloss or rmse
+         model_scores = best_model_selection.model_selection_scores

-         if not validation_scores:
+         # Determine if we should use logloss or rmse based on what's available
+         if any(ms.logloss is not None for ms in model_scores):
+             # Classification: find lowest logloss
+             best_score = min(
+                 (ms for ms in model_scores if ms.logloss is not None),
+                 key=lambda x: x.logloss,
+             )
+         elif any(ms.rmse is not None for ms in model_scores):
+             # Regression: find lowest rmse
+             best_score = min(
+                 (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
+             )
+         else:
              return {
                  "experiment_name": self.name,
                  "target_number": target_number,
-                 "error": "No validation scores found for the best model",
+                 "error": "No scores found for the best model",
                  "scores": {},
              }

-         # Get all available metrics from the first validation score
-         score = validation_scores[0]
+         # Use the best score found
+         score = best_score
          available_metrics = [
              "rmse",
              "mae",
@@ -386,13 +395,9 @@ class Experiment(Base):

          # Get the model info
          model_info = {
-             "model_type": (
-                 best_training.model.model_type if best_training.model else "unknown"
-             ),
-             "model_name": (
-                 best_training.model.name if best_training.model else "unknown"
-             ),
-             "training_time_seconds": best_training.training_time,
+             "model_type": (score.model.model_type if score.model else "unknown"),
+             "model_name": (score.model.name if score.model else "unknown"),
+             "training_time_seconds": score.training_time,
          }

          return {
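`best_score` no longer assumes the first `ModelTraining` is the best: it picks the `ModelSelectionScore` with the lowest logloss (classification) or RMSE (regression) and reads the model info straight from that row. A hedged usage sketch, assuming `experiment` is an already-loaded `Experiment` with at least one model selection for the target:

    result = experiment.best_score(target_number=1)
    print(result["experiment_name"], result["target_number"])
    print(result["scores"])  # metric values recorded for the best model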
@@ -402,7 +407,10 @@ class Experiment(Base):
              "scores": scores,
          }

-     def get_features(self, target_number: int):
+     @with_db
+     def get_features(self, target_number: int, db=None):
+         # Ensure we have a fresh instance attached to the session
+         self = db.merge(self)
          targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
          if targets:
              target_id = targets[0].id
@@ -418,7 +426,12 @@ class Experiment(Base):
          features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
          return features

-     def get_all_features(self, date_column: str = None, group_column: str = None):
+     @with_db
+     def get_all_features(
+         self, date_column: str = None, group_column: str = None, db=None
+     ):
+         # Ensure we have a fresh instance attached to the session
+         self = db.merge(self)
          target_idx = [target.id for target in self.targets]
          _all_features = chain.from_iterable(
              [f.name for f in fs.features]
lecrapaud/db/models/feature_selection.py CHANGED
@@ -115,7 +115,4 @@ class FeatureSelection(Base):
              if feature not in self.features:
                  self.features.append(feature)

-         db.flush()
-         db.refresh(self)
-         print(self.features)
          return self
lecrapaud/db/models/feature_selection_rank.py CHANGED
@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
              name="uq_feature_selection_rank_composite",
          ),
      )
-
-     @classmethod
-     @with_db
-     def bulk_upsert(cls, rows, db=None):
-         stmt = insert(cls).values(rows)
-
-         update_fields = {
-             key: stmt.inserted[key]
-             for key in rows[0]
-             if key not in ("feature_selection_id", "feature_id", "method")
-         }
-
-         stmt = stmt.on_duplicate_key_update(**update_fields)
-
-         db.execute(stmt)
-         db.commit()
-
-         return len(rows)
lecrapaud/db/models/model_selection.py CHANGED
@@ -54,8 +54,8 @@ class ModelSelection(Base):
      )

      best_model = relationship("Model", lazy="selectin")
-     model_trainings = relationship(
-         "ModelTraining",
+     model_selection_scores = relationship(
+         "ModelSelectionScore",
          back_populates="model_selection",
          cascade="all, delete-orphan",
          lazy="selectin",
lecrapaud/db/models/{score.py → model_selection_score.py} RENAMED
@@ -3,10 +3,11 @@ from sqlalchemy import (
      Integer,
      String,
      Float,
+     JSON,
      ForeignKey,
      BigInteger,
      TIMESTAMP,
-     JSON,
+     UniqueConstraint,
  )
  from sqlalchemy import func
  from sqlalchemy.orm import relationship
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
  from lecrapaud.config import LECRAPAUD_TABLE_PREFIX


- class Score(Base):
+ class ModelSelectionScore(Base):
+     __tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
+
      id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
      created_at = Column(
          TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
@@ -25,10 +28,21 @@ class Score(Base):
          onupdate=func.now(),
          nullable=False,
      )
-     type = Column(
-         String(50), nullable=False
-     )  # either hyperopts or validation or crossval
+
+     # From ModelTraining
+     best_params = Column(JSON)
+     model_path = Column(String(255))
      training_time = Column(Integer)
+     model_id = Column(
+         BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
+     )
+     model_selection_id = Column(
+         BigInteger,
+         ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
+         nullable=False,
+     )
+
+     # From Score (excluding type and training_time which is already in ModelTraining)
      eval_data_std = Column(Float)
      rmse = Column(Float)
      rmse_std_ratio = Column(Float)
@@ -50,12 +64,15 @@ class Score(Base):
      precision_at_threshold = Column(Float)
      recall_at_threshold = Column(Float)
      f1_at_threshold = Column(Float)
-     model_training_id = Column(
-         BigInteger,
-         ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
-         nullable=False,
-     )

-     model_trainings = relationship(
-         "ModelTraining", back_populates="score", lazy="selectin"
+     # Relationships
+     model = relationship("Model", lazy="selectin")
+     model_selection = relationship(
+         "ModelSelection", back_populates="model_selection_scores", lazy="selectin"
      )
+
+     __table_args__ = (
+         UniqueConstraint(
+             "model_id", "model_selection_id", name="uq_model_selection_score_composite"
+         ),
+     )
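`ModelSelectionScore` now holds one row per (model, model selection) pair, enforced by the composite unique constraint, and folds in the columns that previously lived on `ModelTraining`. A query sketch for fetching the lowest-RMSE row of a given selection (`db` and `selection_id` are placeholders for an open session and an existing id):

    from lecrapaud.db.models.model_selection_score import ModelSelectionScore

    best = (
        db.query(ModelSelectionScore)
        .filter(
            ModelSelectionScore.model_selection_id == selection_id,
            ModelSelectionScore.rmse.isnot(None),
        )
        .order_by(ModelSelectionScore.rmse.asc())
        .first()
    )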
lecrapaud/db/session.py CHANGED
@@ -27,6 +27,9 @@

  def init_db(uri: str = None):
      global _engine, _SessionLocal, DATABASE_URL, DB_URI
+     if _SessionLocal is not None:
+         return
+
      if uri:
          if "mysql://" in uri and "pymysql://" not in uri:
              uri = uri.replace("mysql://", "mysql+pymysql://")
@@ -73,6 +76,7 @@
          autocommit=False,
          autoflush=False,
          bind=_engine,
+         expire_on_commit=False,  # Prevent detached instance errors
      )

      # Step 5: Apply Alembic migrations programmatically
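The early return makes `init_db` idempotent once `_SessionLocal` is set, and `expire_on_commit=False` keeps loaded attributes readable after a commit, which is what the new `db.merge(self)` calls in `Experiment.get_features` / `get_all_features` rely on. A minimal sketch of the same pattern outside lecrapaud, assuming an in-memory SQLite engine:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    _SessionLocal = None

    def init_db(uri: str = "sqlite:///:memory:"):
        global _SessionLocal
        if _SessionLocal is not None:  # already initialised: second call is a no-op
            return
        engine = create_engine(uri)
        _SessionLocal = sessionmaker(
            autoflush=False,
            bind=engine,
            expire_on_commit=False,  # instances stay usable after commit
        )

    init_db()
    init_db()  # returns immediately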