lecrapaud 0.19.3__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

@@ -20,8 +20,7 @@ from sqlalchemy.ext.hybrid import hybrid_property
 from sqlalchemy import func
 from statistics import fmean as mean
 from lecrapaud.db.models.model_selection import ModelSelection
-from lecrapaud.db.models.model_training import ModelTraining
-from lecrapaud.db.models.score import Score
+from lecrapaud.db.models.model_selection_score import ModelSelectionScore

 from lecrapaud.db.models.base import Base, with_db
 from lecrapaud.db.models.utils import create_association_table
@@ -70,16 +69,14 @@ class Experiment(Base):
         # Get the minimum RMSE for each model selection
         min_scores = [
             min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
+                mss.rmse
+                for mss in ms.model_selection_scores
+                if mss.rmse is not None
             )
             for ms in self.model_selections
             if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.rmse is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return min(min_scores) if min_scores else None
@@ -90,16 +87,14 @@ class Experiment(Base):
         # Get the minimum LogLoss for each model selection
         min_scores = [
             min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
+                mss.logloss
+                for mss in ms.model_selection_scores
+                if mss.logloss is not None
             )
             for ms in self.model_selections
             if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.logloss is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return min(min_scores) if min_scores else None
@@ -110,16 +105,14 @@ class Experiment(Base):
         # Get the minimum RMSE for each model selection
         min_scores = [
             min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
+                mss.rmse
+                for mss in ms.model_selection_scores
+                if mss.rmse is not None
             )
             for ms in self.model_selections
             if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.rmse is not None
+                for mss in ms.model_selection_scores
             )
         ]
         return mean(min_scores) if min_scores else None
@@ -130,16 +123,14 @@ class Experiment(Base):
         # Get the minimum LogLoss for each model selection
         min_scores = [
            min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
+                mss.logloss
+                for mss in ms.model_selection_scores
+                if mss.logloss is not None
            )
            for ms in self.model_selections
            if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
+                mss.logloss is not None
+                for mss in ms.model_selection_scores
            )
        ]
        return mean(min_scores) if min_scores else None
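
The same rewrite repeats across all four aggregation properties (min and mean of RMSE and LogLoss): the nested walk over model_trainings and their scores collapses into one pass over model_selection_scores. A minimal self-contained sketch of the new pattern, with SimpleNamespace objects standing in for the ORM rows (values are made up):

    from statistics import fmean as mean
    from types import SimpleNamespace

    # Fabricated stand-ins for ModelSelection rows and their scores
    ms1 = SimpleNamespace(model_selection_scores=[
        SimpleNamespace(rmse=0.42),
        SimpleNamespace(rmse=None),
        SimpleNamespace(rmse=0.37),
    ])
    ms2 = SimpleNamespace(model_selection_scores=[SimpleNamespace(rmse=None)])
    model_selections = [ms1, ms2]

    # Selections with no usable metric are filtered out before min() runs,
    # so min() never sees an empty generator
    min_scores = [
        min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
        for ms in model_selections
        if any(mss.rmse is not None for mss in ms.model_selection_scores)
    ]

    print(min(min_scores) if min_scores else None)   # best rmse -> 0.37
    print(mean(min_scores) if min_scores else None)  # mean of per-selection minima -> 0.37
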
@@ -353,7 +344,7 @@ class Experiment(Base):
             (ms for ms in self.model_selections if ms.target_id == target.id), None
         )

-        if not best_model_selection or not best_model_selection.model_trainings:
+        if not best_model_selection or not best_model_selection.model_selection_scores:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
@@ -361,22 +352,32 @@ class Experiment(Base):
                 "scores": {},
             }

-        # Get the best model training (assuming the first one is the best)
-        best_training = best_model_selection.model_trainings[0]
-
-        # Get the validation score for this training
-        validation_scores = [s for s in best_training.score if s.type == "validation"]
-
-        if not validation_scores:
+        # Get the best model score based on lowest logloss or rmse
+        model_scores = best_model_selection.model_selection_scores
+
+        # Determine if we should use logloss or rmse based on what's available
+        if any(ms.logloss is not None for ms in model_scores):
+            # Classification: find lowest logloss
+            best_score = min(
+                (ms for ms in model_scores if ms.logloss is not None),
+                key=lambda x: x.logloss
+            )
+        elif any(ms.rmse is not None for ms in model_scores):
+            # Regression: find lowest rmse
+            best_score = min(
+                (ms for ms in model_scores if ms.rmse is not None),
+                key=lambda x: x.rmse
+            )
+        else:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
-                "error": "No validation scores found for the best model",
+                "error": "No scores found for the best model",
                 "scores": {},
             }

-        # Get all available metrics from the first validation score
-        score = validation_scores[0]
+        # Use the best score found
+        score = best_score
         available_metrics = [
             "rmse",
             "mae",
@@ -398,12 +399,12 @@ class Experiment(Base):
         # Get the model info
         model_info = {
             "model_type": (
-                best_training.model.model_type if best_training.model else "unknown"
+                score.model.model_type if score.model else "unknown"
             ),
             "model_name": (
-                best_training.model.name if best_training.model else "unknown"
+                score.model.name if score.model else "unknown"
             ),
-            "training_time_seconds": best_training.training_time,
+            "training_time_seconds": score.training_time,
         }

         return {
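
The new selection logic scans the scores twice per metric: once in the any() guard and once in min(). A sketch of that pattern next to an equivalent single-pass variant using min()'s default argument (illustrative only, not the package's code):

    from types import SimpleNamespace

    model_scores = [
        SimpleNamespace(logloss=0.31, rmse=None),
        SimpleNamespace(logloss=0.27, rmse=None),
        SimpleNamespace(logloss=None, rmse=None),
    ]

    # Pattern from the diff: guard with any(), then min(..., key=...)
    best_score = None
    if any(ms.logloss is not None for ms in model_scores):
        best_score = min(
            (ms for ms in model_scores if ms.logloss is not None),
            key=lambda x: x.logloss,
        )

    # Equivalent single pass: default=None makes the guard unnecessary
    best_score = min(
        (ms for ms in model_scores if ms.logloss is not None),
        key=lambda x: x.logloss,
        default=None,
    )
    print(best_score.logloss if best_score else None)  # -> 0.27
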
@@ -413,7 +414,10 @@ class Experiment(Base):
             "scores": scores,
         }

-    def get_features(self, target_number: int):
+    @with_db
+    def get_features(self, target_number: int, db=None):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
         if targets:
             target_id = targets[0].id
@@ -429,7 +433,10 @@ class Experiment(Base):
         features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
         return features

-    def get_all_features(self, date_column: str = None, group_column: str = None):
+    @with_db
+    def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         target_idx = [target.id for target in self.targets]
         _all_features = chain.from_iterable(
             [f.name for f in fs.features]
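
Both getters are now wrapped in @with_db and call db.merge(self), which returns a copy of the (possibly detached) instance attached to the injected session; that is why the result is reassigned to self before the lazy relationships are read. The decorator itself is not part of this diff; a plausible sketch of its shape, under the assumption that it injects a session when the caller passes none:

    from functools import wraps
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    # Hypothetical session factory; the real one lives in lecrapaud/db/session.py
    SessionLocal = sessionmaker(bind=create_engine("sqlite://"), expire_on_commit=False)

    def with_db(fn):
        """Pass the caller's session through, or open one for the call (assumed shape)."""
        @wraps(fn)
        def wrapper(*args, db=None, **kwargs):
            if db is not None:
                return fn(*args, db=db, **kwargs)
            with SessionLocal() as session:
                return fn(*args, db=session, **kwargs)
        return wrapper
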
@@ -115,7 +115,4 @@ class FeatureSelection(Base):
         if feature not in self.features:
             self.features.append(feature)

-        db.flush()
-        db.refresh(self)
-        print(self.features)
         return self
@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
             name="uq_feature_selection_rank_composite",
         ),
     )
-
-    @classmethod
-    @with_db
-    def bulk_upsert(cls, rows, db=None):
-        stmt = insert(cls).values(rows)
-
-        update_fields = {
-            key: stmt.inserted[key]
-            for key in rows[0]
-            if key not in ("feature_selection_id", "feature_id", "method")
-        }
-
-        stmt = stmt.on_duplicate_key_update(**update_fields)
-
-        db.execute(stmt)
-        db.commit()
-
-        return len(rows)
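
This removal drops a MySQL-specific upsert bound to one model. Call sites later in the diff switch from upsert_bulk(match_fields=[...], ...) to bulk_upsert(...) with no match fields, which suggests a shared base-class helper that lets MySQL resolve conflicts via the table's unique constraints. A hedged sketch of what such a generic helper could look like, assumed rather than taken from lecrapaud's actual implementation:

    from sqlalchemy.dialects.mysql import insert

    # Sketch of a generic helper assumed to live on the shared Base class.
    # MySQL's ON DUPLICATE KEY UPDATE resolves conflicts via the table's
    # primary key / unique constraints, so no match_fields list is needed.
    def bulk_upsert(cls, rows, db):
        stmt = insert(cls).values(rows)
        update_fields = {
            c.name: stmt.inserted[c.name]
            for c in cls.__table__.columns
            if not c.primary_key
        }
        db.execute(stmt.on_duplicate_key_update(**update_fields))
        db.commit()
        return len(rows)
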
@@ -54,8 +54,8 @@ class ModelSelection(Base):
     )

     best_model = relationship("Model", lazy="selectin")
-    model_trainings = relationship(
-        "ModelTraining",
+    model_selection_scores = relationship(
+        "ModelSelectionScore",
         back_populates="model_selection",
         cascade="all, delete-orphan",
         lazy="selectin",
@@ -3,10 +3,11 @@ from sqlalchemy import (
     Integer,
     String,
     Float,
+    JSON,
     ForeignKey,
     BigInteger,
     TIMESTAMP,
-    JSON,
+    UniqueConstraint,
 )
 from sqlalchemy import func
 from sqlalchemy.orm import relationship
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
 from lecrapaud.config import LECRAPAUD_TABLE_PREFIX


-class Score(Base):
+class ModelSelectionScore(Base):
+    __tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
+
     id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
     created_at = Column(
         TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
@@ -25,10 +28,21 @@ class Score(Base):
         onupdate=func.now(),
         nullable=False,
     )
-    type = Column(
-        String(50), nullable=False
-    )  # either hyperopts or validation or crossval
+
+    # From ModelTraining
+    best_params = Column(JSON)
+    model_path = Column(String(255))
     training_time = Column(Integer)
+    model_id = Column(
+        BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
+    )
+    model_selection_id = Column(
+        BigInteger,
+        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
+        nullable=False,
+    )
+
+    # From Score (excluding type and training_time which is already in ModelTraining)
     eval_data_std = Column(Float)
     rmse = Column(Float)
     rmse_std_ratio = Column(Float)
@@ -50,12 +64,15 @@ class Score(Base):
     precision_at_threshold = Column(Float)
     recall_at_threshold = Column(Float)
     f1_at_threshold = Column(Float)
-    model_training_id = Column(
-        BigInteger,
-        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
-        nullable=False,
-    )

-    model_trainings = relationship(
-        "ModelTraining", back_populates="score", lazy="selectin"
+    # Relationships
+    model = relationship("Model", lazy="selectin")
+    model_selection = relationship(
+        "ModelSelection", back_populates="model_selection_scores", lazy="selectin"
     )
+
+    __table_args__ = (
+        UniqueConstraint(
+            "model_id", "model_selection_id", name="uq_model_selection_score_composite"
+        ),
+    )
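
ModelSelectionScore folds the former ModelTraining and Score tables into one row per (model, model_selection) pair, with the composite unique constraint enforcing that cardinality. A short usage sketch against the new model, assuming an open session db and a hypothetical selection id:

    from lecrapaud.db.models.model_selection_score import ModelSelectionScore

    # `db` is an open SQLAlchemy session; 42 is a hypothetical selection id
    scores = (
        db.query(ModelSelectionScore)
        .filter(ModelSelectionScore.model_selection_id == 42)
        .all()
    )

    # One row per trained model; pick the classifier with the lowest logloss
    best = min(
        (s for s in scores if s.logloss is not None),
        key=lambda s: s.logloss,
        default=None,
    )
    if best is not None:
        print(best.model.name, best.training_time, best.best_params)
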
lecrapaud/db/session.py CHANGED
@@ -73,6 +73,7 @@ def init_db(uri: str = None):
         autocommit=False,
         autoflush=False,
         bind=_engine,
+        expire_on_commit=False,  # Prevent detached instance errors
     )

     # Step 5: Apply Alembic migrations programmatically
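
By default SQLAlchemy expires all loaded attributes on commit, so reading an attribute after the session is closed triggers a refresh on a detached instance and raises DetachedInstanceError; expire_on_commit=False keeps the loaded values readable. A standalone reproduction of the behavior this flag avoids (toy model, not lecrapaud code):

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class Thing(Base):
        __tablename__ = "things"
        id = Column(Integer, primary_key=True)
        name = Column(String(50))

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    # With expire_on_commit=True (the default), reading thing.name after the
    # session closes would fail with DetachedInstanceError
    Session = sessionmaker(bind=engine, expire_on_commit=False)
    with Session() as session:
        thing = Thing(name="toad")
        session.add(thing)
        session.commit()
    print(thing.name)  # still readable after the session is gone -> "toad"
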
lecrapaud/experiment.py CHANGED
@@ -50,8 +50,8 @@ def create_experiment(
     os.makedirs(preprocessing_dir, exist_ok=True)
     os.makedirs(data_dir, exist_ok=True)

+    # Create or update experiment (without targets relation)
     experiment = Experiment.upsert(
-        match_fields=["name"],
         db=db,
         name=experiment_name,
         path=Path(experiment_dir).resolve(),
@@ -62,7 +62,6 @@ def create_experiment(
         max_features=max_features,
         **groups,
         **dates,
-        targets=targets,
         context={
             "corr_threshold": corr_threshold,
             "percentile": percentile,
@@ -73,5 +72,9 @@ def create_experiment(
             **kwargs,
         },
     )
-
+
+    # Set targets relationship after creation/update
+    experiment.targets = targets
+    experiment.save(db=db)
+
     return experiment
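
Since targets is a relationship rather than a column, it is now assigned on the managed instance after the scalar upsert and persisted via save(). A standalone toy showing why plain assignment suffices: on flush, SQLAlchemy reconciles the many-to-many association table to match the assigned collection (toy models, assumed semantics):

    from sqlalchemy import Column, ForeignKey, Integer, String, Table, create_engine
    from sqlalchemy.orm import declarative_base, relationship, sessionmaker

    Base = declarative_base()

    assoc = Table(
        "experiment_targets", Base.metadata,
        Column("experiment_id", ForeignKey("experiments.id"), primary_key=True),
        Column("target_id", ForeignKey("targets.id"), primary_key=True),
    )

    class Target(Base):
        __tablename__ = "targets"
        id = Column(Integer, primary_key=True)
        name = Column(String(50))

    class Experiment(Base):
        __tablename__ = "experiments"
        id = Column(Integer, primary_key=True)
        name = Column(String(50))
        targets = relationship("Target", secondary=assoc)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine, expire_on_commit=False)

    with Session() as db:
        exp = Experiment(name="exp-1")
        db.add(exp)
        # Assigning the collection reconciles the association table on flush:
        # rows are inserted/deleted so it matches the assigned list exactly
        exp.targets = [Target(name="TARGET_1"), Target(name="TARGET_2")]
        db.commit()
        print([t.name for t in exp.targets])  # -> ['TARGET_1', 'TARGET_2']
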
@@ -483,8 +483,8 @@ class PreprocessFeature:
             f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
         )

-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes and dates
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -545,8 +545,8 @@ class PreprocessFeature:
         for name, data in zip(["train", "val", "test"], [train, val, test]):
             logger.info(f"{data.shape} {name} data")

-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -838,8 +838,7 @@ class PreprocessFeature:

         # Upsert features in bulk if we have any features
         if all_feature_names:
-            Feature.upsert_bulk(
-                match_fields=["name"],
+            Feature.bulk_upsert(
                 name=all_feature_names,
                 type=all_feature_types,
             )
@@ -855,9 +854,7 @@ class PreprocessFeature:
             for target in target_names
         ]

-        Target.upsert_bulk(
-            match_fields=["name"], name=target_names, type=target_types
-        )
+        Target.bulk_upsert(name=target_names, type=target_types)

         # Get all the upserted objects
         targets = Target.filter(name__in=target_names)
@@ -115,7 +115,6 @@ class FeatureSelectionEngine:
             max_features = self.max_features

         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )