lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/db/models/experiment.py
CHANGED

@@ -1,5 +1,7 @@
 from itertools import chain
 import joblib
+import pandas as pd
+import os

 from sqlalchemy import (
     Column,
@@ -14,18 +16,18 @@ from sqlalchemy import (
     TIMESTAMP,
     UniqueConstraint,
     func,
+    event,
 )
-from sqlalchemy.orm import relationship, aliased
+from sqlalchemy.orm import relationship, aliased, mapper
 from sqlalchemy.ext.hybrid import hybrid_property
 from sqlalchemy import func
 from statistics import fmean as mean
 from lecrapaud.db.models.model_selection import ModelSelection
-from lecrapaud.db.models.
-from lecrapaud.db.models.score import Score
+from lecrapaud.db.models.model_selection_score import ModelSelectionScore

 from lecrapaud.db.models.base import Base, with_db
 from lecrapaud.db.models.utils import create_association_table
-from lecrapaud.utils import logger, contains_best
+from lecrapaud.utils import logger, contains_best, strip_timestamp_suffix

 # jointures
 lecrapaud_experiment_target_association = create_association_table(
@@ -49,107 +51,15 @@ class Experiment(Base):
         onupdate=func.now(),
         nullable=False,
     )
-    name = Column(String(
+    name = Column(String(255), nullable=False)
     path = Column(String(255))  # we do not have this at creation time
-    type = Column(String(50), nullable=False)
     size = Column(Integer, nullable=False)
     train_size = Column(Integer)
     val_size = Column(Integer)
-
-    # Relationships
-    model_selections = relationship(
-        "ModelSelection",
-        back_populates="experiment",
-        cascade="all, delete-orphan",
-        lazy="selectin",
-    )
-
-    @hybrid_property
-    def best_rmse(self):
-        """Best RMSE score across all model selections and trainings."""
-        # Get the minimum RMSE for each model selection
-        min_scores = [
-            min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
-            )
-            for ms in self.model_selections
-            if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
-            )
-        ]
-        return min(min_scores) if min_scores else None
-
-    @hybrid_property
-    def best_logloss(self):
-        """Best LogLoss score across all model selections and trainings."""
-        # Get the minimum LogLoss for each model selection
-        min_scores = [
-            min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
-            )
-            for ms in self.model_selections
-            if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
-            )
-        ]
-        return min(min_scores) if min_scores else None
-
-    @hybrid_property
-    def avg_rmse(self):
-        """Average RMSE score across all model selections and trainings."""
-        # Get the minimum RMSE for each model selection
-        min_scores = [
-            min(
-                score.rmse
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.rmse is not None
-            )
-            for ms in self.model_selections
-            if any(
-                score.rmse is not None
-                for mt in ms.model_trainings
-                for score in mt.score
-            )
-        ]
-        return mean(min_scores) if min_scores else None
-
-    @hybrid_property
-    def avg_logloss(self):
-        """Average LogLoss score across all model selections and trainings."""
-        # Get the minimum LogLoss for each model selection
-        min_scores = [
-            min(
-                score.logloss
-                for mt in ms.model_trainings
-                for score in mt.score
-                if score.logloss is not None
-            )
-            for ms in self.model_selections
-            if any(
-                score.logloss is not None
-                for mt in ms.model_trainings
-                for score in mt.score
-            )
-        ]
-        return mean(min_scores) if min_scores else None
-
     test_size = Column(Integer)
-    corr_threshold = Column(Float, nullable=False)
-    max_features = Column(Integer, nullable=False)
-    percentile = Column(Float, nullable=False)
     number_of_groups = Column(Integer)
     list_of_groups = Column(JSON)
+    number_of_targets = Column(Integer)
     start_date = Column(DateTime)
    end_date = Column(DateTime)
     train_start_date = Column(DateTime)
@@ -181,6 +91,142 @@ class Experiment(Base):
         ),
     )

+    # Relationships
+    model_selections = relationship(
+        "ModelSelection",
+        back_populates="experiment",
+        cascade="all, delete-orphan",
+        lazy="selectin",
+    )
+
+    # Hooks
+    # @event.listens_to(Experiment, "after_commit")
+    # def set_score(mapper, connection, target):
+    #     target.score = target.score
+
+    # Properties
+    @hybrid_property
+    def rmse_scores(self):
+        """best RMSE scores across all model selections, for each targets."""
+        # Get the minimum RMSE for each model selection
+        min_scores = [
+            min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
+            for ms in self.model_selections
+            if any(mss.rmse is not None for mss in ms.model_selection_scores)
+        ]
+
+        if not min_scores:
+            # fallback to path if no model_selection_scores found
+            for target in self.targets:
+                path = f"{self.path}/{target.name}/scores_tracking.csv"
+                if not os.path.exists(path):
+                    continue
+                score = pd.read_csv(path)
+                if "RMSE" not in score.columns:
+                    continue
+                min_scores.append(min(score["RMSE"]))
+
+        return min_scores
+
+    @hybrid_property
+    def logloss_scores(self):
+        """best LogLoss scores across all model selections, for each targets."""
+        # Get the minimum LogLoss for each model selection
+        min_scores = [
+            min(
+                mss.logloss
+                for mss in ms.model_selection_scores
+                if mss.logloss is not None
+            )
+            for ms in self.model_selections
+            if any(mss.logloss is not None for mss in ms.model_selection_scores)
+        ]
+
+        if not min_scores:
+            # fallback to path if no model_selection_scores found
+            for target in self.targets:
+                path = f"{self.path}/{target.name}/scores_tracking.csv"
+                if not os.path.exists(path):
+                    continue
+                score = pd.read_csv(path)
+                if "LOGLOSS" not in score.columns:
+                    continue
+                min_scores.append(min(score["LOGLOSS"]))
+
+        return min_scores
+
+    @hybrid_property
+    def best_rmse(self):
+        """Best RMSE score within targets, across all model selections."""
+        return min(self.rmse_scores) if self.rmse_scores else None
+
+    @hybrid_property
+    def best_logloss(self):
+        """Best LogLoss score within targets, across all model selections."""
+        return min(self.logloss_scores) if self.logloss_scores else None
+
+    @hybrid_property
+    def score(self):
+        # Calculate a combined score: average of normalized best RMSE and LogLoss per targets
+        # This ensures we're comparing apples to apples by normalizing the scores
+
+        if not self.rmse_scores and not self.logloss_scores:
+            logger.error("No experiments found with RMSE or LogLoss scores.")
+            return None
+
+        # Normalize scores (subtract min and divide by range)
+        # Guard against division by zero when only one observation or all equal
+        # Let's gather all the data from similar experiments to calculate the range
+
+        similar_experiments = Experiment.get_all_by_name(name=self.name)
+        if not similar_experiments:
+            similar_experiments = [self]
+        rmse_scores = [
+            score for exp in similar_experiments for score in exp.rmse_scores or []
+        ]
+        logloss_scores = [
+            score for exp in similar_experiments for score in exp.logloss_scores or []
+        ]
+
+        min_rmse = min(rmse_scores)
+        max_rmse = max(rmse_scores)
+        range_rmse = max_rmse - min_rmse
+        min_logloss = min(logloss_scores)
+        max_logloss = max(logloss_scores)
+        range_logloss = max_logloss - min_logloss
+
+        # Calculate combined score for each experiment
+        normed_scores = []
+        for rmse_score in self.rmse_scores:
+
+            # Normalize both scores (safe when range == 0)
+            norm_rmse = 0.0 if range_rmse == 0 else (rmse_score - min_rmse) / range_rmse
+            normed_scores.append(norm_rmse)
+
+        for logloss_score in self.logloss_scores:
+            norm_logloss = (
+                0.0
+                if range_logloss == 0
+                else (logloss_score - min_logloss) / range_logloss
+            )
+            normed_scores.append(norm_logloss)
+
+        # Calculate score (average of normalized scores)
+        score = sum(normed_scores) / len(normed_scores)
+
+        return score
+
+    @hybrid_property
+    def avg_rmse(self):
+        """Average within targets of best RMSE score across all model selections ."""
+        return mean(self.rmse_scores) if self.rmse_scores else None
+
+    @hybrid_property
+    def avg_logloss(self):
+        """Average within targets of best LogLoss score across all model selections ."""
+        return mean(self.logloss_scores) if self.logloss_scores else None
+
+    # Class methods
     @classmethod
     @with_db
     def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
@@ -194,10 +240,11 @@ class Experiment(Base):
         Returns:
             Experiment or None: The most recent matching experiment or None if not found
         """
+        base_name = strip_timestamp_suffix(name)
         if name is not None:
             return (
                 db.query(cls)
-                .filter(cls.name.ilike(f"%{name}%"))
+                .filter(cls.name.ilike(f"%{base_name}%"))
                 .order_by(cls.created_at.desc())
                 .limit(limit)
                 .all()
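This lookup (and `get_last_by_name` below) now runs the requested name through `strip_timestamp_suffix` before the `ILIKE` match, so timestamped re-runs of the same experiment resolve to one base name. The helper's implementation is not part of this diff; a minimal sketch of what it could look like, assuming a trailing `_YYYYMMDD_HHMM`-style suffix (the regex and suffix format are assumptions, not the package's actual code):

```python
import re

def strip_timestamp_suffix(name: str | None) -> str | None:
    """Hypothetical helper: drop a trailing '_YYYYMMDD_HHMM[SS]'-style suffix
    so differently timestamped runs of one experiment share a base name."""
    if name is None:
        return None
    return re.sub(r"_\d{8}_\d{4,6}$", "", name)

# e.g. strip_timestamp_suffix("my_exp_20251026_1727") -> "my_exp"
```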
@@ -217,27 +264,24 @@ class Experiment(Base):
         Returns:
             Experiment or None: The most recent matching experiment or None if not found
         """
+        base_name = strip_timestamp_suffix(name)
         return (
             db.query(cls)
-            .filter(cls.name.ilike(f"%{name}%"))
+            .filter(cls.name.ilike(f"%{base_name}%"))
             .order_by(cls.created_at.desc())
             .first()
         )

     @classmethod
     @with_db
-    def get_best_by_score(cls, name: str,
+    def get_best_by_score(cls, name: str, db=None):
         """
-        Find the experiment with the best score
-
-        Args:
-            metric (str): 'rmse', 'logloss', or 'both' to determine which score to optimize
-            db: SQLAlchemy session
+        Find the experiment with the best normalized score across RMSE and LogLoss.

         Returns:
             Experiment or None: The experiment with the best score or None if not found
         """
-        experiments =
+        experiments = Experiment.get_all_by_name(name=name)
         if not experiments:
             logger.error(f"No experiments found with the given name: {name}")
             return None
@@ -255,66 +299,22 @@ class Experiment(Base):
             )
             return None

-
-
-
-
-
-            rmse_scores = [e.avg_rmse for e in experiments if e.avg_rmse is not None]
-            logloss_scores = [
-                e.avg_logloss for e in experiments if e.avg_logloss is not None
-            ]
-
-            if not rmse_scores or not logloss_scores:
-                logger.error(
-                    "No experiments found with both RMSE and LogLoss scores. Maybe try with only one metric."
-                )
-                return None
-
-            # Normalize scores (subtract min and divide by range)
-            min_rmse = min(rmse_scores)
-            range_rmse = max(rmse_scores) - min_rmse
-            min_logloss = min(logloss_scores)
-            range_logloss = max(logloss_scores) - min_logloss
-
-            # Calculate combined score for each experiment
-            experiment_scores = []
-            for experiment in experiments:
-                if experiment.avg_rmse is None or experiment.avg_logloss is None:
-                    continue
-
-                # Normalize both scores
-                norm_rmse = (experiment.avg_rmse - min_rmse) / range_rmse
-                norm_logloss = (experiment.avg_logloss - min_logloss) / range_logloss
+        scored_experiments = []
+        for experiment in experiments:
+            score = experiment.score
+            if score is not None:
+                scored_experiments.append((experiment, score))

-
-
-
-
-
-            experiment_scores.sort(key=lambda x: x[1])
-
-            return experiment_scores[0][0] if experiment_scores else None
-
-        elif metric == "rmse" or metric == "logloss":
-            # For single metric case (rmse or logloss)
-
-            # Filter out experiments without scores and sort by the selected metric
-            filtered_experiments = []
-            for exp in experiments:
-                score = exp.avg_rmse if metric == "rmse" else exp.avg_logloss
-                if score is not None:
-                    filtered_experiments.append((exp, score))
+        if not scored_experiments:
+            logger.error(
+                f"No experiments with calculable scores found with the given name: {name}"
+            )
+            return None

-
-
-
-            # Sort by score (ascending since lower is better)
-            filtered_experiments.sort(key=lambda x: x[1])
-            return filtered_experiments[0][0]
-        else:
-            raise ValueError("Invalid metric. Must be 'rmse', 'logloss', or 'both'.")
+        scored_experiments.sort(key=lambda x: x[1])
+        return scored_experiments[0][0]

+    # Instance methods
     def best_score(self, target_number: int) -> dict:
         """
         Returns the scores for the best model of the specified target.
@@ -342,7 +342,7 @@ class Experiment(Base):
             (ms for ms in self.model_selections if ms.target_id == target.id), None
         )

-        if not best_model_selection or not best_model_selection.model_trainings:
+        if not best_model_selection or not best_model_selection.model_selection_scores:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
@@ -350,22 +350,31 @@ class Experiment(Base):
                 "scores": {},
             }

-        # Get the best model
-
-
-        # Get the validation score for this training
-        validation_scores = [s for s in best_training.score if s.type == "validation"]
+        # Get the best model score based on lowest logloss or rmse
+        model_scores = best_model_selection.model_selection_scores

-        if not validation_scores:
+        # Determine if we should use logloss or rmse based on what's available
+        if any(ms.logloss is not None for ms in model_scores):
+            # Classification: find lowest logloss
+            best_score = min(
+                (ms for ms in model_scores if ms.logloss is not None),
+                key=lambda x: x.logloss,
+            )
+        elif any(ms.rmse is not None for ms in model_scores):
+            # Regression: find lowest rmse
+            best_score = min(
+                (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
+            )
+        else:
             return {
                 "experiment_name": self.name,
                 "target_number": target_number,
-                "error": "No
+                "error": "No scores found for the best model",
                 "scores": {},
             }

-        #
-        score =
+        # Use the best score found
+        score = best_score
         available_metrics = [
             "rmse",
             "mae",
@@ -386,13 +395,9 @@ class Experiment(Base):

         # Get the model info
         model_info = {
-            "model_type": (
-
-
-            "model_name": (
-                best_training.model.name if best_training.model else "unknown"
-            ),
-            "training_time_seconds": best_training.training_time,
+            "model_type": (score.model.model_type if score.model else "unknown"),
+            "model_name": (score.model.name if score.model else "unknown"),
+            "training_time_seconds": score.training_time,
         }

         return {
@@ -402,7 +407,10 @@ class Experiment(Base):
             "scores": scores,
         }

-
+    @with_db
+    def get_features(self, target_number: int, db=None):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
         if targets:
             target_id = targets[0].id
@@ -418,7 +426,12 @@ class Experiment(Base):
         features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
         return features

-
+    @with_db
+    def get_all_features(
+        self, date_column: str = None, group_column: str = None, db=None
+    ):
+        # Ensure we have a fresh instance attached to the session
+        self = db.merge(self)
         target_idx = [target.id for target in self.targets]
         _all_features = chain.from_iterable(
             [f.name for f in fs.features]
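Taken together, the new properties give each experiment a single comparable number: the best RMSE and LogLoss per target are min-max normalized against the pool of same-named experiments, then averaged (lower is better). A standalone sketch of that arithmetic on plain floats; the function and argument names here are illustrative, not part of the package:

```python
def combined_score(
    rmse_scores: list[float],
    logloss_scores: list[float],
    pool_rmse: list[float],
    pool_logloss: list[float],
) -> float | None:
    """Average of min-max normalized scores; lower is better."""
    if not rmse_scores and not logloss_scores:
        return None

    def normalize(value: float, pool: list[float]) -> float:
        lo, hi = min(pool), max(pool)
        # guard against a zero range (single observation or all values equal)
        return 0.0 if hi == lo else (value - lo) / (hi - lo)

    normed = [normalize(r, pool_rmse) for r in rmse_scores]
    normed += [normalize(l, pool_logloss) for l in logloss_scores]
    return sum(normed) / len(normed)

# combined_score([0.5], [0.2], pool_rmse=[0.5, 1.0], pool_logloss=[0.2, 0.4]) -> 0.0
```

Normalizing against the pooled scores of similar experiments is what makes RMSE (unbounded) and LogLoss (also unbounded but on a different scale) averageable at all.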
lecrapaud/db/models/feature_selection_rank.py
CHANGED

@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
             name="uq_feature_selection_rank_composite",
         ),
     )
-
-    @classmethod
-    @with_db
-    def bulk_upsert(cls, rows, db=None):
-        stmt = insert(cls).values(rows)
-
-        update_fields = {
-            key: stmt.inserted[key]
-            for key in rows[0]
-            if key not in ("feature_selection_id", "feature_id", "method")
-        }
-
-        stmt = stmt.on_duplicate_key_update(**update_fields)
-
-        db.execute(stmt)
-        db.commit()
-
-        return len(rows)
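The removed `bulk_upsert` relied on MySQL's `INSERT ... ON DUPLICATE KEY UPDATE` through SQLAlchemy's MySQL dialect. For reference, a minimal standalone sketch of that pattern; the generic `model` and `db` parameters are illustrative, not the package's API:

```python
from sqlalchemy.dialects.mysql import insert

def bulk_upsert(model, rows: list[dict], db,
                key_fields=("feature_selection_id", "feature_id", "method")):
    """Insert rows; update every non-key column when the unique key already exists."""
    stmt = insert(model).values(rows)
    # stmt.inserted refers to the values that would have been inserted
    update_fields = {
        key: stmt.inserted[key] for key in rows[0] if key not in key_fields
    }
    db.execute(stmt.on_duplicate_key_update(**update_fields))
    db.commit()
    return len(rows)
```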
lecrapaud/db/models/model_selection.py
CHANGED

@@ -54,8 +54,8 @@ class ModelSelection(Base):
     )

     best_model = relationship("Model", lazy="selectin")
-    model_trainings = relationship(
-        "ModelTraining",
+    model_selection_scores = relationship(
+        "ModelSelectionScore",
         back_populates="model_selection",
         cascade="all, delete-orphan",
         lazy="selectin",
lecrapaud/db/models/{score.py → model_selection_score.py}
RENAMED

@@ -3,10 +3,11 @@ from sqlalchemy import (
     Integer,
     String,
     Float,
+    JSON,
     ForeignKey,
     BigInteger,
     TIMESTAMP,
-
+    UniqueConstraint,
 )
 from sqlalchemy import func
 from sqlalchemy.orm import relationship
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
 from lecrapaud.config import LECRAPAUD_TABLE_PREFIX


-class Score(Base):
+class ModelSelectionScore(Base):
+    __tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
+
     id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
     created_at = Column(
         TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
@@ -25,10 +28,21 @@ class Score(Base):
         onupdate=func.now(),
         nullable=False,
     )
-
-
-
+
+    # From ModelTraining
+    best_params = Column(JSON)
+    model_path = Column(String(255))
     training_time = Column(Integer)
+    model_id = Column(
+        BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
+    )
+    model_selection_id = Column(
+        BigInteger,
+        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
+        nullable=False,
+    )
+
+    # From Score (excluding type and training_time which is already in ModelTraining)
     eval_data_std = Column(Float)
     rmse = Column(Float)
     rmse_std_ratio = Column(Float)
@@ -45,16 +59,20 @@ class Score(Base):
     recall = Column(Float)
     f1 = Column(Float)
     roc_auc = Column(Float)
+    avg_precision = Column(Float)
     thresholds = Column(JSON)
     precision_at_threshold = Column(Float)
     recall_at_threshold = Column(Float)
     f1_at_threshold = Column(Float)
-    model_training_id = Column(
-        BigInteger,
-        ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
-        nullable=False,
-    )

-
-
+    # Relationships
+    model = relationship("Model", lazy="selectin")
+    model_selection = relationship(
+        "ModelSelection", back_populates="model_selection_scores", lazy="selectin"
     )
+
+    __table_args__ = (
+        UniqueConstraint(
+            "model_id", "model_selection_id", name="uq_model_selection_score_composite"
+        ),
+    )
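The former `Score` and `ModelTraining` tables are merged into `ModelSelectionScore`, one row per (model, model selection) pair, enforced by the new unique constraint. A hedged sketch of a lookup against the new table; the helper function itself is illustrative, not part of the package:

```python
from lecrapaud.db.session import get_db
from lecrapaud.db.models.model_selection_score import ModelSelectionScore

def best_score_for_selection(model_selection_id: int):
    """Hypothetical helper: lowest-logloss row for one model selection.
    Regression targets would order by rmse instead."""
    with get_db() as db:
        return (
            db.query(ModelSelectionScore)
            .filter(
                ModelSelectionScore.model_selection_id == model_selection_id,
                ModelSelectionScore.logloss.isnot(None),
            )
            .order_by(ModelSelectionScore.logloss.asc())
            .first()
        )
```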
lecrapaud/db/session.py
CHANGED
@@ -12,6 +12,7 @@ from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB

 _engine = None
 _SessionLocal = None
+
 if DB_URI:
     if "mysql://" in DB_URI and "pymysql://" not in DB_URI:
         DB_URI = DB_URI.replace("mysql://", "mysql+pymysql://")
@@ -26,6 +27,9 @@ else:

 def init_db(uri: str = None):
     global _engine, _SessionLocal, DATABASE_URL, DB_URI
+    if _SessionLocal is not None:
+        return
+
     if uri:
         if "mysql://" in uri and "pymysql://" not in uri:
             uri = uri.replace("mysql://", "mysql+pymysql://")
@@ -53,11 +57,27 @@ def init_db(uri: str = None):
             conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}`"))
             conn.commit()

-    # Step 3:
-    _engine = create_engine(
+    # Step 3: Configure main engine with connection pooling and reconnection settings
+    _engine = create_engine(
+        DATABASE_URL,
+        echo=False,
+        pool_pre_ping=True,  # Check connection health before using
+        pool_recycle=3600,  # Recycle connections after 1 hour
+        pool_timeout=30,  # Wait 30 seconds for a connection from the pool
+        pool_size=10,  # Maintain up to 10 connections
+        max_overflow=10,  # Allow up to 10 more connections during spikes
+        connect_args={
+            "connect_timeout": 10,  # 10 second connection timeout
+        },
+    )

-    # Step 4:
-    _SessionLocal = sessionmaker(
+    # Step 4: Configure session factory
+    _SessionLocal = sessionmaker(
+        autocommit=False,
+        autoflush=False,
+        bind=_engine,
+        expire_on_commit=False,  # Prevent detached instance errors
+    )

     # Step 5: Apply Alembic migrations programmatically
     current_dir = os.path.dirname(__file__)  # → lecrapaud/db
@@ -76,8 +96,17 @@ def init_db(uri: str = None):
 # Dependency to get a session instance
 @contextmanager
 def get_db():
+    """Get a database session with automatic connection management.
+
+    Yields:
+        Session: A SQLAlchemy session instance
+
+    The session is automatically committed if no exceptions occur, and rolled back otherwise.
+    The connection is automatically returned to the pool when the session is closed.
+    """
     if _SessionLocal is None:
         init_db()
+
     db = _SessionLocal()
     try:
         yield db