lecrapaud-0.19.2-py3-none-any.whl → lecrapaud-0.20.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- lecrapaud/api.py +3 -0
- lecrapaud/config.py +1 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +103 -65
- lecrapaud/db/models/experiment.py +53 -46
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +1 -0
- lecrapaud/experiment.py +7 -4
- lecrapaud/feature_engineering.py +6 -9
- lecrapaud/feature_selection.py +0 -1
- lecrapaud/model_selection.py +478 -170
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +22 -2
- {lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/METADATA +1 -1
- {lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/RECORD +22 -21
- lecrapaud/db/models/model_training.py +0 -64
- {lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/WHEEL +0 -0
- {lecrapaud-0.19.2.dist-info → lecrapaud-0.20.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -20,8 +20,7 @@ from sqlalchemy.ext.hybrid import hybrid_property
|
|
|
20
20
|
from sqlalchemy import func
|
|
21
21
|
from statistics import fmean as mean
|
|
22
22
|
from lecrapaud.db.models.model_selection import ModelSelection
|
|
23
|
-
from lecrapaud.db.models.
|
|
24
|
-
from lecrapaud.db.models.score import Score
|
|
23
|
+
from lecrapaud.db.models.model_selection_score import ModelSelectionScore
|
|
25
24
|
|
|
26
25
|
from lecrapaud.db.models.base import Base, with_db
|
|
27
26
|
from lecrapaud.db.models.utils import create_association_table
|
|
@@ -70,16 +69,14 @@ class Experiment(Base):
|
|
|
70
69
|
# Get the minimum RMSE for each model selection
|
|
71
70
|
min_scores = [
|
|
72
71
|
min(
|
|
73
|
-
|
|
74
|
-
for
|
|
75
|
-
|
|
76
|
-
if score.rmse is not None
|
|
72
|
+
mss.rmse
|
|
73
|
+
for mss in ms.model_selection_scores
|
|
74
|
+
if mss.rmse is not None
|
|
77
75
|
)
|
|
78
76
|
for ms in self.model_selections
|
|
79
77
|
if any(
|
|
80
|
-
|
|
81
|
-
for
|
|
82
|
-
for score in mt.score
|
|
78
|
+
mss.rmse is not None
|
|
79
|
+
for mss in ms.model_selection_scores
|
|
83
80
|
)
|
|
84
81
|
]
|
|
85
82
|
return min(min_scores) if min_scores else None
|
|
@@ -90,16 +87,14 @@ class Experiment(Base):
|
|
|
90
87
|
# Get the minimum LogLoss for each model selection
|
|
91
88
|
min_scores = [
|
|
92
89
|
min(
|
|
93
|
-
|
|
94
|
-
for
|
|
95
|
-
|
|
96
|
-
if score.logloss is not None
|
|
90
|
+
mss.logloss
|
|
91
|
+
for mss in ms.model_selection_scores
|
|
92
|
+
if mss.logloss is not None
|
|
97
93
|
)
|
|
98
94
|
for ms in self.model_selections
|
|
99
95
|
if any(
|
|
100
|
-
|
|
101
|
-
for
|
|
102
|
-
for score in mt.score
|
|
96
|
+
mss.logloss is not None
|
|
97
|
+
for mss in ms.model_selection_scores
|
|
103
98
|
)
|
|
104
99
|
]
|
|
105
100
|
return min(min_scores) if min_scores else None
|
|
@@ -110,16 +105,14 @@ class Experiment(Base):
|
|
|
110
105
|
# Get the minimum RMSE for each model selection
|
|
111
106
|
min_scores = [
|
|
112
107
|
min(
|
|
113
|
-
|
|
114
|
-
for
|
|
115
|
-
|
|
116
|
-
if score.rmse is not None
|
|
108
|
+
mss.rmse
|
|
109
|
+
for mss in ms.model_selection_scores
|
|
110
|
+
if mss.rmse is not None
|
|
117
111
|
)
|
|
118
112
|
for ms in self.model_selections
|
|
119
113
|
if any(
|
|
120
|
-
|
|
121
|
-
for
|
|
122
|
-
for score in mt.score
|
|
114
|
+
mss.rmse is not None
|
|
115
|
+
for mss in ms.model_selection_scores
|
|
123
116
|
)
|
|
124
117
|
]
|
|
125
118
|
return mean(min_scores) if min_scores else None
|
|
@@ -130,16 +123,14 @@ class Experiment(Base):
|
|
|
130
123
|
# Get the minimum LogLoss for each model selection
|
|
131
124
|
min_scores = [
|
|
132
125
|
min(
|
|
133
|
-
|
|
134
|
-
for
|
|
135
|
-
|
|
136
|
-
if score.logloss is not None
|
|
126
|
+
mss.logloss
|
|
127
|
+
for mss in ms.model_selection_scores
|
|
128
|
+
if mss.logloss is not None
|
|
137
129
|
)
|
|
138
130
|
for ms in self.model_selections
|
|
139
131
|
if any(
|
|
140
|
-
|
|
141
|
-
for
|
|
142
|
-
for score in mt.score
|
|
132
|
+
mss.logloss is not None
|
|
133
|
+
for mss in ms.model_selection_scores
|
|
143
134
|
)
|
|
144
135
|
]
|
|
145
136
|
return mean(min_scores) if min_scores else None
|
|
@@ -353,7 +344,7 @@ class Experiment(Base):
|
|
|
353
344
|
(ms for ms in self.model_selections if ms.target_id == target.id), None
|
|
354
345
|
)
|
|
355
346
|
|
|
356
|
-
if not best_model_selection or not best_model_selection.
|
|
347
|
+
if not best_model_selection or not best_model_selection.model_selection_scores:
|
|
357
348
|
return {
|
|
358
349
|
"experiment_name": self.name,
|
|
359
350
|
"target_number": target_number,
|
|
@@ -361,22 +352,32 @@ class Experiment(Base):
|
|
|
361
352
|
"scores": {},
|
|
362
353
|
}
|
|
363
354
|
|
|
364
|
-
# Get the best model
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
#
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
355
|
+
# Get the best model score based on lowest logloss or rmse
|
|
356
|
+
model_scores = best_model_selection.model_selection_scores
|
|
357
|
+
|
|
358
|
+
# Determine if we should use logloss or rmse based on what's available
|
|
359
|
+
if any(ms.logloss is not None for ms in model_scores):
|
|
360
|
+
# Classification: find lowest logloss
|
|
361
|
+
best_score = min(
|
|
362
|
+
(ms for ms in model_scores if ms.logloss is not None),
|
|
363
|
+
key=lambda x: x.logloss
|
|
364
|
+
)
|
|
365
|
+
elif any(ms.rmse is not None for ms in model_scores):
|
|
366
|
+
# Regression: find lowest rmse
|
|
367
|
+
best_score = min(
|
|
368
|
+
(ms for ms in model_scores if ms.rmse is not None),
|
|
369
|
+
key=lambda x: x.rmse
|
|
370
|
+
)
|
|
371
|
+
else:
|
|
371
372
|
return {
|
|
372
373
|
"experiment_name": self.name,
|
|
373
374
|
"target_number": target_number,
|
|
374
|
-
"error": "No
|
|
375
|
+
"error": "No scores found for the best model",
|
|
375
376
|
"scores": {},
|
|
376
377
|
}
|
|
377
378
|
|
|
378
|
-
#
|
|
379
|
-
score =
|
|
379
|
+
# Use the best score found
|
|
380
|
+
score = best_score
|
|
380
381
|
available_metrics = [
|
|
381
382
|
"rmse",
|
|
382
383
|
"mae",
|
|
@@ -398,12 +399,12 @@ class Experiment(Base):
|
|
|
398
399
|
# Get the model info
|
|
399
400
|
model_info = {
|
|
400
401
|
"model_type": (
|
|
401
|
-
|
|
402
|
+
score.model.model_type if score.model else "unknown"
|
|
402
403
|
),
|
|
403
404
|
"model_name": (
|
|
404
|
-
|
|
405
|
+
score.model.name if score.model else "unknown"
|
|
405
406
|
),
|
|
406
|
-
"training_time_seconds":
|
|
407
|
+
"training_time_seconds": score.training_time,
|
|
407
408
|
}
|
|
408
409
|
|
|
409
410
|
return {
|
|
@@ -413,7 +414,10 @@ class Experiment(Base):
|
|
|
413
414
|
"scores": scores,
|
|
414
415
|
}
|
|
415
416
|
|
|
416
|
-
|
|
417
|
+
@with_db
|
|
418
|
+
def get_features(self, target_number: int, db=None):
|
|
419
|
+
# Ensure we have a fresh instance attached to the session
|
|
420
|
+
self = db.merge(self)
|
|
417
421
|
targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
|
|
418
422
|
if targets:
|
|
419
423
|
target_id = targets[0].id
|
|
@@ -429,7 +433,10 @@ class Experiment(Base):
|
|
|
429
433
|
features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
|
|
430
434
|
return features
|
|
431
435
|
|
|
432
|
-
|
|
436
|
+
@with_db
|
|
437
|
+
def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
|
|
438
|
+
# Ensure we have a fresh instance attached to the session
|
|
439
|
+
self = db.merge(self)
|
|
433
440
|
target_idx = [target.id for target in self.targets]
|
|
434
441
|
_all_features = chain.from_iterable(
|
|
435
442
|
[f.name for f in fs.features]
|
|
@@ -65,21 +65,3 @@ class FeatureSelectionRank(Base):
|
|
|
65
65
|
name="uq_feature_selection_rank_composite",
|
|
66
66
|
),
|
|
67
67
|
)
|
|
68
|
-
|
|
69
|
-
@classmethod
|
|
70
|
-
@with_db
|
|
71
|
-
def bulk_upsert(cls, rows, db=None):
|
|
72
|
-
stmt = insert(cls).values(rows)
|
|
73
|
-
|
|
74
|
-
update_fields = {
|
|
75
|
-
key: stmt.inserted[key]
|
|
76
|
-
for key in rows[0]
|
|
77
|
-
if key not in ("feature_selection_id", "feature_id", "method")
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
stmt = stmt.on_duplicate_key_update(**update_fields)
|
|
81
|
-
|
|
82
|
-
db.execute(stmt)
|
|
83
|
-
db.commit()
|
|
84
|
-
|
|
85
|
-
return len(rows)
|
|
@@ -54,8 +54,8 @@ class ModelSelection(Base):
|
|
|
54
54
|
)
|
|
55
55
|
|
|
56
56
|
best_model = relationship("Model", lazy="selectin")
|
|
57
|
-
|
|
58
|
-
"
|
|
57
|
+
model_selection_scores = relationship(
|
|
58
|
+
"ModelSelectionScore",
|
|
59
59
|
back_populates="model_selection",
|
|
60
60
|
cascade="all, delete-orphan",
|
|
61
61
|
lazy="selectin",
|
|
@@ -3,10 +3,11 @@ from sqlalchemy import (
|
|
|
3
3
|
Integer,
|
|
4
4
|
String,
|
|
5
5
|
Float,
|
|
6
|
+
JSON,
|
|
6
7
|
ForeignKey,
|
|
7
8
|
BigInteger,
|
|
8
9
|
TIMESTAMP,
|
|
9
|
-
|
|
10
|
+
UniqueConstraint,
|
|
10
11
|
)
|
|
11
12
|
from sqlalchemy import func
|
|
12
13
|
from sqlalchemy.orm import relationship
|
|
@@ -14,7 +15,9 @@ from lecrapaud.db.models.base import Base
|
|
|
14
15
|
from lecrapaud.config import LECRAPAUD_TABLE_PREFIX
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
class
|
|
18
|
+
class ModelSelectionScore(Base):
|
|
19
|
+
__tablename__ = f"{LECRAPAUD_TABLE_PREFIX}_model_selection_scores"
|
|
20
|
+
|
|
18
21
|
id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
|
|
19
22
|
created_at = Column(
|
|
20
23
|
TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
|
|
@@ -25,10 +28,21 @@ class Score(Base):
|
|
|
25
28
|
onupdate=func.now(),
|
|
26
29
|
nullable=False,
|
|
27
30
|
)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
|
|
32
|
+
# From ModelTraining
|
|
33
|
+
best_params = Column(JSON)
|
|
34
|
+
model_path = Column(String(255))
|
|
31
35
|
training_time = Column(Integer)
|
|
36
|
+
model_id = Column(
|
|
37
|
+
BigInteger, ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_models.id"), nullable=False
|
|
38
|
+
)
|
|
39
|
+
model_selection_id = Column(
|
|
40
|
+
BigInteger,
|
|
41
|
+
ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_selections.id", ondelete="CASCADE"),
|
|
42
|
+
nullable=False,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# From Score (excluding type and training_time which is already in ModelTraining)
|
|
32
46
|
eval_data_std = Column(Float)
|
|
33
47
|
rmse = Column(Float)
|
|
34
48
|
rmse_std_ratio = Column(Float)
|
|
@@ -50,12 +64,15 @@ class Score(Base):
|
|
|
50
64
|
precision_at_threshold = Column(Float)
|
|
51
65
|
recall_at_threshold = Column(Float)
|
|
52
66
|
f1_at_threshold = Column(Float)
|
|
53
|
-
model_training_id = Column(
|
|
54
|
-
BigInteger,
|
|
55
|
-
ForeignKey(f"{LECRAPAUD_TABLE_PREFIX}_model_trainings.id", ondelete="CASCADE"),
|
|
56
|
-
nullable=False,
|
|
57
|
-
)
|
|
58
67
|
|
|
59
|
-
|
|
60
|
-
|
|
68
|
+
# Relationships
|
|
69
|
+
model = relationship("Model", lazy="selectin")
|
|
70
|
+
model_selection = relationship(
|
|
71
|
+
"ModelSelection", back_populates="model_selection_scores", lazy="selectin"
|
|
61
72
|
)
|
|
73
|
+
|
|
74
|
+
__table_args__ = (
|
|
75
|
+
UniqueConstraint(
|
|
76
|
+
"model_id", "model_selection_id", name="uq_model_selection_score_composite"
|
|
77
|
+
),
|
|
78
|
+
)
|
lecrapaud/db/session.py
CHANGED
lecrapaud/experiment.py
CHANGED
|
@@ -35,7 +35,7 @@ def create_experiment(
|
|
|
35
35
|
groups = {}
|
|
36
36
|
if group_column:
|
|
37
37
|
groups["number_of_groups"] = data[group_column].nunique()
|
|
38
|
-
groups["list_of_groups"] = data[group_column].unique().tolist()
|
|
38
|
+
groups["list_of_groups"] = sorted(data[group_column].unique().tolist())
|
|
39
39
|
|
|
40
40
|
with get_db() as db:
|
|
41
41
|
all_targets = Target.get_all(db=db)
|
|
@@ -50,8 +50,8 @@ def create_experiment(
|
|
|
50
50
|
os.makedirs(preprocessing_dir, exist_ok=True)
|
|
51
51
|
os.makedirs(data_dir, exist_ok=True)
|
|
52
52
|
|
|
53
|
+
# Create or update experiment (without targets relation)
|
|
53
54
|
experiment = Experiment.upsert(
|
|
54
|
-
match_fields=["name"],
|
|
55
55
|
db=db,
|
|
56
56
|
name=experiment_name,
|
|
57
57
|
path=Path(experiment_dir).resolve(),
|
|
@@ -62,7 +62,6 @@ def create_experiment(
|
|
|
62
62
|
max_features=max_features,
|
|
63
63
|
**groups,
|
|
64
64
|
**dates,
|
|
65
|
-
targets=targets,
|
|
66
65
|
context={
|
|
67
66
|
"corr_threshold": corr_threshold,
|
|
68
67
|
"percentile": percentile,
|
|
@@ -73,5 +72,9 @@ def create_experiment(
|
|
|
73
72
|
**kwargs,
|
|
74
73
|
},
|
|
75
74
|
)
|
|
76
|
-
|
|
75
|
+
|
|
76
|
+
# Set targets relationship after creation/update
|
|
77
|
+
experiment.targets = targets
|
|
78
|
+
experiment.save(db=db)
|
|
79
|
+
|
|
77
80
|
return experiment
|
lecrapaud/feature_engineering.py
CHANGED
|
@@ -483,8 +483,8 @@ class PreprocessFeature:
|
|
|
483
483
|
f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
|
|
484
484
|
)
|
|
485
485
|
|
|
486
|
-
|
|
487
|
-
|
|
486
|
+
# Update existing experiment with sizes and dates
|
|
487
|
+
Experiment.update(
|
|
488
488
|
id=self.experiment_id,
|
|
489
489
|
train_size=len(train),
|
|
490
490
|
val_size=len(val),
|
|
@@ -545,8 +545,8 @@ class PreprocessFeature:
|
|
|
545
545
|
for name, data in zip(["train", "val", "test"], [train, val, test]):
|
|
546
546
|
logger.info(f"{data.shape} {name} data")
|
|
547
547
|
|
|
548
|
-
|
|
549
|
-
|
|
548
|
+
# Update existing experiment with sizes
|
|
549
|
+
Experiment.update(
|
|
550
550
|
id=self.experiment_id,
|
|
551
551
|
train_size=len(train),
|
|
552
552
|
val_size=len(val),
|
|
@@ -838,8 +838,7 @@ class PreprocessFeature:
|
|
|
838
838
|
|
|
839
839
|
# Upsert features in bulk if we have any features
|
|
840
840
|
if all_feature_names:
|
|
841
|
-
Feature.
|
|
842
|
-
match_fields=["name"],
|
|
841
|
+
Feature.bulk_upsert(
|
|
843
842
|
name=all_feature_names,
|
|
844
843
|
type=all_feature_types,
|
|
845
844
|
)
|
|
@@ -855,9 +854,7 @@ class PreprocessFeature:
|
|
|
855
854
|
for target in target_names
|
|
856
855
|
]
|
|
857
856
|
|
|
858
|
-
Target.
|
|
859
|
-
match_fields=["name"], name=target_names, type=target_types
|
|
860
|
-
)
|
|
857
|
+
Target.bulk_upsert(name=target_names, type=target_types)
|
|
861
858
|
|
|
862
859
|
# Get all the upserted objects
|
|
863
860
|
targets = Target.filter(name__in=target_names)
|
lecrapaud/feature_selection.py
CHANGED