lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
- lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
- lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +122 -67
- lecrapaud/db/models/experiment.py +196 -183
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
- lecrapaud/db/session.py +33 -4
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +725 -249
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +38 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.18.7.dist-info/METADATA +0 -248
- lecrapaud-0.18.7.dist-info/RECORD +0 -46
lecrapaud/experiment.py
CHANGED
|
@@ -3,6 +3,7 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import joblib
|
|
6
|
+
from datetime import datetime
|
|
6
7
|
|
|
7
8
|
# Set up coverage file path
|
|
8
9
|
os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
|
|
@@ -15,17 +16,44 @@ from lecrapaud.db.session import get_db
|
|
|
15
16
|
|
|
16
17
|
def create_experiment(
|
|
17
18
|
data: pd.DataFrame | str,
|
|
18
|
-
corr_threshold,
|
|
19
|
-
percentile,
|
|
20
|
-
max_features,
|
|
21
|
-
date_column,
|
|
22
|
-
group_column,
|
|
23
19
|
experiment_name,
|
|
20
|
+
date_column=None,
|
|
21
|
+
group_column=None,
|
|
24
22
|
**kwargs,
|
|
25
23
|
):
|
|
24
|
+
if "target_numbers" not in kwargs or "target_clf" not in kwargs:
|
|
25
|
+
raise ValueError(
|
|
26
|
+
"You should specify context in kwargs to create experiment from folder. Especially, target_clf and target_numbers must be present"
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# if data is a path, load from path
|
|
30
|
+
# only works locally as we do not save full.pkl outside development env
|
|
26
31
|
if isinstance(data, str):
|
|
27
32
|
path = f"{data}/data/full.pkl"
|
|
28
33
|
data = joblib.load(path)
|
|
34
|
+
keys = kwargs.keys()
|
|
35
|
+
date_column = kwargs["date_column"] if "date_column" in keys else None
|
|
36
|
+
group_column = keys["group_column"] if "group_column" in keys else None
|
|
37
|
+
targets = []
|
|
38
|
+
for target_number in kwargs["target_numbers"]:
|
|
39
|
+
target_name = f"TARGET_{target_number}"
|
|
40
|
+
target_type = (
|
|
41
|
+
"classification"
|
|
42
|
+
if target_number in kwargs["target_clf"]
|
|
43
|
+
else "regression"
|
|
44
|
+
)
|
|
45
|
+
targets.append({"name": target_name, "type": target_type})
|
|
46
|
+
Target.bulk_upsert(targets)
|
|
47
|
+
else:
|
|
48
|
+
experiment_name = (
|
|
49
|
+
f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
if kwargs.get("time_series") and not date_column:
|
|
53
|
+
raise ValueError("date_column must be provided for time series experiments")
|
|
54
|
+
|
|
55
|
+
if experiment_name is None:
|
|
56
|
+
raise ValueError("experiment_name must be provided")
|
|
29
57
|
|
|
30
58
|
dates = {}
|
|
31
59
|
if date_column:
|
|
@@ -35,14 +63,16 @@ def create_experiment(
|
|
|
35
63
|
groups = {}
|
|
36
64
|
if group_column:
|
|
37
65
|
groups["number_of_groups"] = data[group_column].nunique()
|
|
38
|
-
groups["list_of_groups"] = data[group_column].unique().tolist()
|
|
66
|
+
groups["list_of_groups"] = sorted(data[group_column].unique().tolist())
|
|
39
67
|
|
|
40
68
|
with get_db() as db:
|
|
41
69
|
all_targets = Target.get_all(db=db)
|
|
42
70
|
targets = [
|
|
43
|
-
target
|
|
71
|
+
target
|
|
72
|
+
for target in all_targets
|
|
73
|
+
if int(target.name.split("_")[-1]) in kwargs["target_numbers"]
|
|
44
74
|
]
|
|
45
|
-
|
|
75
|
+
number_of_targets = len(targets)
|
|
46
76
|
|
|
47
77
|
experiment_dir = f"{tmp_dir}/{experiment_name}"
|
|
48
78
|
preprocessing_dir = f"{experiment_dir}/preprocessing"
|
|
@@ -50,23 +80,16 @@ def create_experiment(
|
|
|
50
80
|
os.makedirs(preprocessing_dir, exist_ok=True)
|
|
51
81
|
os.makedirs(data_dir, exist_ok=True)
|
|
52
82
|
|
|
83
|
+
# Create or update experiment (without targets relation)
|
|
53
84
|
experiment = Experiment.upsert(
|
|
54
|
-
match_fields=["name"],
|
|
55
85
|
db=db,
|
|
56
86
|
name=experiment_name,
|
|
57
87
|
path=Path(experiment_dir).resolve(),
|
|
58
|
-
type="training",
|
|
59
88
|
size=data.shape[0],
|
|
60
|
-
|
|
61
|
-
percentile=percentile,
|
|
62
|
-
max_features=max_features,
|
|
89
|
+
number_of_targets=number_of_targets,
|
|
63
90
|
**groups,
|
|
64
91
|
**dates,
|
|
65
|
-
targets=targets,
|
|
66
92
|
context={
|
|
67
|
-
"corr_threshold": corr_threshold,
|
|
68
|
-
"percentile": percentile,
|
|
69
|
-
"max_features": max_features,
|
|
70
93
|
"date_column": date_column,
|
|
71
94
|
"group_column": group_column,
|
|
72
95
|
"experiment_name": experiment_name,
|
|
@@ -74,4 +97,8 @@ def create_experiment(
|
|
|
74
97
|
},
|
|
75
98
|
)
|
|
76
99
|
|
|
100
|
+
# Set targets relationship after creation/update
|
|
101
|
+
experiment.targets = targets
|
|
102
|
+
experiment.save(db=db)
|
|
103
|
+
|
|
77
104
|
return experiment
|