lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/experiment.py CHANGED
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  import pandas as pd
5
5
  import joblib
6
+ from datetime import datetime
6
7
 
7
8
  # Set up coverage file path
8
9
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
@@ -15,17 +16,44 @@ from lecrapaud.db.session import get_db
15
16
 
16
17
  def create_experiment(
17
18
  data: pd.DataFrame | str,
18
- corr_threshold,
19
- percentile,
20
- max_features,
21
- date_column,
22
- group_column,
23
19
  experiment_name,
20
+ date_column=None,
21
+ group_column=None,
24
22
  **kwargs,
25
23
  ):
24
+ if "target_numbers" not in kwargs or "target_clf" not in kwargs:
25
+ raise ValueError(
26
+ "You should specify context in kwargs to create experiment from folder. Especially, target_clf and target_numbers must be present"
27
+ )
28
+
29
+ # if data is a path, load from path
30
+ # only works locally as we do not save full.pkl outside development env
26
31
  if isinstance(data, str):
27
32
  path = f"{data}/data/full.pkl"
28
33
  data = joblib.load(path)
34
+ keys = kwargs.keys()
35
+ date_column = kwargs["date_column"] if "date_column" in keys else None
36
+ group_column = keys["group_column"] if "group_column" in keys else None
37
+ targets = []
38
+ for target_number in kwargs["target_numbers"]:
39
+ target_name = f"TARGET_{target_number}"
40
+ target_type = (
41
+ "classification"
42
+ if target_number in kwargs["target_clf"]
43
+ else "regression"
44
+ )
45
+ targets.append({"name": target_name, "type": target_type})
46
+ Target.bulk_upsert(targets)
47
+ else:
48
+ experiment_name = (
49
+ f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
50
+ )
51
+
52
+ if kwargs.get("time_series") and not date_column:
53
+ raise ValueError("date_column must be provided for time series experiments")
54
+
55
+ if experiment_name is None:
56
+ raise ValueError("experiment_name must be provided")
29
57
 
30
58
  dates = {}
31
59
  if date_column:
@@ -35,14 +63,16 @@ def create_experiment(
35
63
  groups = {}
36
64
  if group_column:
37
65
  groups["number_of_groups"] = data[group_column].nunique()
38
- groups["list_of_groups"] = data[group_column].unique().tolist().sort()
66
+ groups["list_of_groups"] = sorted(data[group_column].unique().tolist())
39
67
 
40
68
  with get_db() as db:
41
69
  all_targets = Target.get_all(db=db)
42
70
  targets = [
43
- target for target in all_targets if target.name in data.columns.str.upper()
71
+ target
72
+ for target in all_targets
73
+ if int(target.name.split("_")[-1]) in kwargs["target_numbers"]
44
74
  ]
45
- experiment_name = f"{experiment_name}_{groups["number_of_groups"] if group_column else 'ng'}_{corr_threshold}_{percentile}_{max_features}_{dates['start_date'].date() if date_column else 'nd'}_{dates['end_date'].date() if date_column else 'nd'}"
75
+ number_of_targets = len(targets)
46
76
 
47
77
  experiment_dir = f"{tmp_dir}/{experiment_name}"
48
78
  preprocessing_dir = f"{experiment_dir}/preprocessing"
@@ -50,23 +80,16 @@ def create_experiment(
50
80
  os.makedirs(preprocessing_dir, exist_ok=True)
51
81
  os.makedirs(data_dir, exist_ok=True)
52
82
 
83
+ # Create or update experiment (without targets relation)
53
84
  experiment = Experiment.upsert(
54
- match_fields=["name"],
55
85
  db=db,
56
86
  name=experiment_name,
57
87
  path=Path(experiment_dir).resolve(),
58
- type="training",
59
88
  size=data.shape[0],
60
- corr_threshold=corr_threshold,
61
- percentile=percentile,
62
- max_features=max_features,
89
+ number_of_targets=number_of_targets,
63
90
  **groups,
64
91
  **dates,
65
- targets=targets,
66
92
  context={
67
- "corr_threshold": corr_threshold,
68
- "percentile": percentile,
69
- "max_features": max_features,
70
93
  "date_column": date_column,
71
94
  "group_column": group_column,
72
95
  "experiment_name": experiment_name,
@@ -74,4 +97,8 @@ def create_experiment(
74
97
  },
75
98
  )
76
99
 
100
+ # Set targets relationship after creation/update
101
+ experiment.targets = targets
102
+ experiment.save(db=db)
103
+
77
104
  return experiment