lecrapaud 0.14.7__tar.gz → 0.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

Files changed (46) hide show
  1. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/PKG-INFO +2 -1
  2. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/api.py +8 -0
  3. lecrapaud-0.15.0/lecrapaud/db/models/experiment.py +301 -0
  4. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/feature_engineering.py +1 -1
  5. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/pyproject.toml +2 -1
  6. lecrapaud-0.14.7/lecrapaud/db/models/experiment.py +0 -130
  7. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/LICENSE +0 -0
  8. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/README.md +0 -0
  9. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/__init__.py +0 -0
  10. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/config.py +0 -0
  11. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/__init__.py +0 -0
  12. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/README +0 -0
  13. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/env.py +0 -0
  14. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  15. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  16. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  17. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  18. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  19. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/alembic.ini +0 -0
  20. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/__init__.py +0 -0
  21. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/base.py +0 -0
  22. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/feature.py +0 -0
  23. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/feature_selection.py +0 -0
  24. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  25. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/model.py +0 -0
  26. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/model_selection.py +0 -0
  27. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/model_training.py +0 -0
  28. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/score.py +0 -0
  29. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/target.py +0 -0
  30. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/models/utils.py +0 -0
  31. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/db/session.py +0 -0
  32. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/directories.py +0 -0
  33. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/experiment.py +0 -0
  34. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/feature_selection.py +0 -0
  35. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/integrations/openai_integration.py +0 -0
  36. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/jobs/__init__.py +0 -0
  37. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/jobs/config.py +0 -0
  38. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/jobs/scheduler.py +0 -0
  39. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/jobs/tasks.py +0 -0
  40. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  41. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  42. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  43. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  44. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/model_selection.py +0 -0
  45. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/search_space.py +0 -0
  46. {lecrapaud-0.14.7 → lecrapaud-0.15.0}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lecrapaud
3
- Version: 0.14.7
3
+ Version: 0.15.0
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  Author: Pierre H. Gallet
@@ -19,6 +19,7 @@ Requires-Dist: mlxtend (>=0.23.4)
19
19
  Requires-Dist: numpy (>=2.1.3)
20
20
  Requires-Dist: openai (>=1.88.0)
21
21
  Requires-Dist: pandas (>=2.3.0)
22
+ Requires-Dist: pydantic (>=2.9.2)
22
23
  Requires-Dist: python-dotenv (>=1.1.0)
23
24
  Requires-Dist: scikit-learn (>=1.6.1)
24
25
  Requires-Dist: scipy (<1.14.0)
@@ -88,6 +88,14 @@ class LeCrapaud:
88
88
  """
89
89
  return ExperimentEngine(id=id, **kwargs)
90
90
 
91
+ def get_last_experiement_by_name(self, name: str, **kwargs) -> "ExperimentEngine":
92
+ """Retrieve the last experiment by name."""
93
+ return ExperimentEngine(id=Experiment.get_last_by_name(name).id, **kwargs)
94
+
95
+ def get_best_experiment_by_name(self, name: str,metric: str = "both", **kwargs) -> "ExperimentEngine":
96
+ """Retrieve the best experiment by score."""
97
+ return ExperimentEngine(id=Experiment.get_best_by_score(name=name, metric=metric).id, **kwargs)
98
+
91
99
  def list_experiments(self, limit=1000) -> list["ExperimentEngine"]:
92
100
  """List all experiments in the database."""
93
101
  return [ExperimentEngine(id=exp.id) for exp in Experiment.get_all(limit=limit)]
@@ -0,0 +1,301 @@
1
+ from itertools import chain
2
+ import joblib
3
+
4
+ from sqlalchemy import (
5
+ Column,
6
+ Integer,
7
+ String,
8
+ DateTime,
9
+ Float,
10
+ JSON,
11
+ Table,
12
+ ForeignKey,
13
+ BigInteger,
14
+ TIMESTAMP,
15
+ UniqueConstraint,
16
+ func,
17
+ )
18
+ from sqlalchemy.orm import relationship, aliased
19
+ from sqlalchemy.ext.hybrid import hybrid_property
20
+ from sqlalchemy import func
21
+ from statistics import fmean as mean
22
+ from lecrapaud.db.models.model_selection import ModelSelection
23
+ from lecrapaud.db.models.model_training import ModelTraining
24
+ from lecrapaud.db.models.score import Score
25
+
26
+ from lecrapaud.db.models.base import Base, with_db
27
+ from lecrapaud.db.models.utils import create_association_table
28
+
29
# Join table for the many-to-many relationship between experiments and targets.
lecrapaud_experiment_target_association = create_association_table(
    name="experiment_target_association",
    table1="experiments",
    column1="experiment",
    table2="targets",
    column2="target"
)
37
+
38
+
39
class Experiment(Base):
    """A single experiment run: dataset metadata, split sizes, and the
    feature/model selections produced for it.

    Unique on ``name`` (see ``__table_args__`` below).
    """

    # Surrogate primary key.
    id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
    # Row timestamps, maintained by the database server.
    created_at = Column(
        TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
    )
    updated_at = Column(
        TIMESTAMP(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )
    name = Column(String(50), nullable=False)
    path = Column(String(255))  # we do not have this at creation time
    type = Column(String(50), nullable=False)
    size = Column(Integer, nullable=False)
    # Split sizes; nullable because they are filled in after the split is made.
    train_size = Column(Integer)
    val_size = Column(Integer)

    # Relationships
    model_selections = relationship(
        "ModelSelection",
        back_populates="experiment",
        cascade="all, delete-orphan",
        lazy="selectin",
    )
65
+ @hybrid_property
66
+ def best_rmse(self):
67
+ """Best RMSE score across all model selections and trainings."""
68
+ # Get the minimum RMSE for each model selection
69
+ min_scores = [
70
+ min(
71
+ score.rmse
72
+ for mt in ms.model_trainings
73
+ for score in mt.score
74
+ if score.rmse is not None
75
+ )
76
+ for ms in self.model_selections
77
+ if any(
78
+ score.rmse is not None
79
+ for mt in ms.model_trainings
80
+ for score in mt.score
81
+ )
82
+ ]
83
+ return min(min_scores) if min_scores else None
84
+
85
+ @hybrid_property
86
+ def best_logloss(self):
87
+ """Best LogLoss score across all model selections and trainings."""
88
+ # Get the minimum LogLoss for each model selection
89
+ min_scores = [
90
+ min(
91
+ score.logloss
92
+ for mt in ms.model_trainings
93
+ for score in mt.score
94
+ if score.logloss is not None
95
+ )
96
+ for ms in self.model_selections
97
+ if any(
98
+ score.logloss is not None
99
+ for mt in ms.model_trainings
100
+ for score in mt.score
101
+ )
102
+ ]
103
+ return min(min_scores) if min_scores else None
104
+
105
+ @hybrid_property
106
+ def avg_rmse(self):
107
+ """Average RMSE score across all model selections and trainings."""
108
+ # Get the minimum RMSE for each model selection
109
+ min_scores = [
110
+ min(
111
+ score.rmse
112
+ for mt in ms.model_trainings
113
+ for score in mt.score
114
+ if score.rmse is not None
115
+ )
116
+ for ms in self.model_selections
117
+ if any(
118
+ score.rmse is not None
119
+ for mt in ms.model_trainings
120
+ for score in mt.score
121
+ )
122
+ ]
123
+ return mean(min_scores) if min_scores else None
124
+
125
+ @hybrid_property
126
+ def avg_logloss(self):
127
+ """Average LogLoss score across all model selections and trainings."""
128
+ # Get the minimum LogLoss for each model selection
129
+ min_scores = [
130
+ min(
131
+ score.logloss
132
+ for mt in ms.model_trainings
133
+ for score in mt.score
134
+ if score.logloss is not None
135
+ )
136
+ for ms in self.model_selections
137
+ if any(
138
+ score.logloss is not None
139
+ for mt in ms.model_trainings
140
+ for score in mt.score
141
+ )
142
+ ]
143
+ return mean(min_scores) if min_scores else None
144
+
145
    # Remaining split size and feature-selection hyperparameters.
    test_size = Column(Integer)
    corr_threshold = Column(Float, nullable=False)
    max_features = Column(Integer, nullable=False)
    percentile = Column(Float, nullable=False)
    # Optional grouping metadata for grouped datasets.
    number_of_groups = Column(Integer)
    list_of_groups = Column(JSON)
    # Date boundaries of the full dataset and of each split.
    start_date = Column(DateTime)
    end_date = Column(DateTime)
    train_start_date = Column(DateTime)
    train_end_date = Column(DateTime)
    val_start_date = Column(DateTime)
    val_end_date = Column(DateTime)
    test_start_date = Column(DateTime)
    test_end_date = Column(DateTime)
    # Free-form JSON context attached to the experiment.
    context = Column(JSON)

    feature_selections = relationship(
        "FeatureSelection",
        back_populates="experiment",
        cascade="all, delete-orphan",
        lazy="selectin",
    )

    # Many-to-many with Target via the association table defined above.
    targets = relationship(
        "Target",
        secondary=lecrapaud_experiment_target_association,
        back_populates="experiments",
        lazy="selectin",
    )

    # Experiment names must be unique.
    __table_args__ = (
        UniqueConstraint(
            "name",
            name="uq_experiments_composite",
        ),
    )
181
+
182
+ @classmethod
183
+ @with_db
184
+ def get_last_by_name(cls, name: str, db=None):
185
+ """
186
+ Find the most recently created experiment that contains the given name string.
187
+
188
+ Args:
189
+ session: SQLAlchemy session
190
+ name (str): String to search for in experiment names
191
+
192
+ Returns:
193
+ Experiment or None: The most recent matching experiment or None if not found
194
+ """
195
+ return (
196
+ db.query(cls)
197
+ .filter(cls.name.ilike(f'%{name}%'))
198
+ .order_by(cls.created_at.desc())
199
+ .first()
200
+ )
201
+
202
+ @classmethod
203
+ @with_db
204
+ def get_best_by_score(cls, name: str, metric='both', db=None):
205
+ """
206
+ Find the experiment with the best score based on average RMSE, LogLoss, or both.
207
+
208
+ Args:
209
+ metric (str): 'rmse', 'logloss', or 'both' to determine which score to optimize
210
+ db: SQLAlchemy session
211
+
212
+ Returns:
213
+ Experiment or None: The experiment with the best score or None if not found
214
+ """
215
+ if metric == 'both':
216
+ # Calculate a combined score: average of normalized RMSE and LogLoss
217
+ # This ensures we're comparing apples to apples by normalizing the scores
218
+ experiments = db.query(cls).filter(cls.name.ilike(f'%{name}%')).all()
219
+ if not experiments:
220
+ return None
221
+
222
+ # Get all scores
223
+ rmse_scores = [e.avg_rmse for e in experiments if e.avg_rmse is not None]
224
+ logloss_scores = [e.avg_logloss for e in experiments if e.avg_logloss is not None]
225
+
226
+ if not rmse_scores or not logloss_scores:
227
+ return None
228
+
229
+ # Normalize scores (subtract min and divide by range)
230
+ min_rmse = min(rmse_scores)
231
+ range_rmse = max(rmse_scores) - min_rmse
232
+ min_logloss = min(logloss_scores)
233
+ range_logloss = max(logloss_scores) - min_logloss
234
+
235
+ # Calculate combined score for each experiment
236
+ experiment_scores = []
237
+ for experiment in experiments:
238
+ if experiment.avg_rmse is None or experiment.avg_logloss is None:
239
+ continue
240
+
241
+ # Normalize both scores
242
+ norm_rmse = (experiment.avg_rmse - min_rmse) / range_rmse
243
+ norm_logloss = (experiment.avg_logloss - min_logloss) / range_logloss
244
+
245
+ # Calculate combined score (average of normalized scores)
246
+ combined_score = (norm_rmse + norm_logloss) / 2
247
+ experiment_scores.append((experiment, combined_score))
248
+
249
+ # Sort by combined score (ascending since lower is better)
250
+ experiment_scores.sort(key=lambda x: x[1])
251
+
252
+ return experiment_scores[0][0] if experiment_scores else None
253
+
254
+ # For single metric case (rmse or logloss)
255
+ score_property = cls.avg_rmse if metric == 'rmse' else cls.avg_logloss
256
+
257
+ return (
258
+ db.query(cls)
259
+ .filter(cls.name.ilike(f'%{name}%'), score_property.isnot(None)) # Only consider experiments with scores
260
+ .order_by(score_property)
261
+ .first()
262
+ )
263
+
264
+ def get_features(self, target_number: int):
265
+ targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
266
+ if targets:
267
+ target_id = targets[0].id
268
+ feature_selection = [
269
+ fs for fs in self.feature_selections if fs.target_id == target_id
270
+ ]
271
+ if feature_selection:
272
+ feature_selection = feature_selection[0]
273
+ features = [f.name for f in feature_selection.features]
274
+ return features
275
+
276
+ # fallback to path if no features found
277
+ features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
278
+ return features
279
+
280
+ def get_all_features(self, date_column: str = None, group_column: str = None):
281
+ target_idx = [target.id for target in self.targets]
282
+ _all_features = chain.from_iterable(
283
+ [f.name for f in fs.features]
284
+ for fs in self.feature_selections
285
+ if fs.target_id in target_idx
286
+ )
287
+ _all_features = list(_all_features)
288
+
289
+ # fallback to path if no features found
290
+ if len(_all_features) == 0:
291
+ _all_features = joblib.load(f"{self.path}/preprocessing/all_features.pkl")
292
+
293
+ all_features = []
294
+ if date_column:
295
+ all_features.append(date_column)
296
+ if group_column:
297
+ all_features.append(group_column)
298
+ all_features += _all_features
299
+ all_features = list(dict.fromkeys(all_features))
300
+
301
+ return all_features
@@ -658,7 +658,7 @@ class PreprocessFeature:
658
658
  X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
659
659
 
660
660
  # Try to convert columns to best possible dtypes
661
- X_transformed = X_transformed.convert_ups()
661
+ X_transformed = X_transformed.convert_dtypes()
662
662
 
663
663
  # Insert features in db
664
664
  if save_in_db:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lecrapaud"
3
- version = "0.14.7"
3
+ version = "0.15.0"
4
4
  description = "Framework for machine and deep learning, with regression, classification and time series analysis"
5
5
  authors = [
6
6
  {name = "Pierre H. Gallet"}
@@ -20,6 +20,7 @@ dependencies = [
20
20
  "numpy>=2.1.3",
21
21
  "openai>=1.88.0",
22
22
  "pandas>=2.3.0",
23
+ "pydantic>=2.9.2",
23
24
  "python-dotenv>=1.1.0",
24
25
  "scikit-learn>=1.6.1",
25
26
  "scipy<1.14.0",
@@ -1,130 +0,0 @@
1
- from itertools import chain
2
- import joblib
3
-
4
- from sqlalchemy import (
5
- Column,
6
- Integer,
7
- String,
8
- DateTime,
9
- Float,
10
- JSON,
11
- Table,
12
- ForeignKey,
13
- BigInteger,
14
- TIMESTAMP,
15
- UniqueConstraint,
16
- func,
17
- )
18
- from sqlalchemy.orm import relationship
19
-
20
- from lecrapaud.db.models.base import Base
21
- from lecrapaud.db.models.utils import create_association_table
22
-
23
- # jointures
24
- lecrapaud_experiment_target_association = create_association_table(
25
- name="experiment_target_association",
26
- table1="experiments",
27
- column1="experiment",
28
- table2="targets",
29
- column2="target"
30
- )
31
-
32
-
33
- class Experiment(Base):
34
-
35
- id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
36
- created_at = Column(
37
- TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
38
- )
39
- updated_at = Column(
40
- TIMESTAMP(timezone=True),
41
- server_default=func.now(),
42
- onupdate=func.now(),
43
- nullable=False,
44
- )
45
- name = Column(String(50), nullable=False)
46
- path = Column(String(255)) # we do not have this at creation time
47
- type = Column(String(50), nullable=False)
48
- size = Column(Integer, nullable=False)
49
- train_size = Column(Integer)
50
- val_size = Column(Integer)
51
- test_size = Column(Integer)
52
- corr_threshold = Column(Float, nullable=False)
53
- max_features = Column(Integer, nullable=False)
54
- percentile = Column(Float, nullable=False)
55
- number_of_groups = Column(Integer)
56
- list_of_groups = Column(JSON)
57
- start_date = Column(DateTime)
58
- end_date = Column(DateTime)
59
- train_start_date = Column(DateTime)
60
- train_end_date = Column(DateTime)
61
- val_start_date = Column(DateTime)
62
- val_end_date = Column(DateTime)
63
- test_start_date = Column(DateTime)
64
- test_end_date = Column(DateTime)
65
- context = Column(JSON)
66
-
67
- feature_selections = relationship(
68
- "FeatureSelection",
69
- back_populates="experiment",
70
- cascade="all, delete-orphan",
71
- lazy="selectin",
72
- )
73
- model_selections = relationship(
74
- "ModelSelection",
75
- back_populates="experiment",
76
- cascade="all, delete-orphan",
77
- lazy="selectin",
78
- )
79
- targets = relationship(
80
- "Target",
81
- secondary=lecrapaud_experiment_target_association,
82
- back_populates="experiments",
83
- lazy="selectin",
84
- )
85
-
86
- __table_args__ = (
87
- UniqueConstraint(
88
- "name",
89
- name="uq_experiments_composite",
90
- ),
91
- )
92
-
93
- def get_features(self, target_number: int):
94
- targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
95
- if targets:
96
- target_id = targets[0].id
97
- feature_selection = [
98
- fs for fs in self.feature_selections if fs.target_id == target_id
99
- ]
100
- if feature_selection:
101
- feature_selection = feature_selection[0]
102
- features = [f.name for f in feature_selection.features]
103
- return features
104
-
105
- # fallback to path if no features found
106
- features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
107
- return features
108
-
109
- def get_all_features(self, date_column: str = None, group_column: str = None):
110
- target_idx = [target.id for target in self.targets]
111
- _all_features = chain.from_iterable(
112
- [f.name for f in fs.features]
113
- for fs in self.feature_selections
114
- if fs.target_id in target_idx
115
- )
116
- _all_features = list(_all_features)
117
-
118
- # fallback to path if no features found
119
- if len(_all_features) == 0:
120
- _all_features = joblib.load(f"{self.path}/preprocessing/all_features.pkl")
121
-
122
- all_features = []
123
- if date_column:
124
- all_features.append(date_column)
125
- if group_column:
126
- all_features.append(group_column)
127
- all_features += _all_features
128
- all_features = list(dict.fromkeys(all_features))
129
-
130
- return all_features
File without changes
File without changes