lecrapaud 0.14.8__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

Files changed (46) hide show
  1. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/PKG-INFO +2 -1
  2. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/api.py +19 -2
  3. lecrapaud-0.16.0/lecrapaud/db/models/experiment.py +328 -0
  4. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/pyproject.toml +2 -1
  5. lecrapaud-0.14.8/lecrapaud/db/models/experiment.py +0 -130
  6. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/LICENSE +0 -0
  7. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/README.md +0 -0
  8. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/__init__.py +0 -0
  9. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/config.py +0 -0
  10. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/__init__.py +0 -0
  11. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/README +0 -0
  12. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/env.py +0 -0
  13. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  14. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  15. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  16. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  17. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  18. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/alembic.ini +0 -0
  19. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/__init__.py +0 -0
  20. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/base.py +0 -0
  21. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/feature.py +0 -0
  22. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/feature_selection.py +0 -0
  23. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  24. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/model.py +0 -0
  25. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/model_selection.py +0 -0
  26. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/model_training.py +0 -0
  27. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/score.py +0 -0
  28. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/target.py +0 -0
  29. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/models/utils.py +0 -0
  30. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/db/session.py +0 -0
  31. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/directories.py +0 -0
  32. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/experiment.py +0 -0
  33. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/feature_engineering.py +0 -0
  34. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/feature_selection.py +0 -0
  35. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/integrations/openai_integration.py +0 -0
  36. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/jobs/__init__.py +0 -0
  37. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/jobs/config.py +0 -0
  38. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/jobs/scheduler.py +0 -0
  39. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/jobs/tasks.py +0 -0
  40. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  41. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  42. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  43. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  44. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/model_selection.py +0 -0
  45. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/search_space.py +0 -0
  46. {lecrapaud-0.14.8 → lecrapaud-0.16.0}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lecrapaud
3
- Version: 0.14.8
3
+ Version: 0.16.0
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  Author: Pierre H. Gallet
@@ -19,6 +19,7 @@ Requires-Dist: mlxtend (>=0.23.4)
19
19
  Requires-Dist: numpy (>=2.1.3)
20
20
  Requires-Dist: openai (>=1.88.0)
21
21
  Requires-Dist: pandas (>=2.3.0)
22
+ Requires-Dist: pydantic (>=2.9.2)
22
23
  Requires-Dist: python-dotenv (>=1.1.0)
23
24
  Requires-Dist: scikit-learn (>=1.6.1)
24
25
  Requires-Dist: scipy (<1.14.0)
@@ -88,9 +88,26 @@ class LeCrapaud:
88
88
  """
89
89
  return ExperimentEngine(id=id, **kwargs)
90
90
 
91
- def list_experiments(self, limit=1000) -> list["ExperimentEngine"]:
91
def get_last_experiement_by_name(self, name: str, **kwargs) -> "ExperimentEngine":
    """Retrieve the most recently created experiment whose name contains ``name``.

    Args:
        name: Substring looked up in experiment names.
        **kwargs: Extra keyword arguments forwarded to ``ExperimentEngine``.

    Returns:
        An ``ExperimentEngine`` built from the matching experiment's id.

    Raises:
        ValueError: If no experiment matches ``name``.
    """
    experiment = Experiment.get_last_by_name(name)
    # The lookup yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on ``None.id``.
    if experiment is None:
        raise ValueError(f"No experiment found with a name containing {name!r}")
    return ExperimentEngine(id=experiment.id, **kwargs)
94
+
95
def get_best_experiment_by_name(
    self, name: str, metric: str = "both", **kwargs
) -> "ExperimentEngine":
    """Retrieve the best-scoring experiment whose name contains ``name``.

    Args:
        name: Substring looked up in experiment names.
        metric: Score used for ranking, forwarded to
            ``Experiment.get_best_by_score`` ("rmse", "logloss" or "both").
        **kwargs: Extra keyword arguments forwarded to ``ExperimentEngine``.

    Returns:
        An ``ExperimentEngine`` built from the best experiment's id.

    Raises:
        ValueError: If no scored experiment matches ``name``.
    """
    experiment = Experiment.get_best_by_score(name=name, metric=metric)
    # The lookup yields None when nothing matches or nothing is scored; fail
    # with a clear message instead of an AttributeError on ``None.id``.
    if experiment is None:
        raise ValueError(
            f"No scored experiment found with a name containing {name!r} "
            f"(metric={metric!r})"
        )
    return ExperimentEngine(id=experiment.id, **kwargs)
102
+
103
def list_experiments(
    self, name: str | None = None, limit: int = 1000
) -> list["ExperimentEngine"]:
    """List experiments from the database, optionally filtered by name.

    Args:
        name: When given, only experiments whose name contains this string
            are returned; ``None`` lists every experiment.
        limit: Maximum number of experiments returned; ``None`` disables the cap.

    Returns:
        One ``ExperimentEngine`` per matching experiment, in the order
        returned by ``Experiment.get_all_by_name``.
    """
    experiments = Experiment.get_all_by_name(name=name)
    # The cap is applied here rather than forwarded as a keyword, so this
    # method does not depend on ``get_all_by_name`` accepting ``limit``.
    return [ExperimentEngine(id=exp.id) for exp in experiments[:limit]]
94
111
 
95
112
 
96
113
  class ExperimentEngine:
@@ -0,0 +1,328 @@
1
+ from itertools import chain
2
+ import joblib
3
+
4
+ from sqlalchemy import (
5
+ Column,
6
+ Integer,
7
+ String,
8
+ DateTime,
9
+ Float,
10
+ JSON,
11
+ Table,
12
+ ForeignKey,
13
+ BigInteger,
14
+ TIMESTAMP,
15
+ UniqueConstraint,
16
+ func,
17
+ )
18
+ from sqlalchemy.orm import relationship, aliased
19
+ from sqlalchemy.ext.hybrid import hybrid_property
20
+ from sqlalchemy import func
21
+ from statistics import fmean as mean
22
+ from lecrapaud.db.models.model_selection import ModelSelection
23
+ from lecrapaud.db.models.model_training import ModelTraining
24
+ from lecrapaud.db.models.score import Score
25
+
26
+ from lecrapaud.db.models.base import Base, with_db
27
+ from lecrapaud.db.models.utils import create_association_table
28
+
29
# Join (association) table linking experiments and targets; it is used as the
# ``secondary`` table of the ``Experiment.targets`` relationship.
lecrapaud_experiment_target_association = create_association_table(
    name="experiment_target_association",
    table1="experiments",
    column1="experiment",
    table2="targets",
    column2="target",
)
37
+
38
+
39
class Experiment(Base):
    """ORM model describing one experiment with its selections and targets.

    Besides the persisted columns, the class exposes score summaries computed
    in Python from the loaded ``model_selections`` (``best_*`` / ``avg_*``)
    and query helpers to look experiments up by name or by score.
    """

    id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
    created_at = Column(
        TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
    )
    updated_at = Column(
        TIMESTAMP(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )
    name = Column(String(50), nullable=False)
    path = Column(String(255))  # we do not have this at creation time
    type = Column(String(50), nullable=False)
    size = Column(Integer, nullable=False)
    train_size = Column(Integer)
    val_size = Column(Integer)
    test_size = Column(Integer)
    corr_threshold = Column(Float, nullable=False)
    max_features = Column(Integer, nullable=False)
    percentile = Column(Float, nullable=False)
    number_of_groups = Column(Integer)
    list_of_groups = Column(JSON)
    start_date = Column(DateTime)
    end_date = Column(DateTime)
    train_start_date = Column(DateTime)
    train_end_date = Column(DateTime)
    val_start_date = Column(DateTime)
    val_end_date = Column(DateTime)
    test_start_date = Column(DateTime)
    test_end_date = Column(DateTime)
    context = Column(JSON)

    # Relationships
    model_selections = relationship(
        "ModelSelection",
        back_populates="experiment",
        cascade="all, delete-orphan",
        lazy="selectin",
    )

    feature_selections = relationship(
        "FeatureSelection",
        back_populates="experiment",
        cascade="all, delete-orphan",
        lazy="selectin",
    )

    targets = relationship(
        "Target",
        secondary=lecrapaud_experiment_target_association,
        back_populates="experiments",
        lazy="selectin",
    )

    __table_args__ = (
        UniqueConstraint(
            "name",
            name="uq_experiments_composite",
        ),
    )

    # The score summaries below are computed in Python from loaded
    # relationships. No SQL expression is defined for them, so they are meant
    # to be read on instances, not used inside ``filter()`` / ``order_by()``.

    @hybrid_property
    def best_rmse(self):
        """Lowest RMSE over all model selections, or ``None`` if none is scored."""
        minima = _min_scores_per_selection(self, "rmse")
        return min(minima) if minima else None

    @hybrid_property
    def best_logloss(self):
        """Lowest LogLoss over all model selections, or ``None`` if none is scored."""
        minima = _min_scores_per_selection(self, "logloss")
        return min(minima) if minima else None

    @hybrid_property
    def avg_rmse(self):
        """Mean of each model selection's lowest RMSE, or ``None`` if none is scored."""
        minima = _min_scores_per_selection(self, "rmse")
        return mean(minima) if minima else None

    @hybrid_property
    def avg_logloss(self):
        """Mean of each model selection's lowest LogLoss, or ``None`` if none is scored."""
        minima = _min_scores_per_selection(self, "logloss")
        return mean(minima) if minima else None

    @classmethod
    @with_db
    def get_all_by_name(cls, name: str | None = None, db=None, *, limit: int | None = None):
        """
        List experiments, most recently created first.

        Args:
            name (str | None): When given, keep only experiments whose name
                contains this string (case-insensitive). ``%`` and ``_`` in
                ``name`` act as SQL LIKE wildcards.
            db: SQLAlchemy session (presumably injected by ``with_db`` when
                omitted; confirm against its definition)
            limit (int | None): Maximum number of rows to return; ``None``
                returns every match

        Returns:
            list[Experiment]: Matching experiments ordered by ``created_at``
            descending (empty list if nothing matches)
        """
        query = db.query(cls)
        if name is not None:
            query = query.filter(cls.name.ilike(f"%{name}%"))
        query = query.order_by(cls.created_at.desc())
        if limit is not None:
            query = query.limit(limit)
        return query.all()

    @classmethod
    @with_db
    def get_last_by_name(cls, name: str, db=None):
        """
        Find the most recently created experiment that contains the given name string.

        Args:
            name (str): String to search for in experiment names
                (case-insensitive; ``%`` and ``_`` act as SQL LIKE wildcards)
            db: SQLAlchemy session

        Returns:
            Experiment or None: The most recent matching experiment or None if not found
        """
        return (
            db.query(cls)
            .filter(cls.name.ilike(f"%{name}%"))
            .order_by(cls.created_at.desc())
            .first()
        )

    @classmethod
    @with_db
    def get_best_by_score(cls, name: str, metric="both", db=None):
        """
        Find the experiment with the best (lowest) average RMSE, LogLoss, or both.

        Scores are read from the Python-side ``avg_rmse`` / ``avg_logloss``
        properties, so ranking is done in Python on the matching experiments.

        Args:
            name (str): String to search for in experiment names
            metric (str): 'rmse', 'logloss', or 'both' to determine which score
                to optimize; any value other than 'rmse' and 'both' is treated
                as 'logloss'
            db: SQLAlchemy session

        Returns:
            Experiment or None: The experiment with the best score or None if not found
        """
        candidates = db.query(cls).filter(cls.name.ilike(f"%{name}%")).all()
        if not candidates:
            return None

        if metric == "both":
            # Read each property once; every access walks all related scores.
            scored = [(exp, exp.avg_rmse, exp.avg_logloss) for exp in candidates]
            rmse_values = [rmse for _, rmse, _ in scored if rmse is not None]
            logloss_values = [ll for _, _, ll in scored if ll is not None]
            if not rmse_values or not logloss_values:
                return None

            # Min-max normalisation puts both metrics on a comparable 0..1 scale.
            min_rmse = min(rmse_values)
            range_rmse = max(rmse_values) - min_rmse
            min_logloss = min(logloss_values)
            range_logloss = max(logloss_values) - min_logloss

            best, best_score = None, None
            for exp, rmse, logloss in scored:
                if rmse is None or logloss is None:
                    continue
                combined = (
                    _normalize(rmse, min_rmse, range_rmse)
                    + _normalize(logloss, min_logloss, range_logloss)
                ) / 2
                # Strict comparison keeps the first experiment on ties.
                if best_score is None or combined < best_score:
                    best, best_score = exp, combined
            return best

        # Single metric case (rmse or logloss): lower is better.
        attribute = "avg_rmse" if metric == "rmse" else "avg_logloss"
        scored = [(exp, getattr(exp, attribute)) for exp in candidates]
        scored = [(exp, value) for exp, value in scored if value is not None]
        if not scored:
            return None
        return min(scored, key=lambda pair: pair[1])[0]

    def get_features(self, target_number: int):
        """Return the feature names selected for ``TARGET_{target_number}``.

        The names come from the experiment's feature selection for that target
        when one exists; otherwise they are loaded from
        ``{path}/TARGET_{target_number}/features.pkl``.
        """
        targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
        if targets:
            target_id = targets[0].id
            matching = [
                fs for fs in self.feature_selections if fs.target_id == target_id
            ]
            if matching:
                return [f.name for f in matching[0].features]

        # fallback to path if no features found
        # NOTE: joblib.load unpickles the file; only use trusted experiment paths.
        return joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")

    def get_all_features(self, date_column: str = None, group_column: str = None):
        """Return the de-duplicated feature names used across all targets.

        ``date_column`` and ``group_column``, when given, are placed first.
        If no feature selection provides any feature, the list is loaded from
        ``{path}/preprocessing/all_features.pkl``.
        """
        target_ids = {target.id for target in self.targets}
        selected = list(
            chain.from_iterable(
                [f.name for f in fs.features]
                for fs in self.feature_selections
                if fs.target_id in target_ids
            )
        )

        # fallback to path if no features found
        if len(selected) == 0:
            # NOTE: joblib.load unpickles the file; only use trusted experiment paths.
            selected = joblib.load(f"{self.path}/preprocessing/all_features.pkl")

        all_features = []
        if date_column:
            all_features.append(date_column)
        if group_column:
            all_features.append(group_column)
        all_features += selected

        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(all_features))


def _min_scores_per_selection(experiment, metric: str) -> list:
    """Return the lowest non-null ``metric`` score of each model selection.

    Model selections without any non-null score for ``metric`` are left out.
    """
    minima = []
    for selection in experiment.model_selections:
        values = [
            getattr(score, metric)
            for training in selection.model_trainings
            for score in training.score
            if getattr(score, metric) is not None
        ]
        if values:
            minima.append(min(values))
    return minima


def _normalize(value, lowest, spread):
    """Min-max scale ``value``; yields 0.0 when ``spread`` is zero (all values equal)."""
    return (value - lowest) / spread if spread else 0.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lecrapaud"
3
- version = "0.14.8"
3
+ version = "0.16.0"
4
4
  description = "Framework for machine and deep learning, with regression, classification and time series analysis"
5
5
  authors = [
6
6
  {name = "Pierre H. Gallet"}
@@ -20,6 +20,7 @@ dependencies = [
20
20
  "numpy>=2.1.3",
21
21
  "openai>=1.88.0",
22
22
  "pandas>=2.3.0",
23
+ "pydantic>=2.9.2",
23
24
  "python-dotenv>=1.1.0",
24
25
  "scikit-learn>=1.6.1",
25
26
  "scipy<1.14.0",
@@ -1,130 +0,0 @@
1
- from itertools import chain
2
- import joblib
3
-
4
- from sqlalchemy import (
5
- Column,
6
- Integer,
7
- String,
8
- DateTime,
9
- Float,
10
- JSON,
11
- Table,
12
- ForeignKey,
13
- BigInteger,
14
- TIMESTAMP,
15
- UniqueConstraint,
16
- func,
17
- )
18
- from sqlalchemy.orm import relationship
19
-
20
- from lecrapaud.db.models.base import Base
21
- from lecrapaud.db.models.utils import create_association_table
22
-
23
- # jointures
24
- lecrapaud_experiment_target_association = create_association_table(
25
- name="experiment_target_association",
26
- table1="experiments",
27
- column1="experiment",
28
- table2="targets",
29
- column2="target"
30
- )
31
-
32
-
33
- class Experiment(Base):
34
-
35
- id = Column(BigInteger, primary_key=True, index=True, autoincrement=True)
36
- created_at = Column(
37
- TIMESTAMP(timezone=True), server_default=func.now(), nullable=False
38
- )
39
- updated_at = Column(
40
- TIMESTAMP(timezone=True),
41
- server_default=func.now(),
42
- onupdate=func.now(),
43
- nullable=False,
44
- )
45
- name = Column(String(50), nullable=False)
46
- path = Column(String(255)) # we do not have this at creation time
47
- type = Column(String(50), nullable=False)
48
- size = Column(Integer, nullable=False)
49
- train_size = Column(Integer)
50
- val_size = Column(Integer)
51
- test_size = Column(Integer)
52
- corr_threshold = Column(Float, nullable=False)
53
- max_features = Column(Integer, nullable=False)
54
- percentile = Column(Float, nullable=False)
55
- number_of_groups = Column(Integer)
56
- list_of_groups = Column(JSON)
57
- start_date = Column(DateTime)
58
- end_date = Column(DateTime)
59
- train_start_date = Column(DateTime)
60
- train_end_date = Column(DateTime)
61
- val_start_date = Column(DateTime)
62
- val_end_date = Column(DateTime)
63
- test_start_date = Column(DateTime)
64
- test_end_date = Column(DateTime)
65
- context = Column(JSON)
66
-
67
- feature_selections = relationship(
68
- "FeatureSelection",
69
- back_populates="experiment",
70
- cascade="all, delete-orphan",
71
- lazy="selectin",
72
- )
73
- model_selections = relationship(
74
- "ModelSelection",
75
- back_populates="experiment",
76
- cascade="all, delete-orphan",
77
- lazy="selectin",
78
- )
79
- targets = relationship(
80
- "Target",
81
- secondary=lecrapaud_experiment_target_association,
82
- back_populates="experiments",
83
- lazy="selectin",
84
- )
85
-
86
- __table_args__ = (
87
- UniqueConstraint(
88
- "name",
89
- name="uq_experiments_composite",
90
- ),
91
- )
92
-
93
- def get_features(self, target_number: int):
94
- targets = [t for t in self.targets if t.name == f"TARGET_{target_number}"]
95
- if targets:
96
- target_id = targets[0].id
97
- feature_selection = [
98
- fs for fs in self.feature_selections if fs.target_id == target_id
99
- ]
100
- if feature_selection:
101
- feature_selection = feature_selection[0]
102
- features = [f.name for f in feature_selection.features]
103
- return features
104
-
105
- # fallback to path if no features found
106
- features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
107
- return features
108
-
109
- def get_all_features(self, date_column: str = None, group_column: str = None):
110
- target_idx = [target.id for target in self.targets]
111
- _all_features = chain.from_iterable(
112
- [f.name for f in fs.features]
113
- for fs in self.feature_selections
114
- if fs.target_id in target_idx
115
- )
116
- _all_features = list(_all_features)
117
-
118
- # fallback to path if no features found
119
- if len(_all_features) == 0:
120
- _all_features = joblib.load(f"{self.path}/preprocessing/all_features.pkl")
121
-
122
- all_features = []
123
- if date_column:
124
- all_features.append(date_column)
125
- if group_column:
126
- all_features.append(group_column)
127
- all_features += _all_features
128
- all_features = list(dict.fromkeys(all_features))
129
-
130
- return all_features
File without changes
File without changes