lecrapaud 0.20.0__py3-none-any.whl → 0.20.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. See the release's advisory page for more details.

lecrapaud/api.py CHANGED
@@ -165,6 +165,12 @@ class ExperimentEngine:
165
165
 
166
166
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
167
  """Initialize the experiment engine with either new or existing experiment."""
168
+ # Set all kwargs as instance attributes
169
+ if "models_idx" in kwargs:
170
+ kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
+ for key, value in kwargs.items():
172
+ setattr(self, key, value)
173
+
168
174
  if id:
169
175
  self.experiment = Experiment.get(id)
170
176
  kwargs.update(self.experiment.context)
@@ -180,12 +186,6 @@ class ExperimentEngine:
180
186
  )
181
187
  self.experiment = create_experiment(data=data, **kwargs)
182
188
 
183
- # Set all kwargs as instance attributes
184
- for key, value in kwargs.items():
185
- if key == "models_idx":
186
- value = normalize_models_idx(value)
187
- setattr(self, key, value)
188
-
189
189
  def train(self, data, best_params=None):
190
190
  logger.info("Running training...")
191
191
 
@@ -309,12 +309,8 @@ class ExperimentEngine:
309
309
  def feature_engineering(self, data, for_training=True):
310
310
  app = FeatureEngineeringEngine(
311
311
  data=data,
312
- columns_drop=getattr(self, "columns_drop", []),
313
- columns_boolean=getattr(self, "columns_boolean", []),
314
- columns_date=getattr(self, "columns_date", []),
315
- columns_te_groupby=getattr(self, "columns_te_groupby", []),
316
- columns_te_target=getattr(self, "columns_te_target", []),
317
- for_training=getattr(self, "for_training", True),
312
+ experiment=self.experiment,
313
+ for_training=for_training,
318
314
  )
319
315
  data = app.run()
320
316
  return data
@@ -322,21 +318,7 @@ class ExperimentEngine:
322
318
  def preprocess_feature(self, data, for_training=True):
323
319
  app = PreprocessFeature(
324
320
  data=data,
325
- experiment=getattr(self, "experiment", None),
326
- time_series=getattr(self, "time_series", False),
327
- date_column=getattr(self, "date_column", None),
328
- group_column=getattr(self, "group_column", None),
329
- val_size=getattr(self, "val_size", 0.2),
330
- test_size=getattr(self, "test_size", 0.2),
331
- columns_pca=getattr(self, "columns_pca", []),
332
- pca_temporal=getattr(self, "pca_temporal", []),
333
- pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
334
- columns_onehot=getattr(self, "columns_onehot", []),
335
- columns_binary=getattr(self, "columns_binary", []),
336
- columns_ordinal=getattr(self, "columns_ordinal", []),
337
- columns_frequency=getattr(self, "columns_frequency", []),
338
- target_numbers=getattr(self, "target_numbers", []),
339
- target_clf=getattr(self, "target_clf", []),
321
+ experiment=self.experiment,
340
322
  )
341
323
  if for_training:
342
324
  train, val, test = app.run()
@@ -351,7 +333,6 @@ class ExperimentEngine:
351
333
  train=train,
352
334
  target_number=target_number,
353
335
  experiment=self.experiment,
354
- target_clf=self.target_clf,
355
336
  )
356
337
  app.run()
357
338
  self.experiment = Experiment.get(self.experiment.id)
@@ -368,14 +349,7 @@ class ExperimentEngine:
368
349
  train=train,
369
350
  val=val,
370
351
  test=test,
371
- experiment=getattr(self, "experiment", None),
372
- target_numbers=getattr(self, "target_numbers", []),
373
- target_clf=getattr(self, "target_clf", []),
374
- models_idx=getattr(self, "models_idx", []),
375
- time_series=getattr(self, "time_series", False),
376
- max_timesteps=getattr(self, "max_timesteps", 120),
377
- date_column=getattr(self, "date_column", None),
378
- group_column=getattr(self, "group_column", None),
352
+ experiment=self.experiment,
379
353
  )
380
354
  if for_training:
381
355
  data, reshaped_data = app.run()
@@ -390,25 +364,13 @@ class ExperimentEngine:
390
364
  data=data,
391
365
  reshaped_data=reshaped_data,
392
366
  target_number=target_number,
393
- experiment=getattr(self, "experiment", None),
394
- target_clf=getattr(self, "target_clf", []),
395
- models_idx=getattr(self, "models_idx", []),
396
- time_series=getattr(self, "time_series", False),
397
- date_column=getattr(self, "date_column", None),
398
- group_column=getattr(self, "group_column", None),
399
- target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
367
+ experiment=self.experiment,
400
368
  )
401
369
  if best_params and target_number not in best_params.keys():
402
370
  raise ValueError(
403
371
  f"Target {target_number} not found in best_params passed as argument"
404
372
  )
405
373
  app.run(
406
- self.experiment_name,
407
- perform_hyperopt=self.perform_hyperopt,
408
- number_of_trials=self.number_of_trials,
409
- perform_crossval=self.perform_crossval,
410
- plot=self.plot,
411
- preserve_model=self.preserve_model,
412
374
  best_params=best_params[target_number] if best_params else None,
413
375
  )
414
376
 
lecrapaud/config.py CHANGED
@@ -32,6 +32,7 @@ DB_URI: str = (
32
32
  )
33
33
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
34
  LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
35
- LECRAPAUD_LOCAL = os.getenv("LECRAPAUD_LOCAL", False)
36
35
  LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
37
- LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "ray").lower()
36
+ LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
37
+ "LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
38
+ ).lower()
@@ -0,0 +1,42 @@
1
+ """add number_of_targets and remove other fields from experiments
2
+
3
+ Revision ID: 0a8fb7826e9b
4
+ Revises: 033e0f7eca4f
5
+ Create Date: 2025-10-28 20:06:54.792631
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ from sqlalchemy.dialects import mysql
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = '0a8fb7826e9b'
16
+ down_revision: Union[str, None] = '033e0f7eca4f'
17
+ branch_labels: Union[str, Sequence[str], None] = None
18
+ depends_on: Union[str, Sequence[str], None] = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ # ### commands auto generated by Alembic - please adjust! ###
23
+ op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
+ op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
+ op.drop_column('lecrapaud_experiments', 'max_features')
26
+ op.drop_column('lecrapaud_experiments', 'percentile')
27
+ op.drop_column('lecrapaud_experiments', 'type')
28
+ op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
+ op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
30
+ # ### end Alembic commands ###
31
+
32
+
33
+ def downgrade() -> None:
34
+ # ### commands auto generated by Alembic - please adjust! ###
35
+ op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
+ op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
+ op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
+ op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
+ op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
+ op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
+ op.drop_column('lecrapaud_experiments', 'number_of_targets')
42
+ # ### end Alembic commands ###
@@ -50,10 +50,43 @@ class Experiment(Base):
50
50
  )
51
51
  name = Column(String(255), nullable=False)
52
52
  path = Column(String(255)) # we do not have this at creation time
53
- type = Column(String(50), nullable=False)
54
53
  size = Column(Integer, nullable=False)
55
54
  train_size = Column(Integer)
56
55
  val_size = Column(Integer)
56
+ test_size = Column(Integer)
57
+ number_of_groups = Column(Integer)
58
+ list_of_groups = Column(JSON)
59
+ number_of_targets = Column(Integer)
60
+ start_date = Column(DateTime)
61
+ end_date = Column(DateTime)
62
+ train_start_date = Column(DateTime)
63
+ train_end_date = Column(DateTime)
64
+ val_start_date = Column(DateTime)
65
+ val_end_date = Column(DateTime)
66
+ test_start_date = Column(DateTime)
67
+ test_end_date = Column(DateTime)
68
+ context = Column(JSON)
69
+
70
+ feature_selections = relationship(
71
+ "FeatureSelection",
72
+ back_populates="experiment",
73
+ cascade="all, delete-orphan",
74
+ lazy="selectin",
75
+ )
76
+
77
+ targets = relationship(
78
+ "Target",
79
+ secondary=lecrapaud_experiment_target_association,
80
+ back_populates="experiments",
81
+ lazy="selectin",
82
+ )
83
+
84
+ __table_args__ = (
85
+ UniqueConstraint(
86
+ "name",
87
+ name="uq_experiments_composite",
88
+ ),
89
+ )
57
90
 
58
91
  # Relationships
59
92
  model_selections = relationship(
@@ -68,16 +101,9 @@ class Experiment(Base):
68
101
  """Best RMSE score across all model selections and trainings."""
69
102
  # Get the minimum RMSE for each model selection
70
103
  min_scores = [
71
- min(
72
- mss.rmse
73
- for mss in ms.model_selection_scores
74
- if mss.rmse is not None
75
- )
104
+ min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
76
105
  for ms in self.model_selections
77
- if any(
78
- mss.rmse is not None
79
- for mss in ms.model_selection_scores
80
- )
106
+ if any(mss.rmse is not None for mss in ms.model_selection_scores)
81
107
  ]
82
108
  return min(min_scores) if min_scores else None
83
109
 
@@ -92,10 +118,7 @@ class Experiment(Base):
92
118
  if mss.logloss is not None
93
119
  )
94
120
  for ms in self.model_selections
95
- if any(
96
- mss.logloss is not None
97
- for mss in ms.model_selection_scores
98
- )
121
+ if any(mss.logloss is not None for mss in ms.model_selection_scores)
99
122
  ]
100
123
  return min(min_scores) if min_scores else None
101
124
 
@@ -104,16 +127,9 @@ class Experiment(Base):
104
127
  """Average RMSE score across all model selections and trainings."""
105
128
  # Get the minimum RMSE for each model selection
106
129
  min_scores = [
107
- min(
108
- mss.rmse
109
- for mss in ms.model_selection_scores
110
- if mss.rmse is not None
111
- )
130
+ min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
112
131
  for ms in self.model_selections
113
- if any(
114
- mss.rmse is not None
115
- for mss in ms.model_selection_scores
116
- )
132
+ if any(mss.rmse is not None for mss in ms.model_selection_scores)
117
133
  ]
118
134
  return mean(min_scores) if min_scores else None
119
135
 
@@ -128,50 +144,10 @@ class Experiment(Base):
128
144
  if mss.logloss is not None
129
145
  )
130
146
  for ms in self.model_selections
131
- if any(
132
- mss.logloss is not None
133
- for mss in ms.model_selection_scores
134
- )
147
+ if any(mss.logloss is not None for mss in ms.model_selection_scores)
135
148
  ]
136
149
  return mean(min_scores) if min_scores else None
137
150
 
138
- test_size = Column(Integer)
139
- corr_threshold = Column(Float, nullable=False)
140
- max_features = Column(Integer, nullable=False)
141
- percentile = Column(Float, nullable=False)
142
- number_of_groups = Column(Integer)
143
- list_of_groups = Column(JSON)
144
- start_date = Column(DateTime)
145
- end_date = Column(DateTime)
146
- train_start_date = Column(DateTime)
147
- train_end_date = Column(DateTime)
148
- val_start_date = Column(DateTime)
149
- val_end_date = Column(DateTime)
150
- test_start_date = Column(DateTime)
151
- test_end_date = Column(DateTime)
152
- context = Column(JSON)
153
-
154
- feature_selections = relationship(
155
- "FeatureSelection",
156
- back_populates="experiment",
157
- cascade="all, delete-orphan",
158
- lazy="selectin",
159
- )
160
-
161
- targets = relationship(
162
- "Target",
163
- secondary=lecrapaud_experiment_target_association,
164
- back_populates="experiments",
165
- lazy="selectin",
166
- )
167
-
168
- __table_args__ = (
169
- UniqueConstraint(
170
- "name",
171
- name="uq_experiments_composite",
172
- ),
173
- )
174
-
175
151
  @classmethod
176
152
  @with_db
177
153
  def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
@@ -354,19 +330,18 @@ class Experiment(Base):
354
330
 
355
331
  # Get the best model score based on lowest logloss or rmse
356
332
  model_scores = best_model_selection.model_selection_scores
357
-
333
+
358
334
  # Determine if we should use logloss or rmse based on what's available
359
335
  if any(ms.logloss is not None for ms in model_scores):
360
336
  # Classification: find lowest logloss
361
337
  best_score = min(
362
338
  (ms for ms in model_scores if ms.logloss is not None),
363
- key=lambda x: x.logloss
339
+ key=lambda x: x.logloss,
364
340
  )
365
341
  elif any(ms.rmse is not None for ms in model_scores):
366
342
  # Regression: find lowest rmse
367
343
  best_score = min(
368
- (ms for ms in model_scores if ms.rmse is not None),
369
- key=lambda x: x.rmse
344
+ (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
370
345
  )
371
346
  else:
372
347
  return {
@@ -398,12 +373,8 @@ class Experiment(Base):
398
373
 
399
374
  # Get the model info
400
375
  model_info = {
401
- "model_type": (
402
- score.model.model_type if score.model else "unknown"
403
- ),
404
- "model_name": (
405
- score.model.name if score.model else "unknown"
406
- ),
376
+ "model_type": (score.model.model_type if score.model else "unknown"),
377
+ "model_name": (score.model.name if score.model else "unknown"),
407
378
  "training_time_seconds": score.training_time,
408
379
  }
409
380
 
@@ -434,7 +405,9 @@ class Experiment(Base):
434
405
  return features
435
406
 
436
407
  @with_db
437
- def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
408
+ def get_all_features(
409
+ self, date_column: str = None, group_column: str = None, db=None
410
+ ):
438
411
  # Ensure we have a fresh instance attached to the session
439
412
  self = db.merge(self)
440
413
  target_idx = [target.id for target in self.targets]
lecrapaud/experiment.py CHANGED
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  import pandas as pd
5
5
  import joblib
6
+ from datetime import datetime
6
7
 
7
8
  # Set up coverage file path
8
9
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
@@ -15,18 +16,18 @@ from lecrapaud.db.session import get_db
15
16
 
16
17
  def create_experiment(
17
18
  data: pd.DataFrame | str,
18
- corr_threshold,
19
- percentile,
20
- max_features,
21
- date_column,
22
- group_column,
23
19
  experiment_name,
20
+ date_column=None,
21
+ group_column=None,
24
22
  **kwargs,
25
23
  ):
26
24
  if isinstance(data, str):
27
25
  path = f"{data}/data/full.pkl"
28
26
  data = joblib.load(path)
29
27
 
28
+ if kwargs.get("time_series") and not date_column:
29
+ raise ValueError("date_column must be provided for time series experiments")
30
+
30
31
  dates = {}
31
32
  if date_column:
32
33
  dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
@@ -42,7 +43,10 @@ def create_experiment(
42
43
  targets = [
43
44
  target for target in all_targets if target.name in data.columns.str.upper()
44
45
  ]
45
- experiment_name = f"{experiment_name}_{groups["number_of_groups"] if group_column else 'ng'}_{corr_threshold}_{percentile}_{max_features}_{dates['start_date'].date() if date_column else 'nd'}_{dates['end_date'].date() if date_column else 'nd'}"
46
+ experiment_name = (
47
+ f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
48
+ )
49
+ number_of_targets = len(targets)
46
50
 
47
51
  experiment_dir = f"{tmp_dir}/{experiment_name}"
48
52
  preprocessing_dir = f"{experiment_dir}/preprocessing"
@@ -55,26 +59,20 @@ def create_experiment(
55
59
  db=db,
56
60
  name=experiment_name,
57
61
  path=Path(experiment_dir).resolve(),
58
- type="training",
59
62
  size=data.shape[0],
60
- corr_threshold=corr_threshold,
61
- percentile=percentile,
62
- max_features=max_features,
63
+ number_of_targets=number_of_targets,
63
64
  **groups,
64
65
  **dates,
65
66
  context={
66
- "corr_threshold": corr_threshold,
67
- "percentile": percentile,
68
- "max_features": max_features,
69
67
  "date_column": date_column,
70
68
  "group_column": group_column,
71
69
  "experiment_name": experiment_name,
72
70
  **kwargs,
73
71
  },
74
72
  )
75
-
73
+
76
74
  # Set targets relationship after creation/update
77
75
  experiment.targets = targets
78
76
  experiment.save(db=db)
79
-
77
+
80
78
  return experiment
@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
87
87
  def __init__(
88
88
  self,
89
89
  data: pd.DataFrame,
90
- columns_drop: list[str] = [],
91
- columns_boolean: list[str] = [],
92
- columns_date: list[str] = [],
93
- columns_te_groupby: list[str] = [],
94
- columns_te_target: list[str] = [],
90
+ experiment,
95
91
  for_training: bool = True,
96
92
  **kwargs,
97
93
  ):
98
94
  self.data = data
99
- self.columns_drop = columns_drop
100
- self.columns_boolean = columns_boolean
101
- self.columns_date = columns_date
102
- self.columns_te_groupby = columns_te_groupby
103
- self.columns_te_target = columns_te_target
95
+ self.experiment = experiment
104
96
  self.for_training = for_training
97
+
98
+ # Get all parameters from experiment context
99
+ self.columns_drop = self.experiment.context.get("columns_drop", [])
100
+ self.columns_boolean = self.experiment.context.get("columns_boolean", [])
101
+ self.columns_date = self.experiment.context.get("columns_date", [])
102
+ self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
103
+ self.columns_te_target = self.experiment.context.get("columns_te_target", [])
105
104
 
106
105
  def run(self) -> pd.DataFrame:
107
106
  # drop columns
@@ -316,41 +315,30 @@ class PreprocessFeature:
316
315
  self,
317
316
  data: pd.DataFrame,
318
317
  experiment,
319
- time_series: bool = False,
320
- date_column: str | None = None,
321
- group_column: str | None = None,
322
- val_size: float = 0.2,
323
- test_size: float = 0.2,
324
- columns_pca: list[str] = [],
325
- pca_temporal: list[dict[str, list[str]]] = [],
326
- pca_cross_sectional: list[dict[str, list[str]]] = [],
327
- columns_onehot: list[str] = [],
328
- columns_binary: list[str] = [],
329
- columns_ordinal: list[str] = [],
330
- columns_frequency: list[str] = [],
331
- target_numbers: list = [],
332
- target_clf: list = [],
333
318
  **kwargs,
334
319
  ):
335
320
  self.data = data
336
321
  self.data.columns = self.data.columns.str.upper()
337
-
338
322
  self.experiment = experiment
339
- self.columns_pca = [col.upper() for col in columns_pca]
340
- self.pca_temporal = pca_temporal
341
- self.pca_cross_sectional = pca_cross_sectional
342
- self.columns_onehot = [col.upper() for col in columns_onehot]
343
- self.columns_binary = [col.upper() for col in columns_binary]
344
- self.columns_ordinal = [col.upper() for col in columns_ordinal]
345
- self.columns_frequency = [col.upper() for col in columns_frequency]
346
- self.target_numbers = target_numbers
347
- self.target_clf = target_clf
348
-
349
- self.time_series = time_series
350
- self.date_column = date_column
351
- self.group_column = group_column
352
- self.val_size = val_size
353
- self.test_size = test_size
323
+
324
+ # Get all parameters from experiment context
325
+ context = self.experiment.context
326
+ self.time_series = context.get("time_series", False)
327
+ self.date_column = context.get("date_column", None)
328
+ self.group_column = context.get("group_column", None)
329
+ self.val_size = context.get("val_size", 0.2)
330
+ self.test_size = context.get("test_size", 0.2)
331
+ self.target_numbers = context.get("target_numbers", [])
332
+ self.target_clf = context.get("target_clf", [])
333
+
334
+ # Handle list parameters with uppercase conversion
335
+ self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
+ self.pca_temporal = context.get("pca_temporal", [])
337
+ self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
+ self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
+ self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
+ self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
+ self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
354
342
 
355
343
  self.experiment_dir = self.experiment.path
356
344
  self.experiment_id = self.experiment.id