lecrapaud 0.20.0__tar.gz → 0.20.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

Files changed (50)
  1. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/PKG-INFO +1 -1
  2. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/api.py +11 -49
  3. lecrapaud-0.20.1/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +42 -0
  4. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/experiment.py +48 -75
  5. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/experiment.py +8 -13
  6. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/feature_engineering.py +28 -40
  7. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/feature_selection.py +90 -21
  8. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/model_selection.py +24 -30
  9. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/pyproject.toml +1 -1
  10. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/LICENSE +0 -0
  11. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/README.md +0 -0
  12. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/__init__.py +0 -0
  13. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/config.py +0 -0
  14. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/__init__.py +0 -0
  15. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/README +0 -0
  16. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/env.py +0 -0
  17. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/script.py.mako +0 -0
  18. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  19. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  20. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  21. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  22. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  23. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
  24. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
  25. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
  26. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
  27. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/alembic.ini +0 -0
  28. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/__init__.py +0 -0
  29. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/base.py +0 -0
  30. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/feature.py +0 -0
  31. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/feature_selection.py +0 -0
  32. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  33. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/model.py +0 -0
  34. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/model_selection.py +0 -0
  35. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/model_selection_score.py +0 -0
  36. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/target.py +0 -0
  37. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/models/utils.py +0 -0
  38. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/db/session.py +0 -0
  39. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/directories.py +0 -0
  40. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/integrations/openai_integration.py +0 -0
  41. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/jobs/__init__.py +0 -0
  42. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/jobs/config.py +0 -0
  43. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/jobs/scheduler.py +0 -0
  44. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/jobs/tasks.py +0 -0
  45. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  46. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  47. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  48. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  49. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/search_space.py +0 -0
  50. {lecrapaud-0.20.0 → lecrapaud-0.20.1}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lecrapaud
3
- Version: 0.20.0
3
+ Version: 0.20.1
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  License-File: LICENSE
@@ -165,6 +165,12 @@ class ExperimentEngine:
165
165
 
166
166
  def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
167
167
  """Initialize the experiment engine with either new or existing experiment."""
168
+ # Set all kwargs as instance attributes
169
+ if "models_idx" in kwargs:
170
+ kwargs["models_idx"] = normalize_models_idx(kwargs["models_idx"])
171
+ for key, value in kwargs.items():
172
+ setattr(self, key, value)
173
+
168
174
  if id:
169
175
  self.experiment = Experiment.get(id)
170
176
  kwargs.update(self.experiment.context)
@@ -180,12 +186,6 @@ class ExperimentEngine:
180
186
  )
181
187
  self.experiment = create_experiment(data=data, **kwargs)
182
188
 
183
- # Set all kwargs as instance attributes
184
- for key, value in kwargs.items():
185
- if key == "models_idx":
186
- value = normalize_models_idx(value)
187
- setattr(self, key, value)
188
-
189
189
  def train(self, data, best_params=None):
190
190
  logger.info("Running training...")
191
191
 
@@ -309,12 +309,8 @@ class ExperimentEngine:
309
309
  def feature_engineering(self, data, for_training=True):
310
310
  app = FeatureEngineeringEngine(
311
311
  data=data,
312
- columns_drop=getattr(self, "columns_drop", []),
313
- columns_boolean=getattr(self, "columns_boolean", []),
314
- columns_date=getattr(self, "columns_date", []),
315
- columns_te_groupby=getattr(self, "columns_te_groupby", []),
316
- columns_te_target=getattr(self, "columns_te_target", []),
317
- for_training=getattr(self, "for_training", True),
312
+ experiment=self.experiment,
313
+ for_training=for_training,
318
314
  )
319
315
  data = app.run()
320
316
  return data
@@ -322,21 +318,7 @@ class ExperimentEngine:
322
318
  def preprocess_feature(self, data, for_training=True):
323
319
  app = PreprocessFeature(
324
320
  data=data,
325
- experiment=getattr(self, "experiment", None),
326
- time_series=getattr(self, "time_series", False),
327
- date_column=getattr(self, "date_column", None),
328
- group_column=getattr(self, "group_column", None),
329
- val_size=getattr(self, "val_size", 0.2),
330
- test_size=getattr(self, "test_size", 0.2),
331
- columns_pca=getattr(self, "columns_pca", []),
332
- pca_temporal=getattr(self, "pca_temporal", []),
333
- pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
334
- columns_onehot=getattr(self, "columns_onehot", []),
335
- columns_binary=getattr(self, "columns_binary", []),
336
- columns_ordinal=getattr(self, "columns_ordinal", []),
337
- columns_frequency=getattr(self, "columns_frequency", []),
338
- target_numbers=getattr(self, "target_numbers", []),
339
- target_clf=getattr(self, "target_clf", []),
321
+ experiment=self.experiment,
340
322
  )
341
323
  if for_training:
342
324
  train, val, test = app.run()
@@ -351,7 +333,6 @@ class ExperimentEngine:
351
333
  train=train,
352
334
  target_number=target_number,
353
335
  experiment=self.experiment,
354
- target_clf=self.target_clf,
355
336
  )
356
337
  app.run()
357
338
  self.experiment = Experiment.get(self.experiment.id)
@@ -368,14 +349,7 @@ class ExperimentEngine:
368
349
  train=train,
369
350
  val=val,
370
351
  test=test,
371
- experiment=getattr(self, "experiment", None),
372
- target_numbers=getattr(self, "target_numbers", []),
373
- target_clf=getattr(self, "target_clf", []),
374
- models_idx=getattr(self, "models_idx", []),
375
- time_series=getattr(self, "time_series", False),
376
- max_timesteps=getattr(self, "max_timesteps", 120),
377
- date_column=getattr(self, "date_column", None),
378
- group_column=getattr(self, "group_column", None),
352
+ experiment=self.experiment,
379
353
  )
380
354
  if for_training:
381
355
  data, reshaped_data = app.run()
@@ -390,25 +364,13 @@ class ExperimentEngine:
390
364
  data=data,
391
365
  reshaped_data=reshaped_data,
392
366
  target_number=target_number,
393
- experiment=getattr(self, "experiment", None),
394
- target_clf=getattr(self, "target_clf", []),
395
- models_idx=getattr(self, "models_idx", []),
396
- time_series=getattr(self, "time_series", False),
397
- date_column=getattr(self, "date_column", None),
398
- group_column=getattr(self, "group_column", None),
399
- target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
367
+ experiment=self.experiment,
400
368
  )
401
369
  if best_params and target_number not in best_params.keys():
402
370
  raise ValueError(
403
371
  f"Target {target_number} not found in best_params passed as argument"
404
372
  )
405
373
  app.run(
406
- self.experiment_name,
407
- perform_hyperopt=self.perform_hyperopt,
408
- number_of_trials=self.number_of_trials,
409
- perform_crossval=self.perform_crossval,
410
- plot=self.plot,
411
- preserve_model=self.preserve_model,
412
374
  best_params=best_params[target_number] if best_params else None,
413
375
  )
414
376
 
@@ -0,0 +1,42 @@
1
+ """add number_of_targets and remove other fields from experiments
2
+
3
+ Revision ID: 0a8fb7826e9b
4
+ Revises: 033e0f7eca4f
5
+ Create Date: 2025-10-28 20:06:54.792631
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ from sqlalchemy.dialects import mysql
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = '0a8fb7826e9b'
16
+ down_revision: Union[str, None] = '033e0f7eca4f'
17
+ branch_labels: Union[str, Sequence[str], None] = None
18
+ depends_on: Union[str, Sequence[str], None] = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ # ### commands auto generated by Alembic - please adjust! ###
23
+ op.add_column('lecrapaud_experiments', sa.Column('number_of_targets', sa.Integer(), nullable=True))
24
+ op.drop_column('lecrapaud_experiments', 'corr_threshold')
25
+ op.drop_column('lecrapaud_experiments', 'max_features')
26
+ op.drop_column('lecrapaud_experiments', 'percentile')
27
+ op.drop_column('lecrapaud_experiments', 'type')
28
+ op.drop_index(op.f('ix_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
29
+ op.create_index(op.f('ix_lecrapaud_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
30
+ # ### end Alembic commands ###
31
+
32
+
33
+ def downgrade() -> None:
34
+ # ### commands auto generated by Alembic - please adjust! ###
35
+ op.drop_index(op.f('ix_lecrapaud_model_selection_scores_id'), table_name='lecrapaud_model_selection_scores')
36
+ op.create_index(op.f('ix_model_selection_scores_id'), 'lecrapaud_model_selection_scores', ['id'], unique=False)
37
+ op.add_column('lecrapaud_experiments', sa.Column('type', mysql.VARCHAR(length=50), nullable=False))
38
+ op.add_column('lecrapaud_experiments', sa.Column('percentile', mysql.FLOAT(), nullable=False))
39
+ op.add_column('lecrapaud_experiments', sa.Column('max_features', mysql.INTEGER(), autoincrement=False, nullable=False))
40
+ op.add_column('lecrapaud_experiments', sa.Column('corr_threshold', mysql.FLOAT(), nullable=False))
41
+ op.drop_column('lecrapaud_experiments', 'number_of_targets')
42
+ # ### end Alembic commands ###
@@ -50,10 +50,43 @@ class Experiment(Base):
50
50
  )
51
51
  name = Column(String(255), nullable=False)
52
52
  path = Column(String(255)) # we do not have this at creation time
53
- type = Column(String(50), nullable=False)
54
53
  size = Column(Integer, nullable=False)
55
54
  train_size = Column(Integer)
56
55
  val_size = Column(Integer)
56
+ test_size = Column(Integer)
57
+ number_of_groups = Column(Integer)
58
+ list_of_groups = Column(JSON)
59
+ number_of_targets = Column(Integer)
60
+ start_date = Column(DateTime)
61
+ end_date = Column(DateTime)
62
+ train_start_date = Column(DateTime)
63
+ train_end_date = Column(DateTime)
64
+ val_start_date = Column(DateTime)
65
+ val_end_date = Column(DateTime)
66
+ test_start_date = Column(DateTime)
67
+ test_end_date = Column(DateTime)
68
+ context = Column(JSON)
69
+
70
+ feature_selections = relationship(
71
+ "FeatureSelection",
72
+ back_populates="experiment",
73
+ cascade="all, delete-orphan",
74
+ lazy="selectin",
75
+ )
76
+
77
+ targets = relationship(
78
+ "Target",
79
+ secondary=lecrapaud_experiment_target_association,
80
+ back_populates="experiments",
81
+ lazy="selectin",
82
+ )
83
+
84
+ __table_args__ = (
85
+ UniqueConstraint(
86
+ "name",
87
+ name="uq_experiments_composite",
88
+ ),
89
+ )
57
90
 
58
91
  # Relationships
59
92
  model_selections = relationship(
@@ -68,16 +101,9 @@ class Experiment(Base):
68
101
  """Best RMSE score across all model selections and trainings."""
69
102
  # Get the minimum RMSE for each model selection
70
103
  min_scores = [
71
- min(
72
- mss.rmse
73
- for mss in ms.model_selection_scores
74
- if mss.rmse is not None
75
- )
104
+ min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
76
105
  for ms in self.model_selections
77
- if any(
78
- mss.rmse is not None
79
- for mss in ms.model_selection_scores
80
- )
106
+ if any(mss.rmse is not None for mss in ms.model_selection_scores)
81
107
  ]
82
108
  return min(min_scores) if min_scores else None
83
109
 
@@ -92,10 +118,7 @@ class Experiment(Base):
92
118
  if mss.logloss is not None
93
119
  )
94
120
  for ms in self.model_selections
95
- if any(
96
- mss.logloss is not None
97
- for mss in ms.model_selection_scores
98
- )
121
+ if any(mss.logloss is not None for mss in ms.model_selection_scores)
99
122
  ]
100
123
  return min(min_scores) if min_scores else None
101
124
 
@@ -104,16 +127,9 @@ class Experiment(Base):
104
127
  """Average RMSE score across all model selections and trainings."""
105
128
  # Get the minimum RMSE for each model selection
106
129
  min_scores = [
107
- min(
108
- mss.rmse
109
- for mss in ms.model_selection_scores
110
- if mss.rmse is not None
111
- )
130
+ min(mss.rmse for mss in ms.model_selection_scores if mss.rmse is not None)
112
131
  for ms in self.model_selections
113
- if any(
114
- mss.rmse is not None
115
- for mss in ms.model_selection_scores
116
- )
132
+ if any(mss.rmse is not None for mss in ms.model_selection_scores)
117
133
  ]
118
134
  return mean(min_scores) if min_scores else None
119
135
 
@@ -128,50 +144,10 @@ class Experiment(Base):
128
144
  if mss.logloss is not None
129
145
  )
130
146
  for ms in self.model_selections
131
- if any(
132
- mss.logloss is not None
133
- for mss in ms.model_selection_scores
134
- )
147
+ if any(mss.logloss is not None for mss in ms.model_selection_scores)
135
148
  ]
136
149
  return mean(min_scores) if min_scores else None
137
150
 
138
- test_size = Column(Integer)
139
- corr_threshold = Column(Float, nullable=False)
140
- max_features = Column(Integer, nullable=False)
141
- percentile = Column(Float, nullable=False)
142
- number_of_groups = Column(Integer)
143
- list_of_groups = Column(JSON)
144
- start_date = Column(DateTime)
145
- end_date = Column(DateTime)
146
- train_start_date = Column(DateTime)
147
- train_end_date = Column(DateTime)
148
- val_start_date = Column(DateTime)
149
- val_end_date = Column(DateTime)
150
- test_start_date = Column(DateTime)
151
- test_end_date = Column(DateTime)
152
- context = Column(JSON)
153
-
154
- feature_selections = relationship(
155
- "FeatureSelection",
156
- back_populates="experiment",
157
- cascade="all, delete-orphan",
158
- lazy="selectin",
159
- )
160
-
161
- targets = relationship(
162
- "Target",
163
- secondary=lecrapaud_experiment_target_association,
164
- back_populates="experiments",
165
- lazy="selectin",
166
- )
167
-
168
- __table_args__ = (
169
- UniqueConstraint(
170
- "name",
171
- name="uq_experiments_composite",
172
- ),
173
- )
174
-
175
151
  @classmethod
176
152
  @with_db
177
153
  def get_all_by_name(cls, name: str | None = None, limit: int = 1000, db=None):
@@ -354,19 +330,18 @@ class Experiment(Base):
354
330
 
355
331
  # Get the best model score based on lowest logloss or rmse
356
332
  model_scores = best_model_selection.model_selection_scores
357
-
333
+
358
334
  # Determine if we should use logloss or rmse based on what's available
359
335
  if any(ms.logloss is not None for ms in model_scores):
360
336
  # Classification: find lowest logloss
361
337
  best_score = min(
362
338
  (ms for ms in model_scores if ms.logloss is not None),
363
- key=lambda x: x.logloss
339
+ key=lambda x: x.logloss,
364
340
  )
365
341
  elif any(ms.rmse is not None for ms in model_scores):
366
342
  # Regression: find lowest rmse
367
343
  best_score = min(
368
- (ms for ms in model_scores if ms.rmse is not None),
369
- key=lambda x: x.rmse
344
+ (ms for ms in model_scores if ms.rmse is not None), key=lambda x: x.rmse
370
345
  )
371
346
  else:
372
347
  return {
@@ -398,12 +373,8 @@ class Experiment(Base):
398
373
 
399
374
  # Get the model info
400
375
  model_info = {
401
- "model_type": (
402
- score.model.model_type if score.model else "unknown"
403
- ),
404
- "model_name": (
405
- score.model.name if score.model else "unknown"
406
- ),
376
+ "model_type": (score.model.model_type if score.model else "unknown"),
377
+ "model_name": (score.model.name if score.model else "unknown"),
407
378
  "training_time_seconds": score.training_time,
408
379
  }
409
380
 
@@ -434,7 +405,9 @@ class Experiment(Base):
434
405
  return features
435
406
 
436
407
  @with_db
437
- def get_all_features(self, date_column: str = None, group_column: str = None, db=None):
408
+ def get_all_features(
409
+ self, date_column: str = None, group_column: str = None, db=None
410
+ ):
438
411
  # Ensure we have a fresh instance attached to the session
439
412
  self = db.merge(self)
440
413
  target_idx = [target.id for target in self.targets]
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  import pandas as pd
5
5
  import joblib
6
+ from datetime import datetime
6
7
 
7
8
  # Set up coverage file path
8
9
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
@@ -15,9 +16,6 @@ from lecrapaud.db.session import get_db
15
16
 
16
17
  def create_experiment(
17
18
  data: pd.DataFrame | str,
18
- corr_threshold,
19
- percentile,
20
- max_features,
21
19
  date_column,
22
20
  group_column,
23
21
  experiment_name,
@@ -42,7 +40,10 @@ def create_experiment(
42
40
  targets = [
43
41
  target for target in all_targets if target.name in data.columns.str.upper()
44
42
  ]
45
- experiment_name = f"{experiment_name}_{groups["number_of_groups"] if group_column else 'ng'}_{corr_threshold}_{percentile}_{max_features}_{dates['start_date'].date() if date_column else 'nd'}_{dates['end_date'].date() if date_column else 'nd'}"
43
+ experiment_name = (
44
+ f"{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
45
+ )
46
+ number_of_targets = len(targets)
46
47
 
47
48
  experiment_dir = f"{tmp_dir}/{experiment_name}"
48
49
  preprocessing_dir = f"{experiment_dir}/preprocessing"
@@ -55,26 +56,20 @@ def create_experiment(
55
56
  db=db,
56
57
  name=experiment_name,
57
58
  path=Path(experiment_dir).resolve(),
58
- type="training",
59
59
  size=data.shape[0],
60
- corr_threshold=corr_threshold,
61
- percentile=percentile,
62
- max_features=max_features,
60
+ number_of_targets=number_of_targets,
63
61
  **groups,
64
62
  **dates,
65
63
  context={
66
- "corr_threshold": corr_threshold,
67
- "percentile": percentile,
68
- "max_features": max_features,
69
64
  "date_column": date_column,
70
65
  "group_column": group_column,
71
66
  "experiment_name": experiment_name,
72
67
  **kwargs,
73
68
  },
74
69
  )
75
-
70
+
76
71
  # Set targets relationship after creation/update
77
72
  experiment.targets = targets
78
73
  experiment.save(db=db)
79
-
74
+
80
75
  return experiment
@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
87
87
  def __init__(
88
88
  self,
89
89
  data: pd.DataFrame,
90
- columns_drop: list[str] = [],
91
- columns_boolean: list[str] = [],
92
- columns_date: list[str] = [],
93
- columns_te_groupby: list[str] = [],
94
- columns_te_target: list[str] = [],
90
+ experiment,
95
91
  for_training: bool = True,
96
92
  **kwargs,
97
93
  ):
98
94
  self.data = data
99
- self.columns_drop = columns_drop
100
- self.columns_boolean = columns_boolean
101
- self.columns_date = columns_date
102
- self.columns_te_groupby = columns_te_groupby
103
- self.columns_te_target = columns_te_target
95
+ self.experiment = experiment
104
96
  self.for_training = for_training
97
+
98
+ # Get all parameters from experiment context
99
+ self.columns_drop = self.experiment.context.get("columns_drop", [])
100
+ self.columns_boolean = self.experiment.context.get("columns_boolean", [])
101
+ self.columns_date = self.experiment.context.get("columns_date", [])
102
+ self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
103
+ self.columns_te_target = self.experiment.context.get("columns_te_target", [])
105
104
 
106
105
  def run(self) -> pd.DataFrame:
107
106
  # drop columns
@@ -316,41 +315,30 @@ class PreprocessFeature:
316
315
  self,
317
316
  data: pd.DataFrame,
318
317
  experiment,
319
- time_series: bool = False,
320
- date_column: str | None = None,
321
- group_column: str | None = None,
322
- val_size: float = 0.2,
323
- test_size: float = 0.2,
324
- columns_pca: list[str] = [],
325
- pca_temporal: list[dict[str, list[str]]] = [],
326
- pca_cross_sectional: list[dict[str, list[str]]] = [],
327
- columns_onehot: list[str] = [],
328
- columns_binary: list[str] = [],
329
- columns_ordinal: list[str] = [],
330
- columns_frequency: list[str] = [],
331
- target_numbers: list = [],
332
- target_clf: list = [],
333
318
  **kwargs,
334
319
  ):
335
320
  self.data = data
336
321
  self.data.columns = self.data.columns.str.upper()
337
-
338
322
  self.experiment = experiment
339
- self.columns_pca = [col.upper() for col in columns_pca]
340
- self.pca_temporal = pca_temporal
341
- self.pca_cross_sectional = pca_cross_sectional
342
- self.columns_onehot = [col.upper() for col in columns_onehot]
343
- self.columns_binary = [col.upper() for col in columns_binary]
344
- self.columns_ordinal = [col.upper() for col in columns_ordinal]
345
- self.columns_frequency = [col.upper() for col in columns_frequency]
346
- self.target_numbers = target_numbers
347
- self.target_clf = target_clf
348
-
349
- self.time_series = time_series
350
- self.date_column = date_column
351
- self.group_column = group_column
352
- self.val_size = val_size
353
- self.test_size = test_size
323
+
324
+ # Get all parameters from experiment context
325
+ context = self.experiment.context
326
+ self.time_series = context.get("time_series", False)
327
+ self.date_column = context.get("date_column", None)
328
+ self.group_column = context.get("group_column", None)
329
+ self.val_size = context.get("val_size", 0.2)
330
+ self.test_size = context.get("test_size", 0.2)
331
+ self.target_numbers = context.get("target_numbers", [])
332
+ self.target_clf = context.get("target_clf", [])
333
+
334
+ # Handle list parameters with uppercase conversion
335
+ self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
336
+ self.pca_temporal = context.get("pca_temporal", [])
337
+ self.pca_cross_sectional = context.get("pca_cross_sectional", [])
338
+ self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
339
+ self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
340
+ self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
341
+ self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
354
342
 
355
343
  self.experiment_dir = self.experiment.path
356
344
  self.experiment_id = self.experiment.id
@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
73
73
 
74
74
 
75
75
  class FeatureSelectionEngine:
76
- def __init__(self, train, experiment, target_number, target_clf, **kwargs):
76
+ def __init__(self, train, experiment, target_number, **kwargs):
77
77
  self.experiment = experiment
78
78
  self.train = train
79
79
  self.target_number = target_number
80
- self.target_clf = target_clf
80
+
81
+ # Get all parameters from experiment context
82
+ self.target_clf = self.experiment.context.get("target_clf", [])
83
+ self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
84
+ self.percentile = self.experiment.context.get("percentile", 20)
85
+ self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
86
+ self.max_features = self.experiment.context.get("max_features", 50)
81
87
 
82
88
  self.target_type = (
83
89
  "classification" if self.target_number in self.target_clf else "regression"
84
90
  )
85
- self.percentile = self.experiment.percentile
86
- self.corr_threshold = self.experiment.corr_threshold
87
- self.max_features = self.experiment.max_features
88
91
 
89
92
  self.experiment_dir = self.experiment.path
90
93
  self.experiment_id = self.experiment.id
@@ -274,6 +277,38 @@ class FeatureSelectionEngine:
274
277
  features_selected.drop_duplicates("features", inplace=True)
275
278
 
276
279
  features_selected_list = features_selected["features"].values.tolist()
280
+
281
+ # Save ensemble features before correlation (aggregated features)
282
+ logger.info("Saving ensemble features before correlation...")
283
+ all_features_in_data = self.X.columns.tolist()
284
+ ensemble_rows = []
285
+
286
+ # Add global rank for selected features
287
+ features_selected_with_global_rank = features_selected.copy()
288
+ features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
289
+
290
+ for feature in all_features_in_data:
291
+ feature_id = feature_map.get(feature)
292
+ if feature_id:
293
+ is_selected = feature in features_selected_list
294
+ global_rank = None
295
+ if is_selected:
296
+ global_rank = features_selected_with_global_rank[
297
+ features_selected_with_global_rank["features"] == feature
298
+ ]["global_rank"].values[0]
299
+
300
+ ensemble_rows.append({
301
+ "feature_selection_id": feature_selection.id,
302
+ "feature_id": feature_id,
303
+ "method": "ensemble",
304
+ "score": None,
305
+ "pvalue": None,
306
+ "support": 2 if is_selected else 0, # 2 = in aggregated features
307
+ "rank": global_rank,
308
+ "training_time": 0,
309
+ })
310
+
311
+ FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
277
312
 
278
313
  # analysis 1
279
314
  features_selected_by_every_methods = set(results[0]["features"].values.tolist())
@@ -302,12 +337,46 @@ class FeatureSelectionEngine:
302
337
  header=True,
303
338
  index_label="ID",
304
339
  )
340
+
341
+ # Update support for features after correlation removal (before max)
342
+ logger.info("Updating ensemble features after correlation removal...")
343
+ for row in ensemble_rows:
344
+ feature = Feature.get(row["feature_id"]).name
345
+ if feature in features:
346
+ row["support"] = 1 # 1 = survived correlation removal
347
+
305
348
  features = features[:max_features]
306
349
 
307
350
  # adding categorical features selected
308
351
  features += (
309
352
  categorical_features_selected if target_type == "classification" else []
310
353
  )
354
+
355
+ # Final update for features after max limitation (final selection)
356
+ logger.info("Finalizing ensemble features with categorical features...")
357
+ for row in ensemble_rows:
358
+ feature = Feature.get(row["feature_id"]).name
359
+ if feature in features and row["support"] == 1:
360
+ row["support"] = 2 # 2 = in final selection
361
+
362
+ # Add categorical features to ensemble if not already present
363
+ if target_type == "classification":
364
+ for cat_feature in categorical_features_selected:
365
+ feature_id = feature_map.get(cat_feature)
366
+ if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
367
+ ensemble_rows.append({
368
+ "feature_selection_id": feature_selection.id,
369
+ "feature_id": feature_id,
370
+ "method": "ensemble",
371
+ "score": None,
372
+ "pvalue": None,
373
+ "support": 2, # 2 = in final selection (categorical)
374
+ "rank": None, # No rank for categorical features added at the end
375
+ "training_time": 0,
376
+ })
377
+
378
+ # Re-save all ensemble data with updated support values
379
+ FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
311
380
  logger.debug(
312
381
  f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
313
382
  )
@@ -440,13 +509,18 @@ class FeatureSelectionEngine:
440
509
  feat_scores["features"] = X.columns
441
510
  feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
442
511
  feat_scores["method"] = "Chi2"
512
+
513
+ # Apply both percentile and p-value filtering
514
+ # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
515
+ feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
516
+
443
517
  feat_scores.sort_values("rank", ascending=True, inplace=True)
444
518
  stop = time.time()
445
519
  training_time = timedelta(seconds=(stop - start)).total_seconds()
446
520
  feat_scores["training_time"] = training_time
447
521
 
448
522
  logger.debug(
449
- f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
523
+ f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
450
524
  )
451
525
 
452
526
  feat_scores.to_csv(
@@ -803,33 +877,28 @@ class PreprocessModel:
803
877
  val,
804
878
  test,
805
879
  experiment,
806
- target_numbers,
807
- target_clf,
808
- models_idx,
809
- time_series,
810
- max_timesteps,
811
- group_column,
812
- date_column,
813
880
  **kwargs,
814
881
  ):
815
882
  self.train = train
816
883
  self.val = val
817
884
  self.test = test
818
885
  self.experiment = experiment
819
- self.target_numbers = target_numbers
820
- self.target_clf = target_clf
821
- self.models_idx = models_idx
822
- self.time_series = time_series
823
- self.max_timesteps = max_timesteps
824
- self.group_column = group_column
825
- self.date_column = date_column
886
+
887
+ # Get all parameters from experiment context
888
+ self.target_numbers = self.experiment.context.get("target_numbers", [])
889
+ self.target_clf = self.experiment.context.get("target_clf", [])
890
+ self.models_idx = self.experiment.context.get("models_idx", [])
891
+ self.time_series = self.experiment.context.get("time_series", False)
892
+ self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
893
+ self.group_column = self.experiment.context.get("group_column", None)
894
+ self.date_column = self.experiment.context.get("date_column", None)
826
895
 
827
896
  self.experiment_dir = experiment.path
828
897
  self.data_dir = f"{self.experiment_dir}/data"
829
898
  self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
830
899
 
831
900
  self.all_features = experiment.get_all_features(
832
- date_column=date_column, group_column=group_column
901
+ date_column=self.date_column, group_column=self.group_column
833
902
  )
834
903
 
835
904
  def run(self):
@@ -1017,24 +1017,24 @@ class ModelSelectionEngine:
1017
1017
  data,
1018
1018
  reshaped_data,
1019
1019
  target_number,
1020
- target_clf,
1021
1020
  experiment,
1022
- models_idx,
1023
- time_series,
1024
- date_column,
1025
- group_column,
1026
- target_clf_thresholds,
1027
1021
  **kwargs,
1028
1022
  ):
1029
1023
  self.data = data
1030
1024
  self.reshaped_data = reshaped_data
1031
1025
  self.target_number = target_number
1032
1026
  self.experiment = experiment
1033
- self.target_clf = target_clf
1034
- self.models_idx = models_idx
1035
- self.time_series = time_series
1036
- self.date_column = date_column
1037
- self.group_column = group_column
1027
+
1028
+ # Get all parameters from experiment context
1029
+ context = self.experiment.context
1030
+ self.target_clf = context.get("target_clf", [])
1031
+ self.models_idx = context.get("models_idx", [])
1032
+ self.time_series = context.get("time_series", False)
1033
+ self.date_column = context.get("date_column", None)
1034
+ self.group_column = context.get("group_column", None)
1035
+
1036
+ # Handle target_clf_thresholds
1037
+ target_clf_thresholds = context.get("target_clf_thresholds", {})
1038
1038
  self.target_clf_thresholds = (
1039
1039
  target_clf_thresholds[target_number]
1040
1040
  if target_number in target_clf_thresholds.keys()
@@ -1056,25 +1056,19 @@ class ModelSelectionEngine:
1056
1056
  )
1057
1057
 
1058
1058
  # Main training function
1059
- def run(
1060
- self,
1061
- experiment_name,
1062
- perform_hyperopt=True,
1063
- number_of_trials=20,
1064
- perform_crossval=False, # This controls CV during hyperopt, not after
1065
- plot=True,
1066
- clean_dir=False, # TODO: This has been unused because now feature_selection is in the target directory
1067
- preserve_model=True,
1068
- best_params=None,
1069
- ):
1059
+ def run(self, best_params=None):
1070
1060
  """
1071
1061
  Selects the best models based on a target variable, optionally performing hyperparameter optimization
1072
1062
  and cross-validation, and manages outputs in a session-specific directory.
1073
1063
  """
1074
- self.experiment_name = experiment_name
1075
- self.plot = plot
1076
- self.number_of_trials = number_of_trials
1077
- self.perform_crossval = perform_crossval
1064
+ # Get all parameters from experiment context
1065
+ context = self.experiment.context
1066
+ self.experiment_name = context.get("experiment_name", "")
1067
+ self.plot = context.get("plot", True)
1068
+ self.number_of_trials = context.get("number_of_trials", 20)
1069
+ self.perform_crossval = context.get("perform_crossval", False)
1070
+ self.preserve_model = context.get("preserve_model", True)
1071
+ self.perform_hyperopt = context.get("perform_hyperopt", True)
1078
1072
 
1079
1073
  if self.experiment_id is None:
1080
1074
  raise ValueError("Please provide a experiment.")
@@ -1141,13 +1135,13 @@ class ModelSelectionEngine:
1141
1135
  self.results_dir = f"{self.target_dir}/{model_name}"
1142
1136
  if not os.path.exists(f"{self.results_dir}"):
1143
1137
  os.makedirs(f"{self.results_dir}")
1144
- elif preserve_model and contains_best(self.results_dir):
1138
+ elif self.preserve_model and contains_best(self.results_dir):
1145
1139
  continue
1146
- elif perform_hyperopt:
1140
+ elif self.perform_hyperopt:
1147
1141
  clean_directory(self.results_dir)
1148
1142
 
1149
1143
  logger.info(
1150
- f"{experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
1144
+ f"{self.experiment_name} - Training a {model_name} at {datetime.now()} for TARGET_{self.target_number}"
1151
1145
  )
1152
1146
 
1153
1147
  # Getting data
@@ -1204,7 +1198,7 @@ class ModelSelectionEngine:
1204
1198
 
1205
1199
  # Tuning hyperparameters
1206
1200
  start = time.time()
1207
- if perform_hyperopt:
1201
+ if self.perform_hyperopt:
1208
1202
  model_best_params = self.hyperoptimize(
1209
1203
  x_train, y_train, x_val, y_val, model
1210
1204
  )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lecrapaud"
3
- version = "0.20.0"
3
+ version = "0.20.1"
4
4
  description = "Framework for machine and deep learning, with regression, classification and time series analysis"
5
5
  authors = [
6
6
  {name = "Pierre H. Gallet"}
File without changes
File without changes