lecrapaud 0.12.2__tar.gz → 0.13.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (44)
  1. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/PKG-INFO +1 -1
  2. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/api.py +50 -1
  3. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/model_selection.py +153 -46
  4. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/pyproject.toml +1 -1
  5. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/LICENSE +0 -0
  6. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/README.md +0 -0
  7. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/__init__.py +0 -0
  8. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/config.py +0 -0
  9. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/__init__.py +0 -0
  10. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/README +0 -0
  11. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/env.py +0 -0
  12. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/script.py.mako +0 -0
  13. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  14. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  15. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  16. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  17. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/alembic.ini +0 -0
  18. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/__init__.py +0 -0
  19. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/base.py +0 -0
  20. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/experiment.py +0 -0
  21. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/feature.py +0 -0
  22. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/feature_selection.py +0 -0
  23. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  24. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/model.py +0 -0
  25. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/model_selection.py +0 -0
  26. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/model_training.py +0 -0
  27. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/score.py +0 -0
  28. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/models/target.py +0 -0
  29. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/db/session.py +0 -0
  30. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/directories.py +0 -0
  31. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/experiment.py +0 -0
  32. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/feature_engineering.py +0 -0
  33. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/feature_selection.py +0 -0
  34. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/integrations/openai_integration.py +0 -0
  35. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/jobs/__init__.py +0 -0
  36. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/jobs/config.py +0 -0
  37. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/jobs/scheduler.py +0 -0
  38. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/jobs/tasks.py +0 -0
  39. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  40. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  41. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  42. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  43. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/search_space.py +0 -0
  44. {lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/utils.py +0 -0

{lecrapaud-0.12.2 → lecrapaud-0.13.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: lecrapaud
- Version: 0.12.2
+ Version: 0.13.1
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
  License: Apache License
  Author: Pierre H. Gallet

{lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/api.py
@@ -175,7 +175,7 @@ class ExperimentEngine:
  )
  features = self.experiment.get_features(target_number)

- model = ModelEngine(path=target_dir)
+ model = ModelEngine(path=target_dir, target_number=target_number)

  # getting data
  if model.recurrent:
@@ -470,3 +470,52 @@ class ExperimentEngine:
  plot_threshold(tmp_pred, threshold, precision, recall)
  else:
  logger.info(f"No threshold found for class {class_label}")
+
+ def get_best_params(self, target_number: int = None) -> dict:
+ """
+ Load the best parameters for the experiment.
+
+ Args:
+ target_number (int, optional): If provided, returns parameters for this specific target.
+ If None, returns parameters for all targets.
+
+ Returns:
+ dict: Dictionary containing the best parameters. If target_number is provided,
+ returns parameters for that target only. Otherwise, returns a dictionary
+ with target numbers as keys.
+ """
+ import json
+ import os
+
+ params_file = os.path.join(
+ self.experiment.path, "preprocessing", "all_targets_best_params.json"
+ )
+
+ if not os.path.exists(params_file):
+ raise FileNotFoundError(
+ f"Best parameters file not found at {params_file}. "
+ "Make sure to run model training first."
+ )
+
+ try:
+ with open(params_file, "r") as f:
+ all_params = json.load(f)
+
+ # Convert string keys to integers
+ all_params = {int(k): v for k, v in all_params.items()}
+
+ if target_number is not None:
+ if target_number not in all_params:
+ available_targets = list(all_params.keys())
+ raise ValueError(
+ f"No parameters found for target {target_number}. "
+ f"Available targets: {available_targets}"
+ )
+ return all_params[target_number]
+
+ return all_params
+
+ except json.JSONDecodeError as e:
+ raise ValueError(f"Error parsing best parameters file: {str(e)}")
+ except Exception as e:
+ raise Exception(f"Error loading best parameters: {str(e)}")

{lecrapaud-0.12.2 → lecrapaud-0.13.1}/lecrapaud/model_selection.py
@@ -114,6 +114,7 @@ class ModelEngine:
  self,
  model_name: str = None,
  target_type: str = None,
+ target_number: int = None,
  path: str = None,
  search_params: dict = {},
  create_model=None,
@@ -126,6 +127,7 @@ class ModelEngine:
  else:
  self.model_name = model_name
  self.target_type = target_type
+ self.target_number = target_number

  config = [
  config for config in all_models if config["model_name"] == self.model_name
@@ -326,18 +328,16 @@ class ModelEngine:
  writer.close()

  if self.plot:
- # Plot loss per epoch
- train_loss = evals_result["train"][eval_metric]
- val_loss = evals_result["val"][eval_metric]
- logs = pd.DataFrame({"train": train_loss, "val": val_loss})
-
- plt.figure(figsize=(14, 4))
- plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
- plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
- plt.xlabel("Epoch")
- plt.ylabel("Loss")
- plt.legend()
- plt.show()
+ # Plot training progress
+ plot_training_progress(
+ logs={
+ "train": evals_result["train"][eval_metric],
+ "val": evals_result["val"][eval_metric],
+ },
+ model_name=self.model_name,
+ target_number=self.target_number,
+ title_suffix=f"Training Progress - {eval_metric}",
+ )

  self._model = model

@@ -465,16 +465,12 @@ class ModelEngine:
  # logger.info(pd.DataFrame(gradiant.epoch_gradient))

  if self.plot:
- # Plot loss per epoch
- logs = pd.DataFrame(history.history)
-
- plt.figure(figsize=(14, 4))
- plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
- plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
- plt.xlabel("Epoch")
- plt.ylabel("Loss")
- plt.legend()
- plt.show()
+ # Plot training progress using the utility function
+ plot_training_progress(
+ logs=history.history,
+ model_name=self.model_name,
+ target_number=self.target_number,
+ )

  self._model = model

@@ -605,6 +601,7 @@ def trainable(
  model = ModelEngine(
  model_name=model_name,
  target_type=target_type,
+ target_number=target_number,
  create_model=create_model,
  plot=plot,
  log_dir=log_dir,
@@ -659,6 +656,9 @@ def trainable(

  score.update(evaluate(prediction, target_type, target_clf_thresholds))

+ metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+ logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
+
  if type_name == "hyperopts":
  session.report(metrics=score)
  return score
@@ -856,6 +856,7 @@ class ModelSelectionEngine:
  log_dir = get_log_dir(self.target_dir, model_name)
  # instantiate model
  model = ModelEngine(
+ target_number=self.target_number,
  model_name=model_name,
  search_params=config["search_params"],
  target_type=self.target_type,
@@ -904,16 +905,22 @@ class ModelSelectionEngine:
  tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

  # Store the scores
- cross_validation_scores = []
+ cv_scores = []

  for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
  self.type_name = f"crossval_fold_{i}"

  if self.time_series:
- date_series = train[self.date_column].copy()
+ date_series = pd.concat(
+ [
+ train[self.date_column],
+ val[self.date_column],
+ test[self.date_column],
+ ],
+ axis=0,
+ ).reset_index(drop=True)

- if need_scaling:
- date_series = date_series.map(pd.Timestamp.fromordinal)
+ date_series = date_series.map(pd.Timestamp.fromordinal)

  # Now you can use the actual train/val indices to extract ranges
  train_start = date_series.iloc[train_index[0]]
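
For context on the date handling above: the date column is assumed to hold proleptic Gregorian ordinals, and pd.Timestamp.fromordinal maps them back to timestamps after the three splits are concatenated. A small self-contained illustration with made-up values:

    import pandas as pd

    # Made-up date columns stored as ordinals, mimicking the train/val/test splits
    train_dates = pd.Series([pd.Timestamp("2024-01-01").toordinal(),
                             pd.Timestamp("2024-01-02").toordinal()])
    val_dates = pd.Series([pd.Timestamp("2024-01-03").toordinal()])
    test_dates = pd.Series([pd.Timestamp("2024-01-04").toordinal()])

    # Same pattern as the new code: concatenate all splits, then map back to timestamps
    date_series = pd.concat([train_dates, val_dates, test_dates], axis=0).reset_index(drop=True)
    date_series = date_series.map(pd.Timestamp.fromordinal)
    print(date_series.tolist())  # [Timestamp('2024-01-01'), ..., Timestamp('2024-01-04')]
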
@@ -932,7 +939,7 @@ class ModelSelectionEngine:

  # Train the model and get the score
  if recurrent:
- cross_validation_score, _, _ = self.train_model(
+ cv_score, _, _ = self.train_model(
  params=model_best_params,
  x_train=x_train_val[train_index],
  y_train=y_train_val[train_index],
@@ -941,7 +948,7 @@ class ModelSelectionEngine:
  model=model,
  )
  else:
- cross_validation_score, _, _ = self.train_model(
+ cv_score, _, _ = self.train_model(
  params=model_best_params,
  x_train=x_train_val.iloc[train_index],
  y_train=y_train_val.iloc[train_index],
@@ -951,18 +958,20 @@ class ModelSelectionEngine:
  )

  # Append score to the list
- cross_validation_scores.append(cross_validation_score)
-
- # Calculate and log the mean score
- cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
- self.metric
- ].mean()
- logger.info(
- f"Best model mean cross-validation score on entire experiment: {cross_validation_mean_score}"
- )
+ cv_scores.append(cv_score)
+
+ # Calculate mean of all numerical metrics across all cross-validation folds
+ cv_scores_df = pd.DataFrame(cv_scores)
+ # Get mean of all numeric columns
+ cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+
+ logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
+ for metric, value in cv_means.items():
+ logger.info(f" {metric}: {value:.4f}")

  # Retrain on entire training set, but keep score on cross-validation folds
- best_score, best_model, best_pred = self.train_model(
+ # Get the test score using the best model
+ test_score, best_model, best_pred = self.train_model(
  params=model_best_params,
  x_train=pd.concat([x_train, x_val], axis=0),
  y_train=pd.concat([y_train, y_val], axis=0),
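
Since each fold's score is a plain dict of metrics, building a DataFrame over the fold dicts and taking the column means averages every numeric metric at once while non-numeric fields are skipped. A small self-contained illustration with made-up fold scores:

    import pandas as pd

    # Made-up per-fold score dicts; keys and values are illustrative only
    cv_scores = [
        {"RMSE": 0.42, "MAE": 0.31, "MODEL_NAME": "xgboost"},
        {"RMSE": 0.45, "MAE": 0.33, "MODEL_NAME": "xgboost"},
        {"RMSE": 0.40, "MAE": 0.30, "MODEL_NAME": "xgboost"},
    ]

    cv_means = pd.DataFrame(cv_scores).mean(numeric_only=True).to_dict()
    print(cv_means)  # {'RMSE': 0.423..., 'MAE': 0.313...}
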
@@ -970,10 +979,16 @@ class ModelSelectionEngine:
  y_val=y_test,
  model=model,
  )
- best_score = cross_validation_mean_score
+
+ # Update all metrics with cross-validation means
+ for metric, value in cv_means.items():
+ if metric in test_score: # Only update existing metrics
+ test_score[metric] = value
+ best_score = test_score
+ best_score["TYPE"] = "crossval"
  else:
- # Evaluate on validation set
- self.type_name = "validation"
+ # Evaluate on test set
+ self.type_name = "testset"
  best_score, best_model, best_pred = self.train_model(
  params=model_best_params,
  x_train=pd.concat([x_train, x_val], axis=0),
@@ -983,9 +998,12 @@ class ModelSelectionEngine:
  model=model,
  )

- logger.info(f"Best model scores on test set: {best_score}")
+ logger.info(f"👉 {model.model_name} scores on test set:")
+ for metric, value in best_score.items():
+ if isinstance(value, (int, float)):
+ logger.info(f" {metric}: {value:.4f}")

- # Save validation predictions
+ # Save predictions
  best_pred.to_csv(
  f"{self.results_dir}/prediction.csv",
  index=True,
@@ -999,7 +1017,7 @@ class ModelSelectionEngine:
  model_path = Path(model_path).resolve()
  best_score["MODEL_PATH"] = model_path

- # Track scores
+ # Save best scores
  scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
  best_score_df = pd.DataFrame([best_score])

@@ -1071,7 +1089,7 @@ class ModelSelectionEngine:
  with open(f"{self.target_dir}/best_params.json", "r") as f:
  best_model_params = json.load(f)[best_model_name]

- # save model_selection results to db
+ # Save model_selection results to db
  model_selection = ModelSelection.get(model_selection.id)
  model_selection.best_model_id = Model.find_by(
  name=best_score_overall["MODEL_NAME"], type=self.target_type
@@ -1083,6 +1101,9 @@ class ModelSelectionEngine:

  logger.info(f"Best model overall is : {best_score_overall}")

+ # Consolidate best parameters from all targets into a single file
+ self.consolidate_best_params()
+
  best_model = joblib.load(best_model_path)
  return best_model

@@ -1184,11 +1205,53 @@ class ModelSelectionEngine:
  target_clf_thresholds=self.target_clf_thresholds,
  )

+ def consolidate_best_params(self):
+ """
+ Consolidate best parameters from all targets into a single JSON file in the preprocessing folder.
+ The output will be a dictionary with target numbers as keys and their best parameters as values.
+ """
+ # Initialize the consolidated parameters dictionary
+ all_best_params = {}
+
+ # Find all target directories
+ target_dirs = [
+ d for d in os.listdir(self.experiment_dir) if d.startswith("TARGET_")
+ ]
+
+ for target_dir in target_dirs:
+ target_number = target_dir.split("_")[1]
+ best_params_file = os.path.join(
+ self.experiment_dir, target_dir, "best_params.json"
+ )
+
+ # Check if best_params.json exists for this target
+ if os.path.exists(best_params_file):
+ try:
+ with open(best_params_file, "r") as f:
+ target_params = json.load(f)
+ all_best_params[target_number] = target_params
+ except Exception as e:
+ logger.warning(
+ f"Error loading best params for {target_dir}: {str(e)}"
+ )
+
+ # Save consolidated parameters to preprocessing folder
+ if all_best_params:
+ output_file = os.path.join(
+ self.preprocessing_dir, "all_targets_best_params.json"
+ )
+ os.makedirs(os.path.dirname(output_file), exist_ok=True)
+ with open(output_file, "w") as f:
+ json.dump(all_best_params, f, indent=4)
+ logger.info(f"Consolidated best parameters saved to {output_file}")
+
+ return all_best_params
+

  def evaluate(
  prediction: pd.DataFrame,
  target_type: str,
- target_clf_thresholds: dict = {"precision": 0.80},
+ target_clf_thresholds: dict = None,
  ):
  """
  Function to evaluate model performance
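
consolidate_best_params walks the TARGET_* directories, merges each per-target best_params.json (keyed by model name, as read back elsewhere via json.load(f)[best_model_name]) and writes the result to the preprocessing folder, where get_best_params in api.py picks it up. Assuming two targets and hypothetical model names and hyperparameters, the consolidated file could look roughly like this, built here in Python:

    import json

    # Hypothetical shape of preprocessing/all_targets_best_params.json:
    # target numbers as (string) keys, per-model parameter dicts as values
    all_best_params = {
        "1": {"xgboost": {"max_depth": 6, "learning_rate": 0.05}},
        "2": {"lightgbm": {"num_leaves": 31, "learning_rate": 0.1}},
    }
    print(json.dumps(all_best_params, indent=4))
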
@@ -1202,6 +1265,10 @@ def evaluate(
  y_true = prediction["TARGET"]
  y_pred = prediction["PRED"]

+ # Set default threshold if not provided
+ if target_clf_thresholds is None:
+ target_clf_thresholds = {"precision": 0.80}
+
  if target_type == "regression":
  # Main metrics
  score["RMSE"] = root_mean_squared_error(y_true, y_pred)
@@ -1330,6 +1397,46 @@ def load_model(target_dir: str):
  )


+ def plot_training_progress(
+ logs, model_name, target_number, title_suffix="Training Progress"
+ ):
+ """
+ Plot training and validation metrics during model training.
+
+ Args:
+ logs: DataFrame or dict containing training history
+ model_name: Name of the model being trained
+ target_number: Target number for the model
+ title_suffix: Optional suffix for the plot title
+ """
+ if isinstance(logs, dict):
+ logs = pd.DataFrame(logs)
+
+ plt.figure(figsize=(14, 4))
+
+ # Plot all metrics that exist in the logs
+ if "loss" in logs.columns:
+ plt.plot(logs["loss"], lw=2, label="Training loss")
+ if "val_loss" in logs.columns:
+ plt.plot(logs["val_loss"], lw=2, label="Validation loss")
+
+ # If no specific loss columns, plot all available metrics
+ if "loss" not in logs.columns and "val_loss" not in logs.columns and not logs.empty:
+ for col in logs.columns:
+ if col.startswith("val_"):
+ plt.plot(logs[col], "--", lw=2, label=f"Validation {col[4:]}")
+ else:
+ plt.plot(logs[col], lw=2, label=f"Training {col}")
+
+ plt.title(f"{model_name} - Target {target_number}\n{title_suffix}")
+ plt.xlabel("Epoch")
+ plt.ylabel("Metric Value")
+ plt.legend()
+ plt.grid(True, alpha=0.3)
+ plt.tight_layout()
+ plt.show()
+
+

  # plots
  def plot_evaluation_for_classification(prediction: dict):
  """

{lecrapaud-0.12.2 → lecrapaud-0.13.1}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "lecrapaud"
- version = "0.12.2"
+ version = "0.13.1"
  description = "Framework for machine and deep learning, with regression, classification and time series analysis"
  authors = [
  {name = "Pierre H. Gallet"}