lecrapaud 0.12.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

lecrapaud/api.py CHANGED
@@ -175,7 +175,7 @@ class ExperimentEngine:
         )
         features = self.experiment.get_features(target_number)
 
-        model = ModelEngine(path=target_dir)
+        model = ModelEngine(path=target_dir, target_number=target_number)
 
         # getting data
         if model.recurrent:
@@ -335,6 +335,10 @@ class ExperimentEngine:
             group_column=self.group_column,
             target_clf_thresholds=self.target_clf_thresholds,
         )
+        if best_params and target_number not in best_params.keys():
+            raise ValueError(
+                f"Target {target_number} not found in best_params passed as argument"
+            )
         app.run(
             self.experiment_name,
             perform_hyperopt=self.perform_hyperopt,
@@ -342,7 +346,7 @@ class ExperimentEngine:
             perform_crossval=self.perform_crossval,
             plot=self.plot,
             preserve_model=self.preserve_model,
-            best_params=best_params[target_number],
+            best_params=best_params[target_number] if best_params else None,
         )
 
     def get_scores(self, target_number: int):
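
The guard added above means a best_params dict that lacks the requested target now fails fast instead of surfacing as a KeyError deeper in the run. A toy sketch of the check in isolation (the dict contents are hypothetical):

# Hypothetical params keyed by target number; target 2 is deliberately missing.
best_params = {1: {"max_depth": 5}}
target_number = 2

if best_params and target_number not in best_params.keys():
    raise ValueError(
        f"Target {target_number} not found in best_params passed as argument"
    )
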
@@ -466,3 +470,52 @@ class ExperimentEngine:
             plot_threshold(tmp_pred, threshold, precision, recall)
         else:
             logger.info(f"No threshold found for class {class_label}")
+
+    def get_best_params(self, target_number: int = None) -> dict:
+        """
+        Load the best parameters for the experiment.
+
+        Args:
+            target_number (int, optional): If provided, returns parameters for this specific target.
+                If None, returns parameters for all targets.
+
+        Returns:
+            dict: Dictionary containing the best parameters. If target_number is provided,
+                returns parameters for that target only. Otherwise, returns a dictionary
+                with target numbers as keys.
+        """
+        import json
+        import os
+
+        params_file = os.path.join(
+            self.experiment.path, "preprocessing", "all_targets_best_params.json"
+        )
+
+        if not os.path.exists(params_file):
+            raise FileNotFoundError(
+                f"Best parameters file not found at {params_file}. "
+                "Make sure to run model training first."
+            )
+
+        try:
+            with open(params_file, "r") as f:
+                all_params = json.load(f)
+
+            # Convert string keys to integers
+            all_params = {int(k): v for k, v in all_params.items()}
+
+            if target_number is not None:
+                if target_number not in all_params:
+                    available_targets = list(all_params.keys())
+                    raise ValueError(
+                        f"No parameters found for target {target_number}. "
+                        f"Available targets: {available_targets}"
+                    )
+                return all_params[target_number]
+
+            return all_params
+
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Error parsing best parameters file: {str(e)}")
+        except Exception as e:
+            raise Exception(f"Error loading best parameters: {str(e)}")
lecrapaud/model_selection.py CHANGED
@@ -114,6 +114,7 @@ class ModelEngine:
         self,
         model_name: str = None,
         target_type: str = None,
+        target_number: int = None,
         path: str = None,
         search_params: dict = {},
         create_model=None,
@@ -126,6 +127,7 @@ class ModelEngine:
         else:
             self.model_name = model_name
             self.target_type = target_type
+        self.target_number = target_number
 
         config = [
             config for config in all_models if config["model_name"] == self.model_name
@@ -326,18 +328,16 @@ class ModelEngine:
             writer.close()
 
         if self.plot:
-            # Plot loss per epoch
-            train_loss = evals_result["train"][eval_metric]
-            val_loss = evals_result["val"][eval_metric]
-            logs = pd.DataFrame({"train": train_loss, "val": val_loss})
-
-            plt.figure(figsize=(14, 4))
-            plt.plot(logs.loc[:, "train"], lw=2, label="Training loss")
-            plt.plot(logs.loc[:, "val"], lw=2, label="Validation loss")
-            plt.xlabel("Epoch")
-            plt.ylabel("Loss")
-            plt.legend()
-            plt.show()
+            # Plot training progress
+            plot_training_progress(
+                logs={
+                    "train": evals_result["train"][eval_metric],
+                    "val": evals_result["val"][eval_metric],
+                },
+                model_name=self.model_name,
+                target_number=self.target_number,
+                title_suffix=f"Training Progress - {eval_metric}",
+            )
 
         self._model = model
 
@@ -465,16 +465,12 @@ class ModelEngine:
             # logger.info(pd.DataFrame(gradiant.epoch_gradient))
 
         if self.plot:
-            # Plot loss per epoch
-            logs = pd.DataFrame(history.history)
-
-            plt.figure(figsize=(14, 4))
-            plt.plot(logs.loc[:, "loss"], lw=2, label="Training loss")
-            plt.plot(logs.loc[:, "val_loss"], lw=2, label="Validation loss")
-            plt.xlabel("Epoch")
-            plt.ylabel("Loss")
-            plt.legend()
-            plt.show()
+            # Plot training progress using the utility function
+            plot_training_progress(
+                logs=history.history,
+                model_name=self.model_name,
+                target_number=self.target_number,
+            )
 
         self._model = model
 
@@ -605,6 +601,7 @@ def trainable(
     model = ModelEngine(
         model_name=model_name,
         target_type=target_type,
+        target_number=target_number,
        create_model=create_model,
        plot=plot,
        log_dir=log_dir,
@@ -659,6 +656,9 @@ def trainable(
 
     score.update(evaluate(prediction, target_type, target_clf_thresholds))
 
+    metric = "RMSE" if target_type == "regression" else "LOGLOSS"
+    logger.info(f"{model.model_name} scores on validation set: {score[metric]:.4f}")
+
     if type_name == "hyperopts":
         session.report(metrics=score)
         return score
@@ -856,6 +856,7 @@ class ModelSelectionEngine:
         log_dir = get_log_dir(self.target_dir, model_name)
         # instantiate model
         model = ModelEngine(
+            target_number=self.target_number,
             model_name=model_name,
             search_params=config["search_params"],
             target_type=self.target_type,
@@ -904,16 +905,22 @@ class ModelSelectionEngine:
             tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
 
             # Store the scores
-            cross_validation_scores = []
+            cv_scores = []
 
             for i, (train_index, val_index) in enumerate(tscv.split(x_train_val)):
                 self.type_name = f"crossval_fold_{i}"
 
                 if self.time_series:
-                    date_series = train[self.date_column].copy()
+                    date_series = pd.concat(
+                        [
+                            train[self.date_column],
+                            val[self.date_column],
+                            test[self.date_column],
+                        ],
+                        axis=0,
+                    ).reset_index(drop=True)
 
-                    if need_scaling:
-                        date_series = date_series.map(pd.Timestamp.fromordinal)
+                    date_series = date_series.map(pd.Timestamp.fromordinal)
 
                     # Now you can use the actual train/val indices to extract ranges
                     train_start = date_series.iloc[train_index[0]]
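
The date handling fix matters because TimeSeriesSplit produces positional indices over the full train+val+test window, so slicing dates from train alone could fall out of range. A self-contained sketch with synthetic data:

import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# Synthetic ordinal dates standing in for the train/val/test date columns.
dates = pd.Series(pd.date_range("2024-01-01", periods=10)).map(pd.Timestamp.toordinal)
train, val, test = dates[:6], dates[6:8], dates[8:]

# Mirror the fixed logic: concatenate every split before mapping back to timestamps.
date_series = pd.concat([train, val, test], axis=0).reset_index(drop=True)
date_series = date_series.map(pd.Timestamp.fromordinal)

tscv = TimeSeriesSplit(n_splits=3)
for i, (train_index, val_index) in enumerate(tscv.split(date_series)):
    print(
        f"fold {i}: train ends {date_series.iloc[train_index[-1]].date()}, "
        f"val ends {date_series.iloc[val_index[-1]].date()}"
    )
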
@@ -932,7 +939,7 @@ class ModelSelectionEngine:
 
                 # Train the model and get the score
                 if recurrent:
-                    cross_validation_score, _, _ = self.train_model(
+                    cv_score, _, _ = self.train_model(
                         params=model_best_params,
                         x_train=x_train_val[train_index],
                         y_train=y_train_val[train_index],
@@ -941,7 +948,7 @@ class ModelSelectionEngine:
                         model=model,
                     )
                 else:
-                    cross_validation_score, _, _ = self.train_model(
+                    cv_score, _, _ = self.train_model(
                         params=model_best_params,
                         x_train=x_train_val.iloc[train_index],
                         y_train=y_train_val.iloc[train_index],
@@ -951,18 +958,20 @@ class ModelSelectionEngine:
                     )
 
                 # Append score to the list
-                cross_validation_scores.append(cross_validation_score)
-
-            # Calculate and log the mean score
-            cross_validation_mean_score = pd.DataFrame(cross_validation_scores)[
-                self.metric
-            ].mean()
-            logger.info(
-                f"Best model mean cross-validation score on entire experiment: {cross_validation_mean_score}"
-            )
+                cv_scores.append(cv_score)
+
+            # Calculate mean of all numerical metrics across all cross-validation folds
+            cv_scores_df = pd.DataFrame(cv_scores)
+            # Get mean of all numeric columns
+            cv_means = cv_scores_df.mean(numeric_only=True).to_dict()
+
+            logger.info(f"👉 {model.model_name} mean cv scores on full dataset:")
+            for metric, value in cv_means.items():
+                logger.info(f"    {metric}: {value:.4f}")
 
             # Retrain on entire training set, but keep score on cross-validation folds
-            best_score, best_model, best_pred = self.train_model(
+            # Get the test score using the best model
+            test_score, best_model, best_pred = self.train_model(
                 params=model_best_params,
                 x_train=pd.concat([x_train, x_val], axis=0),
                 y_train=pd.concat([y_train, y_val], axis=0),
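
The aggregation change replaces a single-metric mean with a per-metric mean over every numeric column returned by the folds. A small sketch with made-up fold scores:

import pandas as pd

# Synthetic fold scores; non-numeric columns are skipped by numeric_only=True.
cv_scores = [
    {"RMSE": 0.12, "R2": 0.85, "MODEL_NAME": "xgboost"},
    {"RMSE": 0.10, "R2": 0.88, "MODEL_NAME": "xgboost"},
]
cv_means = pd.DataFrame(cv_scores).mean(numeric_only=True).to_dict()
print(cv_means)  # ≈ {'RMSE': 0.11, 'R2': 0.865}
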
@@ -970,10 +979,16 @@ class ModelSelectionEngine:
                 y_val=y_test,
                 model=model,
             )
-            best_score = cross_validation_mean_score
+
+            # Update all metrics with cross-validation means
+            for metric, value in cv_means.items():
+                if metric in test_score:  # Only update existing metrics
+                    test_score[metric] = value
+            best_score = test_score
+            best_score["TYPE"] = "crossval"
         else:
-            # Evaluate on validation set
-            self.type_name = "validation"
+            # Evaluate on test set
+            self.type_name = "testset"
             best_score, best_model, best_pred = self.train_model(
                 params=model_best_params,
                 x_train=pd.concat([x_train, x_val], axis=0),
@@ -983,9 +998,11 @@ class ModelSelectionEngine:
                 model=model,
             )
 
-        logger.info(f"Best model scores on test set: {best_score}")
+        logger.info(f"👉 {model.model_name} scores on test set:")
+        for metric, value in best_score.items():
+            logger.info(f"    {metric}: {value:.4f}")
 
-        # Save validation predictions
+        # Save predictions
         best_pred.to_csv(
             f"{self.results_dir}/prediction.csv",
             index=True,
@@ -999,7 +1016,7 @@ class ModelSelectionEngine:
         model_path = Path(model_path).resolve()
         best_score["MODEL_PATH"] = model_path
 
-        # Track scores
+        # Save best scores
         scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         best_score_df = pd.DataFrame([best_score])
 
@@ -1071,7 +1088,7 @@ class ModelSelectionEngine:
         with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]
 
-        # save model_selection results to db
+        # Save model_selection results to db
         model_selection = ModelSelection.get(model_selection.id)
         model_selection.best_model_id = Model.find_by(
             name=best_score_overall["MODEL_NAME"], type=self.target_type
@@ -1083,6 +1100,9 @@ class ModelSelectionEngine:
 
         logger.info(f"Best model overall is : {best_score_overall}")
 
+        # Consolidate best parameters from all targets into a single file
+        self.consolidate_best_params()
+
         best_model = joblib.load(best_model_path)
         return best_model
 
@@ -1184,11 +1204,53 @@ class ModelSelectionEngine:
             target_clf_thresholds=self.target_clf_thresholds,
         )
 
+    def consolidate_best_params(self):
+        """
+        Consolidate best parameters from all targets into a single JSON file in the preprocessing folder.
+        The output will be a dictionary with target numbers as keys and their best parameters as values.
+        """
+        # Initialize the consolidated parameters dictionary
+        all_best_params = {}
+
+        # Find all target directories
+        target_dirs = [
+            d for d in os.listdir(self.experiment_dir) if d.startswith("TARGET_")
+        ]
+
+        for target_dir in target_dirs:
+            target_number = target_dir.split("_")[1]
+            best_params_file = os.path.join(
+                self.experiment_dir, target_dir, "best_params.json"
+            )
+
+            # Check if best_params.json exists for this target
+            if os.path.exists(best_params_file):
+                try:
+                    with open(best_params_file, "r") as f:
+                        target_params = json.load(f)
+                    all_best_params[target_number] = target_params
+                except Exception as e:
+                    logger.warning(
+                        f"Error loading best params for {target_dir}: {str(e)}"
+                    )
+
+        # Save consolidated parameters to preprocessing folder
+        if all_best_params:
+            output_file = os.path.join(
+                self.preprocessing_dir, "all_targets_best_params.json"
+            )
+            os.makedirs(os.path.dirname(output_file), exist_ok=True)
+            with open(output_file, "w") as f:
+                json.dump(all_best_params, f, indent=4)
+            logger.info(f"Consolidated best parameters saved to {output_file}")
+
+        return all_best_params
+
 
 def evaluate(
     prediction: pd.DataFrame,
     target_type: str,
-    target_clf_thresholds: dict = {"precision": 0.80},
+    target_clf_thresholds: dict = None,
 ):
     """
     Function to evaluate model performance
@@ -1202,6 +1264,10 @@ def evaluate(
     y_true = prediction["TARGET"]
     y_pred = prediction["PRED"]
 
+    # Set default threshold if not provided
+    if target_clf_thresholds is None:
+        target_clf_thresholds = {"precision": 0.80}
+
     if target_type == "regression":
         # Main metrics
         score["RMSE"] = root_mean_squared_error(y_true, y_pred)
@@ -1330,6 +1396,46 @@ def load_model(target_dir: str):
     )
 
 
+def plot_training_progress(
+    logs, model_name, target_number, title_suffix="Training Progress"
+):
+    """
+    Plot training and validation metrics during model training.
+
+    Args:
+        logs: DataFrame or dict containing training history
+        model_name: Name of the model being trained
+        target_number: Target number for the model
+        title_suffix: Optional suffix for the plot title
+    """
+    if isinstance(logs, dict):
+        logs = pd.DataFrame(logs)
+
+    plt.figure(figsize=(14, 4))
+
+    # Plot all metrics that exist in the logs
+    if "loss" in logs.columns:
+        plt.plot(logs["loss"], lw=2, label="Training loss")
+    if "val_loss" in logs.columns:
+        plt.plot(logs["val_loss"], lw=2, label="Validation loss")
+
+    # If no specific loss columns, plot all available metrics
+    if "loss" not in logs.columns and "val_loss" not in logs.columns and not logs.empty:
+        for col in logs.columns:
+            if col.startswith("val_"):
+                plt.plot(logs[col], "--", lw=2, label=f"Validation {col[4:]}")
+            else:
+                plt.plot(logs[col], lw=2, label=f"Training {col}")
+
+    plt.title(f"{model_name} - Target {target_number}\n{title_suffix}")
+    plt.xlabel("Epoch")
+    plt.ylabel("Metric Value")
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.show()
+
+
 # plots
 def plot_evaluation_for_classification(prediction: dict):
     """
lecrapaud-0.12.1.dist-info/METADATA → lecrapaud-0.13.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.12.1
+Version: 0.13.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
lecrapaud-0.12.1.dist-info/RECORD → lecrapaud-0.13.0.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
-lecrapaud/api.py,sha256=YKxeWcTuq3QZ-UI9UjRkq7OI7W1UXW0EWgBbY3uEmLE,17409
+lecrapaud/api.py,sha256=K5eM5dXtU8DGH6je7Ai60hOgycXUAIVE1OvMh3Qvh5c,19541
 lecrapaud/config.py,sha256=eYnrktVq457xMIMGcUSilJdNxCsaGP_gRAlzCSwd6Vo,1047
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -35,10 +35,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
 lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
 lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
 lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
-lecrapaud/model_selection.py,sha256=8TfYjVJnFDviycX4DMe6mpHm7oxTfS-UXO55TvOLPJs,63377
+lecrapaud/model_selection.py,sha256=S16Zc6PxyNx-HrB_5JucCijFMDAjZlHiHPrl7mer4Cw,67517
 lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
 lecrapaud/utils.py,sha256=JdBB1NvbNIx4y0Una-kSZdo1_ZEocc5hwyYFIZKHmGg,8305
-lecrapaud-0.12.1.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
-lecrapaud-0.12.1.dist-info/METADATA,sha256=AIsGG4s0ZD_P3d2rRj-vGgDFvQzTO8ipvM_zmEbKZv8,11016
-lecrapaud-0.12.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-lecrapaud-0.12.1.dist-info/RECORD,,
+lecrapaud-0.13.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.13.0.dist-info/METADATA,sha256=OhgqiesFiciX8XtyC_wXTRPcWlWwCwGUuC1zVpoWIOI,11016
+lecrapaud-0.13.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+lecrapaud-0.13.0.dist-info/RECORD,,