lecrapaud 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

lecrapaud/api.py CHANGED
@@ -1,42 +1,47 @@
1
- """
2
- Main API class
3
-
4
- the way I want it to work :
5
-
6
- app = LeCrapaud()
7
-
8
- kwargs = {
9
-
10
- }
11
-
12
- experiment = app.create_experiment(**kwargs) # return a class Experiment()
13
- ou
14
- experiment = app.get_experiment(exp_id)
15
-
16
- best_features, artifacts, best_model = experiment.train(get_data, get_data_params)
17
-
18
- new_data + target_pred + target_proba (if classif) = experiment.predict(**new_data)
1
+ """LeCrapaud API module.
19
2
 
20
- On veut aussi pouvoir juste faire :
3
+ This module provides the main interface for the LeCrapaud machine learning pipeline.
4
+ It allows for end-to-end ML workflows including data preprocessing, feature engineering,
5
+ model training, and prediction.
21
6
 
22
- experiment.feature_engineering(data) : feat eng, return data
7
+ Basic Usage:
8
+ # Create a LeCrapaud instance
9
+ lc = LeCrapaud()
23
10
 
24
- experiment.preprocess_feature(data) : split, encoding, pcas, return train, val, test df
11
+ # Create a new experiment
12
+ experiment = lc.create_experiment(data, target_numbers=[1], target_clf=[1])
25
13
 
26
- experiment.feature_selection(train) : return features
14
+ # Train a model
15
+ best_features, artifacts, best_model = experiment.train(data)
27
16
 
28
- experiment.preprocess_model(train, val, test) : return data = dict of df
17
+ # Make predictions
18
+ predictions, scores_reg, scores_clf = experiment.predict(new_data)
29
19
 
30
- experiment.model_selection(data) : return best_model
20
+ # Or use individual pipeline steps:
21
+ processed_data = experiment.feature_engineering(data) # Feature engineering
22
+ train, val, test = experiment.preprocess_feature(data) # Data splitting and encoding
23
+ selected_features = experiment.feature_selection(train) # Feature selection
24
+ model_data = experiment.preprocess_model(train, val, test) # Model preprocessing
25
+ best_model = experiment.model_selection(model_data) # Model selection
31
26
  """
32
27
 
33
28
  import joblib
34
29
  import pandas as pd
35
30
  import logging
31
+ import seaborn as sns
32
+ import numpy as np
33
+ import matplotlib.pyplot as plt
36
34
  from lecrapaud.utils import logger
37
35
  from lecrapaud.db.session import init_db
38
36
  from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
39
- from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine, evaluate
37
+ from lecrapaud.model_selection import (
38
+ ModelSelectionEngine,
39
+ ModelEngine,
40
+ evaluate,
41
+ load_model,
42
+ plot_threshold,
43
+ plot_evaluation_for_classification,
44
+ )
40
45
  from lecrapaud.feature_engineering import FeatureEngineeringEngine, PreprocessFeature
41
46
  from lecrapaud.experiment import create_experiment
42
47
  from lecrapaud.db import Experiment
@@ -44,24 +49,67 @@ from lecrapaud.search_space import normalize_models_idx
44
49
 
45
50
 
46
51
  class LeCrapaud:
52
+ """Main class for interacting with the LeCrapaud ML pipeline.
53
+
54
+ This class provides methods to create and retrieve experiments.
55
+
56
+ Args:
57
+ uri (str, optional): Database connection URI. If None, uses default connection.
58
+ """
59
+
47
60
  def __init__(self, uri: str = None):
61
+ """Initialize LeCrapaud with optional database URI."""
48
62
  init_db(uri=uri)
49
63
 
50
- def create_experiment(self, data: pd.DataFrame, **kwargs):
64
+ def create_experiment(self, data: pd.DataFrame, **kwargs) -> "ExperimentEngine":
65
+ """Create a new experiment.
66
+
67
+ Args:
68
+ data (pd.DataFrame): Input data for the experiment
69
+ **kwargs: Additional arguments to configure the experiment
70
+
71
+ Returns:
72
+ ExperimentEngine: A new experiment instance
73
+ """
51
74
  return ExperimentEngine(data=data, **kwargs)
52
75
 
53
- def get_experiment(self, id: int, **kwargs):
76
+ def get_experiment(self, id: int, **kwargs) -> "ExperimentEngine":
77
+ """Retrieve an existing experiment by ID.
78
+
79
+ Args:
80
+ id (int): The ID of the experiment to retrieve
81
+ **kwargs: Additional arguments to pass to the experiment
82
+
83
+ Returns:
84
+ ExperimentEngine: The retrieved experiment instance
85
+ """
54
86
  return ExperimentEngine(id=id, **kwargs)
55
87
 
56
88
 
57
89
  class ExperimentEngine:
58
- def __init__(self, id=None, data=None, **kwargs):
90
+ """Engine for managing ML experiments.
91
+
92
+ This class handles the complete ML pipeline including feature engineering,
93
+ model training, and prediction. It can be initialized with either new data
94
+ or by loading an existing experiment by ID.
95
+
96
+ Args:
97
+ id (int, optional): ID of an existing experiment to load
98
+ data (pd.DataFrame, optional): Input data for a new experiment
99
+ **kwargs: Additional configuration parameters
100
+ """
101
+
102
+ def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
103
+ """Initialize the experiment engine with either new or existing experiment."""
59
104
  if id:
60
105
  self.experiment = Experiment.get(id)
61
106
  kwargs.update(self.experiment.context)
62
107
  else:
108
+ if data is None:
109
+ raise ValueError("Either id or data must be provided")
63
110
  self.experiment = create_experiment(data=data, **kwargs)
64
111
 
112
+ # Set all kwargs as instance attributes
65
113
  for key, value in kwargs.items():
66
114
  if key == "models_idx":
67
115
  value = normalize_models_idx(value)
@@ -295,3 +343,103 @@ class ExperimentEngine:
295
343
  return joblib.load(
296
344
  f"{self.experiment.path}/TARGET_{target_number}/thresholds.pkl"
297
345
  )
346
+
347
+ def load_model(self, target_number: int, model_name: str = None):
348
+
349
+ if not model_name:
350
+ return load_model(f"{self.experiment.path}/TARGET_{target_number}")
351
+
352
+ return load_model(f"{self.experiment.path}/TARGET_{target_number}/{model_name}")
353
+
354
+ def plot_feature_importance(
355
+ self, target_number: int, model_name="linear", top_n=30
356
+ ):
357
+ """
358
+ Plot feature importance ranking.
359
+
360
+ Args:
361
+ target_number (int): Target variable number
362
+ model_name (str): Name of the model to load
363
+ top_n (int): Number of top features to display
364
+ """
365
+ model = self.load_model(target_number, model_name)
366
+ experiment = self.experiment
367
+
368
+ # Get feature names
369
+ feature_names = experiment.get_features(target_number)
370
+
371
+ # Get feature importances based on model type
372
+ if hasattr(model, "feature_importances_"):
373
+ # For sklearn tree models
374
+ importances = model.feature_importances_
375
+ importance_type = "Gini"
376
+ elif hasattr(model, "get_score"):
377
+ # For xgboost models
378
+ importance_dict = model.get_score(importance_type="weight")
379
+ importances = np.zeros(len(feature_names))
380
+ for i, feat in enumerate(feature_names):
381
+ if feat in importance_dict:
382
+ importances[i] = importance_dict[feat]
383
+ importance_type = "Weight"
384
+ elif hasattr(model, "feature_importance"):
385
+ # For lightgbm models
386
+ importances = model.feature_importance(importance_type="split")
387
+ importance_type = "Split"
388
+ elif hasattr(model, "coef_"):
389
+ # For linear models
390
+ importances = np.abs(model.coef_.flatten())
391
+ importance_type = "Absolute coefficient"
392
+ else:
393
+ raise ValueError(
394
+ f"Model {model_name} does not support feature importance calculation"
395
+ )
396
+
397
+ # Create a DataFrame for easier manipulation
398
+ importance_df = pd.DataFrame(
399
+ {"feature": feature_names[: len(importances)], "importance": importances}
400
+ )
401
+
402
+ # Sort features by importance and take top N
403
+ importance_df = importance_df.sort_values("importance", ascending=False).head(
404
+ top_n
405
+ )
406
+
407
+ # Create the plot
408
+ plt.figure(figsize=(10, max(6, len(importance_df) * 0.3)))
409
+ ax = sns.barplot(
410
+ data=importance_df,
411
+ x="importance",
412
+ y="feature",
413
+ palette="viridis",
414
+ orient="h",
415
+ )
416
+
417
+ # Add value labels
418
+ for i, v in enumerate(importance_df["importance"]):
419
+ ax.text(v, i, f"{v:.4f}", color="black", ha="left", va="center")
420
+
421
+ plt.title(f"Feature Importance ({importance_type})")
422
+ plt.tight_layout()
423
+ plt.show()
424
+
425
+ return importance_df
426
+
427
+ def plot_evaluation_for_classification(
428
+ self, target_number: int, model_name="linear"
429
+ ):
430
+ prediction = self.get_prediction(target_number, model_name)
431
+ thresholds = self.get_threshold(target_number)
432
+
433
+ plot_evaluation_for_classification(prediction)
434
+
435
+ for class_label, metrics in thresholds.items():
436
+ threshold = metrics["threshold"]
437
+ precision = metrics["precision"]
438
+ recall = metrics["recall"]
439
+ if threshold is not None:
440
+ tmp_pred = prediction[["TARGET", "PRED", class_label]].copy()
441
+ tmp_pred.rename(columns={class_label: 1}, inplace=True)
442
+ logger.info(f"Class {class_label}:")
443
+ plot_threshold(tmp_pred, threshold, precision, recall)
444
+ else:
445
+ logger.info(f"No threshold found for class {class_label}")
lecrapaud/directories.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import shutil
3
- from config import LECRAPAUD_LOGFILE
3
+ from lecrapaud.config import LECRAPAUD_LOGFILE
4
4
 
5
5
  cwd = os.getcwd()
6
6
 
@@ -5,11 +5,6 @@ from lecrapaud.jobs.tasks import app
5
5
 
6
6
  def schedule_tasks():
7
7
  schedule_tasks_list = [
8
- {
9
- "name": "task_send_daily_emails",
10
- "task": "src.jobs.tasks.task_send_daily_emails",
11
- "schedule": crontab(minute=00, hour=12),
12
- },
13
8
  {
14
9
  "name": "task_training_experiment",
15
10
  "task": "src.jobs.tasks.task_training_experiment",
@@ -24,8 +19,7 @@ def schedule_tasks():
24
19
 
25
20
  def unschedule_tasks():
26
21
  unschedule_task_keys = [
27
- "redbeat:task_send_daily_emails",
28
- "redbeat:task_train_models",
22
+ "redbeat:task_training_experiment",
29
23
  ]
30
24
 
31
25
  for key in unschedule_task_keys:
lecrapaud/jobs/tasks.py CHANGED
@@ -1,30 +1,5 @@
1
1
  from lecrapaud.jobs import app
2
-
3
- # from honeybadger import honeybadger
4
- from lecrapaud.send_daily_emails import send_daily_emails
5
- from lecrapaud.config import EXPERIMENT_ID, RECEIVER_EMAIL
6
- from lecrapaud.training import run_training
7
- from lecrapaud.constants import stock_list_3
8
- from lecrapaud.search_space import get_models_idx
9
-
10
-
11
- @app.task(
12
- bind=True,
13
- autoretry_for=(Exception,),
14
- retry_backoff=True,
15
- retry_kwargs={"max_retries": 5},
16
- acks_late=True,
17
- )
18
- def task_send_daily_emails(self):
19
- try:
20
- print(f"[Attempt #{self.request.retries}] task_send_daily_emails")
21
- experiment_id = int(EXPERIMENT_ID)
22
- email = RECEIVER_EMAIL
23
- return send_daily_emails(email, experiment_id)
24
- except Exception as e:
25
- print(e)
26
- # honeybadger.notify(e)
27
- raise
2
+ from lecrapaud.utils import logger
28
3
 
29
4
 
30
5
  @app.task(
@@ -36,22 +11,7 @@ def task_send_daily_emails(self):
36
11
  )
37
12
  def task_training_experiment(self):
38
13
  try:
39
- print(f"[Attempt #{self.request.retries}] task_training_experiment")
40
- run_training(
41
- years_of_data=20,
42
- list_of_groups=stock_list_3,
43
- targets_numbers=range(1, 15),
44
- percentile=20,
45
- corr_threshold=80,
46
- max_features=25,
47
- models_idx=get_models_idx("linear", "xgb"),
48
- number_of_trials=20,
49
- perform_hyperoptimization=True,
50
- perform_crossval=False,
51
- preserve_model=False,
52
- experiment_name="20y_stock_list_3_linear_xgb",
53
- )
14
+ pass
54
15
  except Exception as e:
55
- print(e)
56
- # honeybadger.notify(e)
16
+ logger.error(e)
57
17
  raise
@@ -0,0 +1,222 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/base.py:89: UserWarning: Downloading model to /Users/pierregallet/Library/Caches/tabpfn/tabpfn-v2-classifier.ckpt.\n",
13
+ " model, _, config_ = load_model_criterion_config(\n"
14
+ ]
15
+ },
16
+ {
17
+ "data": {
18
+ "application/vnd.jupyter.widget-view+json": {
19
+ "model_id": "df286c7a921b48439f5a97dae1985862",
20
+ "version_major": 2,
21
+ "version_minor": 0
22
+ },
23
+ "text/plain": [
24
+ "tabpfn-v2-classifier.ckpt: 0%| | 0.00/29.0M [00:00<?, ?B/s]"
25
+ ]
26
+ },
27
+ "metadata": {},
28
+ "output_type": "display_data"
29
+ },
30
+ {
31
+ "data": {
32
+ "application/vnd.jupyter.widget-view+json": {
33
+ "model_id": "55c41be7cbaf4b95a670b40157536ea1",
34
+ "version_major": 2,
35
+ "version_minor": 0
36
+ },
37
+ "text/plain": [
38
+ "config.json: 0%| | 0.00/37.0 [00:00<?, ?B/s]"
39
+ ]
40
+ },
41
+ "metadata": {},
42
+ "output_type": "display_data"
43
+ },
44
+ {
45
+ "name": "stderr",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/classifier.py:432: UserWarning: Running on CPU with more than 200 samples may be slow.\n",
49
+ "Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client\n",
50
+ " check_cpu_warning(\n"
51
+ ]
52
+ },
53
+ {
54
+ "name": "stdout",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "ROC AUC: 0.9981992797118848\n",
58
+ "Accuracy 0.9824561403508771\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "from sklearn.datasets import load_breast_cancer\n",
64
+ "from sklearn.metrics import accuracy_score, roc_auc_score\n",
65
+ "from sklearn.model_selection import train_test_split\n",
66
+ "\n",
67
+ "from tabpfn import TabPFNClassifier\n",
68
+ "\n",
69
+ "# Load data\n",
70
+ "X, y = load_breast_cancer(return_X_y=True)\n",
71
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
72
+ " X, y, test_size=0.5, random_state=42\n",
73
+ ")\n",
74
+ "\n",
75
+ "# Initialize a classifier\n",
76
+ "clf = TabPFNClassifier()\n",
77
+ "clf.fit(X_train, y_train)\n",
78
+ "\n",
79
+ "# Predict probabilities\n",
80
+ "prediction_probabilities = clf.predict_proba(X_test)\n",
81
+ "print(\"ROC AUC:\", roc_auc_score(y_test, prediction_probabilities[:, 1]))\n",
82
+ "\n",
83
+ "# Predict labels\n",
84
+ "predictions = clf.predict(X_test)\n",
85
+ "print(\"Accuracy\", accuracy_score(y_test, predictions))"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 5,
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Training time: 0.05 seconds\n",
98
+ "Accuracy: 0.9561\n",
99
+ "\n",
100
+ "Classification Report:\n",
101
+ " precision recall f1-score support\n",
102
+ "\n",
103
+ " 0 0.95 0.93 0.94 43\n",
104
+ " 1 0.96 0.97 0.97 71\n",
105
+ "\n",
106
+ " accuracy 0.96 114\n",
107
+ " macro avg 0.96 0.95 0.95 114\n",
108
+ "weighted avg 0.96 0.96 0.96 114\n",
109
+ "\n",
110
+ "\n",
111
+ "Feature Importance:\n",
112
+ "Feature 0: 0.0284\n",
113
+ "Feature 1: 0.0198\n",
114
+ "Feature 2: 0.0000\n",
115
+ "Feature 3: 0.0136\n",
116
+ "Feature 4: 0.0094\n",
117
+ "Feature 5: 0.0053\n",
118
+ "Feature 6: 0.0060\n",
119
+ "Feature 7: 0.3079\n",
120
+ "Feature 8: 0.0001\n",
121
+ "Feature 9: 0.0063\n",
122
+ "Feature 10: 0.0093\n",
123
+ "Feature 11: 0.0089\n",
124
+ "Feature 12: 0.0168\n",
125
+ "Feature 13: 0.0119\n",
126
+ "Feature 14: 0.0113\n",
127
+ "Feature 15: 0.0087\n",
128
+ "Feature 16: 0.0220\n",
129
+ "Feature 17: 0.0043\n",
130
+ "Feature 18: 0.0036\n",
131
+ "Feature 19: 0.0040\n",
132
+ "Feature 20: 0.0578\n",
133
+ "Feature 21: 0.0276\n",
134
+ "Feature 22: 0.1538\n",
135
+ "Feature 23: 0.0360\n",
136
+ "Feature 24: 0.0072\n",
137
+ "Feature 25: 0.0000\n",
138
+ "Feature 26: 0.0295\n",
139
+ "Feature 27: 0.1860\n",
140
+ "Feature 28: 0.0049\n",
141
+ "Feature 29: 0.0000\n"
142
+ ]
143
+ },
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [11:58:14] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:738: \n",
149
+ "Parameters: { \"use_label_encoder\" } are not used.\n",
150
+ "\n",
151
+ " bst.update(dtrain, iteration=i, fobj=obj)\n"
152
+ ]
153
+ }
154
+ ],
155
+ "source": [
156
+ "# XGBoost Example\n",
157
+ "import xgboost as xgb\n",
158
+ "from sklearn.metrics import accuracy_score, classification_report\n",
159
+ "import time\n",
160
+ "\n",
161
+ "# Load the breast cancer dataset\n",
162
+ "X, y = load_breast_cancer(return_X_y=True)\n",
163
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
164
+ " X, y, test_size=0.2, random_state=42\n",
165
+ ")\n",
166
+ "\n",
167
+ "# Create and train XGBoost classifier\n",
168
+ "start_time = time.time()\n",
169
+ "xgb_clf = xgb.XGBClassifier(\n",
170
+ " n_estimators=100,\n",
171
+ " max_depth=3,\n",
172
+ " learning_rate=0.1,\n",
173
+ " use_label_encoder=False,\n",
174
+ " eval_metric=\"logloss\",\n",
175
+ " random_state=42,\n",
176
+ ")\n",
177
+ "\n",
178
+ "# Train the model\n",
179
+ "xgb_clf.fit(X_train, y_train)\n",
180
+ "\n",
181
+ "# Make predictions\n",
182
+ "y_pred = xgb_clf.predict(X_test)\n",
183
+ "y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]\n",
184
+ "\n",
185
+ "# Calculate metrics\n",
186
+ "accuracy = accuracy_score(y_test, y_pred)\n",
187
+ "training_time = time.time() - start_time\n",
188
+ "\n",
189
+ "print(f\"Training time: {training_time:.2f} seconds\")\n",
190
+ "print(f\"Accuracy: {accuracy:.4f}\")\n",
191
+ "print(\"\\nClassification Report:\")\n",
192
+ "print(classification_report(y_test, y_pred))\n",
193
+ "\n",
194
+ "# Feature importance\n",
195
+ "print(\"\\nFeature Importance:\")\n",
196
+ "for i, importance in enumerate(xgb_clf.feature_importances_):\n",
197
+ " print(f\"Feature {i}: {importance:.4f}\")"
198
+ ]
199
+ }
200
+ ],
201
+ "metadata": {
202
+ "kernelspec": {
203
+ "display_name": ".venv",
204
+ "language": "python",
205
+ "name": "python3"
206
+ },
207
+ "language_info": {
208
+ "codemirror_mode": {
209
+ "name": "ipython",
210
+ "version": 3
211
+ },
212
+ "file_extension": ".py",
213
+ "mimetype": "text/x-python",
214
+ "name": "python",
215
+ "nbconvert_exporter": "python",
216
+ "pygments_lexer": "ipython3",
217
+ "version": "3.12.11"
218
+ }
219
+ },
220
+ "nbformat": 4,
221
+ "nbformat_minor": 2
222
+ }
@@ -567,40 +567,14 @@ class ModelEngine:
567
567
  if not self.path:
568
568
  raise ValueError("Path is not set, cannot load model")
569
569
 
570
- target_dir = Path(self.path)
571
-
572
- # Search for files that contain '.best' or '.keras' in the name
573
- best_files = list(target_dir.glob("*.best*")) + list(
574
- target_dir.glob("*.keras*")
575
- )
576
- # If any files are found, try loading the first one (or process as needed)
577
- if best_files:
578
- file_path = best_files[
579
- 0
580
- ] # Assuming you want to open the first matching file
581
- try:
582
- # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
583
- self._model = joblib.load(file_path)
584
- except (pickle.UnpicklingError, EOFError):
585
- # If it's not a pickle file, try loading it as a Keras model
586
- try:
587
- # Attempt to load the file as a Keras model
588
- self._model = keras.models.load_model(file_path)
589
- except Exception as e:
590
- raise FileNotFoundError(
591
- f"Model could not be loaded from path: {file_path}: {e}"
592
- )
593
- else:
594
- raise FileNotFoundError(
595
- f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
596
- )
570
+ self._model = load_model(self.path)
597
571
 
598
572
  self.model_name = self._model.model_name
599
573
  self.target_type = self._model.target_type
600
574
 
601
575
  # Load threshold
602
576
  self.threshold = (
603
- joblib.load(f"{target_dir}/thresholds.pkl")
577
+ joblib.load(f"{self.path}/thresholds.pkl")
604
578
  if self.target_type == "classification"
605
579
  else None
606
580
  )
@@ -1326,12 +1300,29 @@ def get_log_dir(target_dir: str, model_name="test_model"):
1326
1300
  return str(log_dir)
1327
1301
 
1328
1302
 
1329
- def print_scores(target_dir: str):
1330
- """
1331
- Monitor scores
1332
- """
1333
- scores_tracking = pd.read_csv(f"{target_dir}/scores_tracking.csv")
1334
- return scores_tracking
1303
+ def load_model(target_dir: str):
1304
+ target_dir = Path(target_dir)
1305
+ # Search for files that contain '.best' or '.keras' in the name
1306
+ best_files = list(target_dir.glob("*.best*")) + list(target_dir.glob("*.keras*"))
1307
+ # If any files are found, try loading the first one (or process as needed)
1308
+ if best_files:
1309
+ file_path = best_files[0] # Assuming you want to open the first matching file
1310
+ try:
1311
+ # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
1312
+ return joblib.load(file_path)
1313
+ except (pickle.UnpicklingError, EOFError):
1314
+ # If it's not a pickle file, try loading it as a Keras model
1315
+ try:
1316
+ # Attempt to load the file as a Keras model
1317
+ return keras.models.load_model(file_path)
1318
+ except Exception as e:
1319
+ raise FileNotFoundError(
1320
+ f"Model could not be loaded from path: {file_path}: {e}"
1321
+ )
1322
+ else:
1323
+ raise FileNotFoundError(
1324
+ f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
1325
+ )
1335
1326
 
1336
1327
 
1337
1328
  # plots
@@ -1629,37 +1620,6 @@ def plot_threshold(prediction, threshold, precision, recall):
1629
1620
 
1630
1621
 
1631
1622
  # OLD - to sort out
1632
- def get_pred_distribution(target_dir: str, model_name="linear"):
1633
- """
1634
- Look at prediction distributions
1635
- """
1636
- prediction = pd.read_csv(
1637
- f"{target_dir}/{model_name}/prediction.csv",
1638
- index_col="ID",
1639
- )
1640
- prediction.describe()
1641
-
1642
-
1643
- def plot_feature_importance(target_dir: str, model_name="linear"):
1644
- """
1645
- Monitor feature importance ranking to filter out unrelevant features
1646
- """
1647
- model = joblib.load(f"{target_dir}/{model_name}/{model_name}.best")
1648
- if hasattr(model, "feature_importances_"):
1649
- feature_importances_ = model.feature_importances_.flatten()
1650
- elif hasattr(model, "feature_importance"):
1651
- feature_importances_ = model.feature_importance.flatten()
1652
- elif hasattr(model, "coefs_"):
1653
- feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
1654
- elif hasattr(model, "coef_"):
1655
- feature_importances_ = model.coef_.flatten()
1656
- else:
1657
- feature_importances_ = []
1658
-
1659
- sns.barplot(
1660
- data=feature_importances_,
1661
- orient="h",
1662
- )
1663
1623
 
1664
1624
 
1665
1625
  def print_model_estimators(target_dir: str, model_name="linear"):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: lecrapaud
3
- Version: 0.10.1
3
+ Version: 0.11.0
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  Author: Pierre H. Gallet
@@ -8,43 +8,27 @@ Requires-Python: ==3.12.*
8
8
  Classifier: License :: Other/Proprietary License
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.12
11
- Requires-Dist: backoff (>=2.2.1)
12
11
  Requires-Dist: category-encoders (>=2.8.1)
13
- Requires-Dist: celery (>=5.5.1)
14
- Requires-Dist: curl-cffi (>=0.11.1)
15
- Requires-Dist: deep-translator (>=1.11.4)
16
- Requires-Dist: degiro-connector (>=3.0.26)
17
- Requires-Dist: fake-useragent (>=2.1.0)
12
+ Requires-Dist: celery (>=5.5.3)
18
13
  Requires-Dist: ftfy (>=6.3.1)
19
- Requires-Dist: honeybadger (>=0.21)
20
- Requires-Dist: joblib (>=1.4.2)
21
- Requires-Dist: keras (>=3.9.0)
22
- Requires-Dist: keras-tcn (>=3.1.2)
14
+ Requires-Dist: joblib (>=1.5.1)
15
+ Requires-Dist: keras (>=3.10.0)
23
16
  Requires-Dist: lightgbm (>=4.6.0)
24
- Requires-Dist: matplotlib (>=3.10.1)
17
+ Requires-Dist: matplotlib (>=3.10.3)
25
18
  Requires-Dist: mlxtend (>=0.23.4)
26
19
  Requires-Dist: numpy (>=2.1.3)
27
- Requires-Dist: openai (>=1.86.0)
28
- Requires-Dist: pandas (>=2.2.3)
29
- Requires-Dist: pandas-market-calendars (>=4.6.1)
30
- Requires-Dist: playwright (>=1.52.0)
31
- Requires-Dist: pydantic (>=2.10.6)
32
- Requires-Dist: python-dotenv (>=1.0.1)
33
- Requires-Dist: pytz (>=2025.1)
34
- Requires-Dist: ratelimit (>=2.2.1)
20
+ Requires-Dist: openai (>=1.88.0)
21
+ Requires-Dist: pandas (>=2.3.0)
22
+ Requires-Dist: python-dotenv (>=1.1.0)
35
23
  Requires-Dist: scikit-learn (>=1.6.1)
36
- Requires-Dist: scipy (>=1.15.2)
24
+ Requires-Dist: scipy (>=1.15.3)
37
25
  Requires-Dist: seaborn (>=0.13.2)
38
- Requires-Dist: sentence-transformers (>=3.4.1)
39
- Requires-Dist: sqlalchemy (>=2.0.39)
40
- Requires-Dist: tensorboardx (>=2.6.2.2)
26
+ Requires-Dist: sqlalchemy (>=2.0.41)
27
+ Requires-Dist: tensorboardx (>=2.6.4)
41
28
  Requires-Dist: tensorflow (>=2.19.0)
42
- Requires-Dist: tf-keras (>=2.19.0)
43
29
  Requires-Dist: tiktoken (>=0.9.0)
44
30
  Requires-Dist: tqdm (>=4.67.1)
45
- Requires-Dist: xgboost (>=3.0.0)
46
- Requires-Dist: yahoo-fin (>=0.8.9.1)
47
- Requires-Dist: yfinance (>=0.2.55)
31
+ Requires-Dist: xgboost (>=3.0.2)
48
32
  Description-Content-Type: text/markdown
49
33
 
50
34
  <div align="center">
@@ -1,5 +1,5 @@
1
1
  lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
2
- lecrapaud/api.py,sha256=hpAVsHeOaxck2ufH0BA7IsKQXG9oA8Y_q1lvaHn6liU,10563
2
+ lecrapaud/api.py,sha256=SaoDlLK5zWAhS1LY17ZHsYIXn_7RpknEoKC5HPk-Y14,16434
3
3
  lecrapaud/config.py,sha256=n5qYpWyNSgxhJrmiujqRPa_EN3eLjGjtXDsboi1eeCo,993
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -22,23 +22,23 @@ lecrapaud/db/models/model_training.py,sha256=egggSfkW8C2nTadytc5DdjU7d2VEMT6LRRZ
22
22
  lecrapaud/db/models/score.py,sha256=_yaa6yBxugcOZMvLxqqIaMN7QGvzAOzOGCYQO0_gBjw,1601
23
23
  lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk,1649
24
24
  lecrapaud/db/session.py,sha256=K9dTyXmO-aF_2r9RRHsDsbW9_zLNDaOcchXlpiv7cSo,2719
25
- lecrapaud/directories.py,sha256=svfeNjuUvxYKUQECx3qOi4XxBO3cg-bnlDq6FhNFI0Q,816
25
+ lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
26
26
  lecrapaud/experiment.py,sha256=FSj5RUQsRdFpiK0iSyRBLRZQMlKJLQbS52cFoAVxoMk,2553
27
27
  lecrapaud/feature_engineering.py,sha256=2Er29SxHRIdzwxcEjk-2UI-MxQNVBPdTzlTemZ8bqYg,32193
28
28
  lecrapaud/feature_selection.py,sha256=u3TWq3G5Xh3geQevGDOZEt_rl_m6-K_CR7SttFtpwKw,43409
29
29
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
30
30
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
31
31
  lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
32
- lecrapaud/jobs/scheduler.py,sha256=SiYWPxokpKnR8V6btLOO6gbK0PEjSRoeG0kCbQvYPf4,990
33
- lecrapaud/jobs/tasks.py,sha256=jfhOCsgZlZGTnsLB_K7-Y3NgJqpzpUCFu7EfDQuIeSY,1655
34
- lecrapaud/model_selection.py,sha256=hKa6rQPbFBPSiQv98R89bxp-U-3Kufj9pETV0ff6KKM,61767
32
+ lecrapaud/jobs/scheduler.py,sha256=OKXhb_gxE1-R7D1HyPns88iIS31Wd4gRqEzk4EqS0J4,774
33
+ lecrapaud/jobs/tasks.py,sha256=sbD2_IT45DE4yQQbR6DVb9xv5x06rYDtUvSK8exYxes,332
34
+ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQTjqg,6676
35
+ lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
36
+ lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
37
+ lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
38
+ lecrapaud/model_selection.py,sha256=PQGEWVWN-4ZeHCqrmXBpHgq1QZi_1nOOeu5gazXGDLQ,60487
35
39
  lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
36
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
37
- lecrapaud/speed_tests/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
38
- lecrapaud/speed_tests/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
39
- lecrapaud/speed_tests/tests.ipynb,sha256=RjI7LDHSsbadUkea_hT14sD7ivljtIQk4NB5McXJ1bE,3835
40
40
  lecrapaud/utils.py,sha256=zM3V6WzY7XTBnbBAzk5_HKPYsH4WskjbqFwnQLG9g90,8197
41
- lecrapaud-0.10.1.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
42
- lecrapaud-0.10.1.dist-info/METADATA,sha256=Ho8NyjWAZKXC6TI2QBgfxspyGRkTjP0ywimgRP3LtcQ,11624
43
- lecrapaud-0.10.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
44
- lecrapaud-0.10.1.dist-info/RECORD,,
41
+ lecrapaud-0.11.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
42
+ lecrapaud-0.11.0.dist-info/METADATA,sha256=5i5VGMgNA0EzHNUWpP9eHp-KW10Cl_8tQAXk1rwW1Jc,11017
43
+ lecrapaud-0.11.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
44
+ lecrapaud-0.11.0.dist-info/RECORD,,
@@ -1,145 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "# import autosklearn.classification\n",
10
- "import sklearn.datasets\n",
11
- "import sklearn.metrics\n",
12
- "from pprint import pprint\n",
13
- "from tabpfn import TabPFNClassifier\n",
14
- "import numpy as np\n",
15
- "from pathlib import Path\n",
16
- "import pandas as pd\n",
17
- "import time\n",
18
- "from sklearn.metrics import accuracy_score\n",
19
- "from sklearn.datasets import load_breast_cancer\n",
20
- "from sklearn.model_selection import train_test_split"
21
- ]
22
- },
23
- {
24
- "cell_type": "code",
25
- "execution_count": null,
26
- "metadata": {},
27
- "outputs": [],
28
- "source": [
29
- "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
30
- "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
31
- " X, y, random_state=1\n",
32
- ")"
33
- ]
34
- },
35
- {
36
- "cell_type": "code",
37
- "execution_count": null,
38
- "metadata": {},
39
- "outputs": [],
40
- "source": [
41
- "automl = autosklearn.classification.AutoSklearnClassifier(\n",
42
- " time_left_for_this_task=120,\n",
43
- " per_run_time_limit=30,\n",
44
- " tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
45
- " include={\n",
46
- " \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
47
- " \"feature_preprocessor\": [\n",
48
- " \"no_preprocessing\",\n",
49
- " \"polynomial\",\n",
50
- " \"select_percentile_classification\",\n",
51
- " ],\n",
52
- " },\n",
53
- " ensemble_kwargs={\"ensemble_size\": 1},\n",
54
- ")\n",
55
- "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
56
- ]
57
- },
58
- {
59
- "cell_type": "code",
60
- "execution_count": null,
61
- "metadata": {},
62
- "outputs": [],
63
- "source": [
64
- "pprint(automl.show_models(), indent=4)"
65
- ]
66
- },
67
- {
68
- "cell_type": "code",
69
- "execution_count": null,
70
- "metadata": {},
71
- "outputs": [],
72
- "source": [
73
- "predictions = automl.predict(X_test)\n",
74
- "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
75
- ]
76
- },
77
- {
78
- "cell_type": "code",
79
- "execution_count": null,
80
- "metadata": {},
81
- "outputs": [],
82
- "source": [
83
- "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
84
- "# more ensemble members are slower, but more accurate\n",
85
- "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
86
- ]
87
- },
88
- {
89
- "cell_type": "code",
90
- "execution_count": null,
91
- "metadata": {},
92
- "outputs": [],
93
- "source": [
94
- "start = time.time()\n",
95
- "classifier.fit(X_train, y_train)\n",
96
- "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
97
- "print(\n",
98
- " \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
99
- ")"
100
- ]
101
- },
102
- {
103
- "cell_type": "code",
104
- "execution_count": null,
105
- "metadata": {},
106
- "outputs": [],
107
- "source": [
108
- "# We also offer the `predict_proba` interface\n",
109
- "classifier.predict_proba(X_test).shape"
110
- ]
111
- },
112
- {
113
- "cell_type": "code",
114
- "execution_count": null,
115
- "metadata": {},
116
- "outputs": [],
117
- "source": [
118
- "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
119
- "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
120
- "out_table"
121
- ]
122
- }
123
- ],
124
- "metadata": {
125
- "kernelspec": {
126
- "display_name": ".venv",
127
- "language": "python",
128
- "name": "python3"
129
- },
130
- "language_info": {
131
- "codemirror_mode": {
132
- "name": "ipython",
133
- "version": 3
134
- },
135
- "file_extension": ".py",
136
- "mimetype": "text/x-python",
137
- "name": "python",
138
- "nbconvert_exporter": "python",
139
- "pygments_lexer": "ipython3",
140
- "version": "3.12.8"
141
- }
142
- },
143
- "nbformat": 4,
144
- "nbformat_minor": 2
145
- }
File without changes
File without changes