lecrapaud 0.10.2__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic. Click here for more details.
- lecrapaud/api.py +180 -28
- lecrapaud/config.py +1 -0
- lecrapaud/jobs/scheduler.py +1 -7
- lecrapaud/jobs/tasks.py +3 -43
- lecrapaud/misc/tabpfn_tests.ipynb +222 -0
- lecrapaud/model_selection.py +25 -65
- lecrapaud/utils.py +6 -3
- {lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/METADATA +12 -28
- {lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/RECORD +14 -14
- lecrapaud/speed_tests/tests.ipynb +0 -145
- /lecrapaud/{speed_tests → misc}/test-gpu-bilstm.ipynb +0 -0
- /lecrapaud/{speed_tests → misc}/test-gpu-resnet.ipynb +0 -0
- /lecrapaud/{speed_tests → misc}/test-gpu-transformers.ipynb +0 -0
- {lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/LICENSE +0 -0
- {lecrapaud-0.10.2.dist-info → lecrapaud-0.11.1.dist-info}/WHEEL +0 -0
lecrapaud/api.py
CHANGED
|
@@ -1,42 +1,47 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Main API class
|
|
3
|
-
|
|
4
|
-
the way I want it to work :
|
|
5
|
-
|
|
6
|
-
app = LeCrapaud()
|
|
7
|
-
|
|
8
|
-
kwargs = {
|
|
9
|
-
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
experiment = app.create_experiment(**kwargs) # return a class Experiment()
|
|
13
|
-
ou
|
|
14
|
-
experiment = app.get_experiment(exp_id)
|
|
15
|
-
|
|
16
|
-
best_features, artifacts, best_model = experiment.train(get_data, get_data_params)
|
|
17
|
-
|
|
18
|
-
new_data + target_pred + target_proba (if classif) = experiment.predict(**new_data)
|
|
1
|
+
"""LeCrapaud API module.
|
|
19
2
|
|
|
20
|
-
|
|
3
|
+
This module provides the main interface for the LeCrapaud machine learning pipeline.
|
|
4
|
+
It allows for end-to-end ML workflows including data preprocessing, feature engineering,
|
|
5
|
+
model training, and prediction.
|
|
21
6
|
|
|
22
|
-
|
|
7
|
+
Basic Usage:
|
|
8
|
+
# Create a LeCrapaud instance
|
|
9
|
+
lc = LeCrapaud()
|
|
23
10
|
|
|
24
|
-
|
|
11
|
+
# Create a new experiment
|
|
12
|
+
experiment = lc.create_experiment(data, target_numbers=[1], target_clf=[1])
|
|
25
13
|
|
|
26
|
-
|
|
14
|
+
# Train a model
|
|
15
|
+
best_features, artifacts, best_model = experiment.train(data)
|
|
27
16
|
|
|
28
|
-
|
|
17
|
+
# Make predictions
|
|
18
|
+
predictions, scores_reg, scores_clf = experiment.predict(new_data)
|
|
29
19
|
|
|
30
|
-
|
|
20
|
+
# Or use individual pipeline steps:
|
|
21
|
+
processed_data = experiment.feature_engineering(data) # Feature engineering
|
|
22
|
+
train, val, test = experiment.preprocess_feature(data) # Data splitting and encoding
|
|
23
|
+
selected_features = experiment.feature_selection(train) # Feature selection
|
|
24
|
+
model_data = experiment.preprocess_model(train, val, test) # Model preprocessing
|
|
25
|
+
best_model = experiment.model_selection(model_data) # Model selection
|
|
31
26
|
"""
|
|
32
27
|
|
|
33
28
|
import joblib
|
|
34
29
|
import pandas as pd
|
|
35
30
|
import logging
|
|
31
|
+
import seaborn as sns
|
|
32
|
+
import numpy as np
|
|
33
|
+
import matplotlib.pyplot as plt
|
|
36
34
|
from lecrapaud.utils import logger
|
|
37
35
|
from lecrapaud.db.session import init_db
|
|
38
36
|
from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
|
|
39
|
-
from lecrapaud.model_selection import
|
|
37
|
+
from lecrapaud.model_selection import (
|
|
38
|
+
ModelSelectionEngine,
|
|
39
|
+
ModelEngine,
|
|
40
|
+
evaluate,
|
|
41
|
+
load_model,
|
|
42
|
+
plot_threshold,
|
|
43
|
+
plot_evaluation_for_classification,
|
|
44
|
+
)
|
|
40
45
|
from lecrapaud.feature_engineering import FeatureEngineeringEngine, PreprocessFeature
|
|
41
46
|
from lecrapaud.experiment import create_experiment
|
|
42
47
|
from lecrapaud.db import Experiment
|
|
@@ -44,24 +49,71 @@ from lecrapaud.search_space import normalize_models_idx
|
|
|
44
49
|
|
|
45
50
|
|
|
46
51
|
class LeCrapaud:
|
|
52
|
+
"""Main class for interacting with the LeCrapaud ML pipeline.
|
|
53
|
+
|
|
54
|
+
This class provides methods to create and retrieve experiments.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
uri (str, optional): Database connection URI. If None, uses default connection.
|
|
58
|
+
"""
|
|
59
|
+
|
|
47
60
|
def __init__(self, uri: str = None):
|
|
61
|
+
"""Initialize LeCrapaud with optional database URI."""
|
|
48
62
|
init_db(uri=uri)
|
|
49
63
|
|
|
50
|
-
def create_experiment(self, data: pd.DataFrame, **kwargs):
|
|
64
|
+
def create_experiment(self, data: pd.DataFrame, **kwargs) -> "ExperimentEngine":
|
|
65
|
+
"""Create a new experiment.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
data (pd.DataFrame): Input data for the experiment
|
|
69
|
+
**kwargs: Additional arguments to configure the experiment
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
ExperimentEngine: A new experiment instance
|
|
73
|
+
"""
|
|
51
74
|
return ExperimentEngine(data=data, **kwargs)
|
|
52
75
|
|
|
53
|
-
def get_experiment(self, id: int, **kwargs):
|
|
76
|
+
def get_experiment(self, id: int, **kwargs) -> "ExperimentEngine":
|
|
77
|
+
"""Retrieve an existing experiment by ID.
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
id (int): The ID of the experiment to retrieve
|
|
81
|
+
**kwargs: Additional arguments to pass to the experiment
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
ExperimentEngine: The retrieved experiment instance
|
|
85
|
+
"""
|
|
54
86
|
return ExperimentEngine(id=id, **kwargs)
|
|
55
87
|
|
|
88
|
+
def list_experiments(self, limit=1000) -> list[ExperimentEngine]:
|
|
89
|
+
"""List all experiments in the database."""
|
|
90
|
+
return [ExperimentEngine(id=exp.id) for exp in Experiment.get_all(limit=limit)]
|
|
91
|
+
|
|
56
92
|
|
|
57
93
|
class ExperimentEngine:
|
|
58
|
-
|
|
94
|
+
"""Engine for managing ML experiments.
|
|
95
|
+
|
|
96
|
+
This class handles the complete ML pipeline including feature engineering,
|
|
97
|
+
model training, and prediction. It can be initialized with either new data
|
|
98
|
+
or by loading an existing experiment by ID.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
id (int, optional): ID of an existing experiment to load
|
|
102
|
+
data (pd.DataFrame, optional): Input data for a new experiment
|
|
103
|
+
**kwargs: Additional configuration parameters
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
|
|
107
|
+
"""Initialize the experiment engine with either new or existing experiment."""
|
|
59
108
|
if id:
|
|
60
109
|
self.experiment = Experiment.get(id)
|
|
61
110
|
kwargs.update(self.experiment.context)
|
|
62
111
|
else:
|
|
112
|
+
if data is None:
|
|
113
|
+
raise ValueError("Either id or data must be provided")
|
|
63
114
|
self.experiment = create_experiment(data=data, **kwargs)
|
|
64
115
|
|
|
116
|
+
# Set all kwargs as instance attributes
|
|
65
117
|
for key, value in kwargs.items():
|
|
66
118
|
if key == "models_idx":
|
|
67
119
|
value = normalize_models_idx(value)
|
|
@@ -295,3 +347,103 @@ class ExperimentEngine:
|
|
|
295
347
|
return joblib.load(
|
|
296
348
|
f"{self.experiment.path}/TARGET_{target_number}/thresholds.pkl"
|
|
297
349
|
)
|
|
350
|
+
|
|
351
|
+
def load_model(self, target_number: int, model_name: str = None):
|
|
352
|
+
|
|
353
|
+
if not model_name:
|
|
354
|
+
return load_model(f"{self.experiment.path}/TARGET_{target_number}")
|
|
355
|
+
|
|
356
|
+
return load_model(f"{self.experiment.path}/TARGET_{target_number}/{model_name}")
|
|
357
|
+
|
|
358
|
+
def plot_feature_importance(
|
|
359
|
+
self, target_number: int, model_name="linear", top_n=30
|
|
360
|
+
):
|
|
361
|
+
"""
|
|
362
|
+
Plot feature importance ranking.
|
|
363
|
+
|
|
364
|
+
Args:
|
|
365
|
+
target_number (int): Target variable number
|
|
366
|
+
model_name (str): Name of the model to load
|
|
367
|
+
top_n (int): Number of top features to display
|
|
368
|
+
"""
|
|
369
|
+
model = self.load_model(target_number, model_name)
|
|
370
|
+
experiment = self.experiment
|
|
371
|
+
|
|
372
|
+
# Get feature names
|
|
373
|
+
feature_names = experiment.get_features(target_number)
|
|
374
|
+
|
|
375
|
+
# Get feature importances based on model type
|
|
376
|
+
if hasattr(model, "feature_importances_"):
|
|
377
|
+
# For sklearn tree models
|
|
378
|
+
importances = model.feature_importances_
|
|
379
|
+
importance_type = "Gini"
|
|
380
|
+
elif hasattr(model, "get_score"):
|
|
381
|
+
# For xgboost models
|
|
382
|
+
importance_dict = model.get_score(importance_type="weight")
|
|
383
|
+
importances = np.zeros(len(feature_names))
|
|
384
|
+
for i, feat in enumerate(feature_names):
|
|
385
|
+
if feat in importance_dict:
|
|
386
|
+
importances[i] = importance_dict[feat]
|
|
387
|
+
importance_type = "Weight"
|
|
388
|
+
elif hasattr(model, "feature_importance"):
|
|
389
|
+
# For lightgbm models
|
|
390
|
+
importances = model.feature_importance(importance_type="split")
|
|
391
|
+
importance_type = "Split"
|
|
392
|
+
elif hasattr(model, "coef_"):
|
|
393
|
+
# For linear models
|
|
394
|
+
importances = np.abs(model.coef_.flatten())
|
|
395
|
+
importance_type = "Absolute coefficient"
|
|
396
|
+
else:
|
|
397
|
+
raise ValueError(
|
|
398
|
+
f"Model {model_name} does not support feature importance calculation"
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
# Create a DataFrame for easier manipulation
|
|
402
|
+
importance_df = pd.DataFrame(
|
|
403
|
+
{"feature": feature_names[: len(importances)], "importance": importances}
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
# Sort features by importance and take top N
|
|
407
|
+
importance_df = importance_df.sort_values("importance", ascending=False).head(
|
|
408
|
+
top_n
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
# Create the plot
|
|
412
|
+
plt.figure(figsize=(10, max(6, len(importance_df) * 0.3)))
|
|
413
|
+
ax = sns.barplot(
|
|
414
|
+
data=importance_df,
|
|
415
|
+
x="importance",
|
|
416
|
+
y="feature",
|
|
417
|
+
palette="viridis",
|
|
418
|
+
orient="h",
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
# Add value labels
|
|
422
|
+
for i, v in enumerate(importance_df["importance"]):
|
|
423
|
+
ax.text(v, i, f"{v:.4f}", color="black", ha="left", va="center")
|
|
424
|
+
|
|
425
|
+
plt.title(f"Feature Importance ({importance_type})")
|
|
426
|
+
plt.tight_layout()
|
|
427
|
+
plt.show()
|
|
428
|
+
|
|
429
|
+
return importance_df
|
|
430
|
+
|
|
431
|
+
def plot_evaluation_for_classification(
|
|
432
|
+
self, target_number: int, model_name="linear"
|
|
433
|
+
):
|
|
434
|
+
prediction = self.get_prediction(target_number, model_name)
|
|
435
|
+
thresholds = self.get_threshold(target_number)
|
|
436
|
+
|
|
437
|
+
plot_evaluation_for_classification(prediction)
|
|
438
|
+
|
|
439
|
+
for class_label, metrics in thresholds.items():
|
|
440
|
+
threshold = metrics["threshold"]
|
|
441
|
+
precision = metrics["precision"]
|
|
442
|
+
recall = metrics["recall"]
|
|
443
|
+
if threshold is not None:
|
|
444
|
+
tmp_pred = prediction[["TARGET", "PRED", class_label]].copy()
|
|
445
|
+
tmp_pred.rename(columns={class_label: 1}, inplace=True)
|
|
446
|
+
logger.info(f"Class {class_label}:")
|
|
447
|
+
plot_threshold(tmp_pred, threshold, precision, recall)
|
|
448
|
+
else:
|
|
449
|
+
logger.info(f"No threshold found for class {class_label}")
|
lecrapaud/config.py
CHANGED
lecrapaud/jobs/scheduler.py
CHANGED
|
@@ -5,11 +5,6 @@ from lecrapaud.jobs.tasks import app
|
|
|
5
5
|
|
|
6
6
|
def schedule_tasks():
|
|
7
7
|
schedule_tasks_list = [
|
|
8
|
-
{
|
|
9
|
-
"name": "task_send_daily_emails",
|
|
10
|
-
"task": "src.jobs.tasks.task_send_daily_emails",
|
|
11
|
-
"schedule": crontab(minute=00, hour=12),
|
|
12
|
-
},
|
|
13
8
|
{
|
|
14
9
|
"name": "task_training_experiment",
|
|
15
10
|
"task": "src.jobs.tasks.task_training_experiment",
|
|
@@ -24,8 +19,7 @@ def schedule_tasks():
|
|
|
24
19
|
|
|
25
20
|
def unschedule_tasks():
|
|
26
21
|
unschedule_task_keys = [
|
|
27
|
-
"redbeat:
|
|
28
|
-
"redbeat:task_train_models",
|
|
22
|
+
"redbeat:task_training_experiment",
|
|
29
23
|
]
|
|
30
24
|
|
|
31
25
|
for key in unschedule_task_keys:
|
lecrapaud/jobs/tasks.py
CHANGED
|
@@ -1,30 +1,5 @@
|
|
|
1
1
|
from lecrapaud.jobs import app
|
|
2
|
-
|
|
3
|
-
# from honeybadger import honeybadger
|
|
4
|
-
from lecrapaud.send_daily_emails import send_daily_emails
|
|
5
|
-
from lecrapaud.config import EXPERIMENT_ID, RECEIVER_EMAIL
|
|
6
|
-
from lecrapaud.training import run_training
|
|
7
|
-
from lecrapaud.constants import stock_list_3
|
|
8
|
-
from lecrapaud.search_space import get_models_idx
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
@app.task(
|
|
12
|
-
bind=True,
|
|
13
|
-
autoretry_for=(Exception,),
|
|
14
|
-
retry_backoff=True,
|
|
15
|
-
retry_kwargs={"max_retries": 5},
|
|
16
|
-
acks_late=True,
|
|
17
|
-
)
|
|
18
|
-
def task_send_daily_emails(self):
|
|
19
|
-
try:
|
|
20
|
-
print(f"[Attempt #{self.request.retries}] task_send_daily_emails")
|
|
21
|
-
experiment_id = int(EXPERIMENT_ID)
|
|
22
|
-
email = RECEIVER_EMAIL
|
|
23
|
-
return send_daily_emails(email, experiment_id)
|
|
24
|
-
except Exception as e:
|
|
25
|
-
print(e)
|
|
26
|
-
# honeybadger.notify(e)
|
|
27
|
-
raise
|
|
2
|
+
from lecrapaud.utils import logger
|
|
28
3
|
|
|
29
4
|
|
|
30
5
|
@app.task(
|
|
@@ -36,22 +11,7 @@ def task_send_daily_emails(self):
|
|
|
36
11
|
)
|
|
37
12
|
def task_training_experiment(self):
|
|
38
13
|
try:
|
|
39
|
-
|
|
40
|
-
run_training(
|
|
41
|
-
years_of_data=20,
|
|
42
|
-
list_of_groups=stock_list_3,
|
|
43
|
-
targets_numbers=range(1, 15),
|
|
44
|
-
percentile=20,
|
|
45
|
-
corr_threshold=80,
|
|
46
|
-
max_features=25,
|
|
47
|
-
models_idx=get_models_idx("linear", "xgb"),
|
|
48
|
-
number_of_trials=20,
|
|
49
|
-
perform_hyperoptimization=True,
|
|
50
|
-
perform_crossval=False,
|
|
51
|
-
preserve_model=False,
|
|
52
|
-
experiment_name="20y_stock_list_3_linear_xgb",
|
|
53
|
-
)
|
|
14
|
+
pass
|
|
54
15
|
except Exception as e:
|
|
55
|
-
|
|
56
|
-
# honeybadger.notify(e)
|
|
16
|
+
logger.error(e)
|
|
57
17
|
raise
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": 3,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [
|
|
8
|
+
{
|
|
9
|
+
"name": "stderr",
|
|
10
|
+
"output_type": "stream",
|
|
11
|
+
"text": [
|
|
12
|
+
"/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/base.py:89: UserWarning: Downloading model to /Users/pierregallet/Library/Caches/tabpfn/tabpfn-v2-classifier.ckpt.\n",
|
|
13
|
+
" model, _, config_ = load_model_criterion_config(\n"
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"data": {
|
|
18
|
+
"application/vnd.jupyter.widget-view+json": {
|
|
19
|
+
"model_id": "df286c7a921b48439f5a97dae1985862",
|
|
20
|
+
"version_major": 2,
|
|
21
|
+
"version_minor": 0
|
|
22
|
+
},
|
|
23
|
+
"text/plain": [
|
|
24
|
+
"tabpfn-v2-classifier.ckpt: 0%| | 0.00/29.0M [00:00<?, ?B/s]"
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
"metadata": {},
|
|
28
|
+
"output_type": "display_data"
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"data": {
|
|
32
|
+
"application/vnd.jupyter.widget-view+json": {
|
|
33
|
+
"model_id": "55c41be7cbaf4b95a670b40157536ea1",
|
|
34
|
+
"version_major": 2,
|
|
35
|
+
"version_minor": 0
|
|
36
|
+
},
|
|
37
|
+
"text/plain": [
|
|
38
|
+
"config.json: 0%| | 0.00/37.0 [00:00<?, ?B/s]"
|
|
39
|
+
]
|
|
40
|
+
},
|
|
41
|
+
"metadata": {},
|
|
42
|
+
"output_type": "display_data"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"name": "stderr",
|
|
46
|
+
"output_type": "stream",
|
|
47
|
+
"text": [
|
|
48
|
+
"/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/tabpfn/classifier.py:432: UserWarning: Running on CPU with more than 200 samples may be slow.\n",
|
|
49
|
+
"Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client\n",
|
|
50
|
+
" check_cpu_warning(\n"
|
|
51
|
+
]
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"name": "stdout",
|
|
55
|
+
"output_type": "stream",
|
|
56
|
+
"text": [
|
|
57
|
+
"ROC AUC: 0.9981992797118848\n",
|
|
58
|
+
"Accuracy 0.9824561403508771\n"
|
|
59
|
+
]
|
|
60
|
+
}
|
|
61
|
+
],
|
|
62
|
+
"source": [
|
|
63
|
+
"from sklearn.datasets import load_breast_cancer\n",
|
|
64
|
+
"from sklearn.metrics import accuracy_score, roc_auc_score\n",
|
|
65
|
+
"from sklearn.model_selection import train_test_split\n",
|
|
66
|
+
"\n",
|
|
67
|
+
"from tabpfn import TabPFNClassifier\n",
|
|
68
|
+
"\n",
|
|
69
|
+
"# Load data\n",
|
|
70
|
+
"X, y = load_breast_cancer(return_X_y=True)\n",
|
|
71
|
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
|
72
|
+
" X, y, test_size=0.5, random_state=42\n",
|
|
73
|
+
")\n",
|
|
74
|
+
"\n",
|
|
75
|
+
"# Initialize a classifier\n",
|
|
76
|
+
"clf = TabPFNClassifier()\n",
|
|
77
|
+
"clf.fit(X_train, y_train)\n",
|
|
78
|
+
"\n",
|
|
79
|
+
"# Predict probabilities\n",
|
|
80
|
+
"prediction_probabilities = clf.predict_proba(X_test)\n",
|
|
81
|
+
"print(\"ROC AUC:\", roc_auc_score(y_test, prediction_probabilities[:, 1]))\n",
|
|
82
|
+
"\n",
|
|
83
|
+
"# Predict labels\n",
|
|
84
|
+
"predictions = clf.predict(X_test)\n",
|
|
85
|
+
"print(\"Accuracy\", accuracy_score(y_test, predictions))"
|
|
86
|
+
]
|
|
87
|
+
},
|
|
88
|
+
{
|
|
89
|
+
"cell_type": "code",
|
|
90
|
+
"execution_count": 5,
|
|
91
|
+
"metadata": {},
|
|
92
|
+
"outputs": [
|
|
93
|
+
{
|
|
94
|
+
"name": "stdout",
|
|
95
|
+
"output_type": "stream",
|
|
96
|
+
"text": [
|
|
97
|
+
"Training time: 0.05 seconds\n",
|
|
98
|
+
"Accuracy: 0.9561\n",
|
|
99
|
+
"\n",
|
|
100
|
+
"Classification Report:\n",
|
|
101
|
+
" precision recall f1-score support\n",
|
|
102
|
+
"\n",
|
|
103
|
+
" 0 0.95 0.93 0.94 43\n",
|
|
104
|
+
" 1 0.96 0.97 0.97 71\n",
|
|
105
|
+
"\n",
|
|
106
|
+
" accuracy 0.96 114\n",
|
|
107
|
+
" macro avg 0.96 0.95 0.95 114\n",
|
|
108
|
+
"weighted avg 0.96 0.96 0.96 114\n",
|
|
109
|
+
"\n",
|
|
110
|
+
"\n",
|
|
111
|
+
"Feature Importance:\n",
|
|
112
|
+
"Feature 0: 0.0284\n",
|
|
113
|
+
"Feature 1: 0.0198\n",
|
|
114
|
+
"Feature 2: 0.0000\n",
|
|
115
|
+
"Feature 3: 0.0136\n",
|
|
116
|
+
"Feature 4: 0.0094\n",
|
|
117
|
+
"Feature 5: 0.0053\n",
|
|
118
|
+
"Feature 6: 0.0060\n",
|
|
119
|
+
"Feature 7: 0.3079\n",
|
|
120
|
+
"Feature 8: 0.0001\n",
|
|
121
|
+
"Feature 9: 0.0063\n",
|
|
122
|
+
"Feature 10: 0.0093\n",
|
|
123
|
+
"Feature 11: 0.0089\n",
|
|
124
|
+
"Feature 12: 0.0168\n",
|
|
125
|
+
"Feature 13: 0.0119\n",
|
|
126
|
+
"Feature 14: 0.0113\n",
|
|
127
|
+
"Feature 15: 0.0087\n",
|
|
128
|
+
"Feature 16: 0.0220\n",
|
|
129
|
+
"Feature 17: 0.0043\n",
|
|
130
|
+
"Feature 18: 0.0036\n",
|
|
131
|
+
"Feature 19: 0.0040\n",
|
|
132
|
+
"Feature 20: 0.0578\n",
|
|
133
|
+
"Feature 21: 0.0276\n",
|
|
134
|
+
"Feature 22: 0.1538\n",
|
|
135
|
+
"Feature 23: 0.0360\n",
|
|
136
|
+
"Feature 24: 0.0072\n",
|
|
137
|
+
"Feature 25: 0.0000\n",
|
|
138
|
+
"Feature 26: 0.0295\n",
|
|
139
|
+
"Feature 27: 0.1860\n",
|
|
140
|
+
"Feature 28: 0.0049\n",
|
|
141
|
+
"Feature 29: 0.0000\n"
|
|
142
|
+
]
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
"name": "stderr",
|
|
146
|
+
"output_type": "stream",
|
|
147
|
+
"text": [
|
|
148
|
+
"/Users/pierregallet/Code/lecrapaud/.venv/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [11:58:14] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:738: \n",
|
|
149
|
+
"Parameters: { \"use_label_encoder\" } are not used.\n",
|
|
150
|
+
"\n",
|
|
151
|
+
" bst.update(dtrain, iteration=i, fobj=obj)\n"
|
|
152
|
+
]
|
|
153
|
+
}
|
|
154
|
+
],
|
|
155
|
+
"source": [
|
|
156
|
+
"# XGBoost Example\n",
|
|
157
|
+
"import xgboost as xgb\n",
|
|
158
|
+
"from sklearn.metrics import accuracy_score, classification_report\n",
|
|
159
|
+
"import time\n",
|
|
160
|
+
"\n",
|
|
161
|
+
"# Load the breast cancer dataset\n",
|
|
162
|
+
"X, y = load_breast_cancer(return_X_y=True)\n",
|
|
163
|
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
|
164
|
+
" X, y, test_size=0.2, random_state=42\n",
|
|
165
|
+
")\n",
|
|
166
|
+
"\n",
|
|
167
|
+
"# Create and train XGBoost classifier\n",
|
|
168
|
+
"start_time = time.time()\n",
|
|
169
|
+
"xgb_clf = xgb.XGBClassifier(\n",
|
|
170
|
+
" n_estimators=100,\n",
|
|
171
|
+
" max_depth=3,\n",
|
|
172
|
+
" learning_rate=0.1,\n",
|
|
173
|
+
" use_label_encoder=False,\n",
|
|
174
|
+
" eval_metric=\"logloss\",\n",
|
|
175
|
+
" random_state=42,\n",
|
|
176
|
+
")\n",
|
|
177
|
+
"\n",
|
|
178
|
+
"# Train the model\n",
|
|
179
|
+
"xgb_clf.fit(X_train, y_train)\n",
|
|
180
|
+
"\n",
|
|
181
|
+
"# Make predictions\n",
|
|
182
|
+
"y_pred = xgb_clf.predict(X_test)\n",
|
|
183
|
+
"y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]\n",
|
|
184
|
+
"\n",
|
|
185
|
+
"# Calculate metrics\n",
|
|
186
|
+
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|
187
|
+
"training_time = time.time() - start_time\n",
|
|
188
|
+
"\n",
|
|
189
|
+
"print(f\"Training time: {training_time:.2f} seconds\")\n",
|
|
190
|
+
"print(f\"Accuracy: {accuracy:.4f}\")\n",
|
|
191
|
+
"print(\"\\nClassification Report:\")\n",
|
|
192
|
+
"print(classification_report(y_test, y_pred))\n",
|
|
193
|
+
"\n",
|
|
194
|
+
"# Feature importance\n",
|
|
195
|
+
"print(\"\\nFeature Importance:\")\n",
|
|
196
|
+
"for i, importance in enumerate(xgb_clf.feature_importances_):\n",
|
|
197
|
+
" print(f\"Feature {i}: {importance:.4f}\")"
|
|
198
|
+
]
|
|
199
|
+
}
|
|
200
|
+
],
|
|
201
|
+
"metadata": {
|
|
202
|
+
"kernelspec": {
|
|
203
|
+
"display_name": ".venv",
|
|
204
|
+
"language": "python",
|
|
205
|
+
"name": "python3"
|
|
206
|
+
},
|
|
207
|
+
"language_info": {
|
|
208
|
+
"codemirror_mode": {
|
|
209
|
+
"name": "ipython",
|
|
210
|
+
"version": 3
|
|
211
|
+
},
|
|
212
|
+
"file_extension": ".py",
|
|
213
|
+
"mimetype": "text/x-python",
|
|
214
|
+
"name": "python",
|
|
215
|
+
"nbconvert_exporter": "python",
|
|
216
|
+
"pygments_lexer": "ipython3",
|
|
217
|
+
"version": "3.12.11"
|
|
218
|
+
}
|
|
219
|
+
},
|
|
220
|
+
"nbformat": 4,
|
|
221
|
+
"nbformat_minor": 2
|
|
222
|
+
}
|
lecrapaud/model_selection.py
CHANGED
|
@@ -567,40 +567,14 @@ class ModelEngine:
|
|
|
567
567
|
if not self.path:
|
|
568
568
|
raise ValueError("Path is not set, cannot load model")
|
|
569
569
|
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
# Search for files that contain '.best' or '.keras' in the name
|
|
573
|
-
best_files = list(target_dir.glob("*.best*")) + list(
|
|
574
|
-
target_dir.glob("*.keras*")
|
|
575
|
-
)
|
|
576
|
-
# If any files are found, try loading the first one (or process as needed)
|
|
577
|
-
if best_files:
|
|
578
|
-
file_path = best_files[
|
|
579
|
-
0
|
|
580
|
-
] # Assuming you want to open the first matching file
|
|
581
|
-
try:
|
|
582
|
-
# Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
|
|
583
|
-
self._model = joblib.load(file_path)
|
|
584
|
-
except (pickle.UnpicklingError, EOFError):
|
|
585
|
-
# If it's not a pickle file, try loading it as a Keras model
|
|
586
|
-
try:
|
|
587
|
-
# Attempt to load the file as a Keras model
|
|
588
|
-
self._model = keras.models.load_model(file_path)
|
|
589
|
-
except Exception as e:
|
|
590
|
-
raise FileNotFoundError(
|
|
591
|
-
f"Model could not be loaded from path: {file_path}: {e}"
|
|
592
|
-
)
|
|
593
|
-
else:
|
|
594
|
-
raise FileNotFoundError(
|
|
595
|
-
f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
|
|
596
|
-
)
|
|
570
|
+
self._model = load_model(self.path)
|
|
597
571
|
|
|
598
572
|
self.model_name = self._model.model_name
|
|
599
573
|
self.target_type = self._model.target_type
|
|
600
574
|
|
|
601
575
|
# Load threshold
|
|
602
576
|
self.threshold = (
|
|
603
|
-
joblib.load(f"{
|
|
577
|
+
joblib.load(f"{self.path}/thresholds.pkl")
|
|
604
578
|
if self.target_type == "classification"
|
|
605
579
|
else None
|
|
606
580
|
)
|
|
@@ -1326,12 +1300,29 @@ def get_log_dir(target_dir: str, model_name="test_model"):
|
|
|
1326
1300
|
return str(log_dir)
|
|
1327
1301
|
|
|
1328
1302
|
|
|
1329
|
-
def
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
"""
|
|
1333
|
-
|
|
1334
|
-
|
|
1303
|
+
def load_model(target_dir: str):
|
|
1304
|
+
target_dir = Path(target_dir)
|
|
1305
|
+
# Search for files that contain '.best' or '.keras' in the name
|
|
1306
|
+
best_files = list(target_dir.glob("*.best*")) + list(target_dir.glob("*.keras*"))
|
|
1307
|
+
# If any files are found, try loading the first one (or process as needed)
|
|
1308
|
+
if best_files:
|
|
1309
|
+
file_path = best_files[0] # Assuming you want to open the first matching file
|
|
1310
|
+
try:
|
|
1311
|
+
# Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
|
|
1312
|
+
return joblib.load(file_path)
|
|
1313
|
+
except (pickle.UnpicklingError, EOFError):
|
|
1314
|
+
# If it's not a pickle file, try loading it as a Keras model
|
|
1315
|
+
try:
|
|
1316
|
+
# Attempt to load the file as a Keras model
|
|
1317
|
+
return keras.models.load_model(file_path)
|
|
1318
|
+
except Exception as e:
|
|
1319
|
+
raise FileNotFoundError(
|
|
1320
|
+
f"Model could not be loaded from path: {file_path}: {e}"
|
|
1321
|
+
)
|
|
1322
|
+
else:
|
|
1323
|
+
raise FileNotFoundError(
|
|
1324
|
+
f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
|
|
1325
|
+
)
|
|
1335
1326
|
|
|
1336
1327
|
|
|
1337
1328
|
# plots
|
|
@@ -1629,37 +1620,6 @@ def plot_threshold(prediction, threshold, precision, recall):
|
|
|
1629
1620
|
|
|
1630
1621
|
|
|
1631
1622
|
# OLD - to sort out
|
|
1632
|
-
def get_pred_distribution(target_dir: str, model_name="linear"):
|
|
1633
|
-
"""
|
|
1634
|
-
Look at prediction distributions
|
|
1635
|
-
"""
|
|
1636
|
-
prediction = pd.read_csv(
|
|
1637
|
-
f"{target_dir}/{model_name}/prediction.csv",
|
|
1638
|
-
index_col="ID",
|
|
1639
|
-
)
|
|
1640
|
-
prediction.describe()
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
def plot_feature_importance(target_dir: str, model_name="linear"):
|
|
1644
|
-
"""
|
|
1645
|
-
Monitor feature importance ranking to filter out unrelevant features
|
|
1646
|
-
"""
|
|
1647
|
-
model = joblib.load(f"{target_dir}/{model_name}/{model_name}.best")
|
|
1648
|
-
if hasattr(model, "feature_importances_"):
|
|
1649
|
-
feature_importances_ = model.feature_importances_.flatten()
|
|
1650
|
-
elif hasattr(model, "feature_importance"):
|
|
1651
|
-
feature_importances_ = model.feature_importance.flatten()
|
|
1652
|
-
elif hasattr(model, "coefs_"):
|
|
1653
|
-
feature_importances_ = np.mean(model.coefs_[0], axis=1).flatten()
|
|
1654
|
-
elif hasattr(model, "coef_"):
|
|
1655
|
-
feature_importances_ = model.coef_.flatten()
|
|
1656
|
-
else:
|
|
1657
|
-
feature_importances_ = []
|
|
1658
|
-
|
|
1659
|
-
sns.barplot(
|
|
1660
|
-
data=feature_importances_,
|
|
1661
|
-
orient="h",
|
|
1662
|
-
)
|
|
1663
1623
|
|
|
1664
1624
|
|
|
1665
1625
|
def print_model_estimators(target_dir: str, model_name="linear"):
|
lecrapaud/utils.py
CHANGED
|
@@ -11,7 +11,7 @@ import re
|
|
|
11
11
|
import string
|
|
12
12
|
|
|
13
13
|
from lecrapaud.directories import logger_dir
|
|
14
|
-
from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
|
|
14
|
+
from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV, LECRAPAUD_LOCAL
|
|
15
15
|
|
|
16
16
|
_LECRAPAUD_LOGGER_ALREADY_CONFIGURED = False
|
|
17
17
|
|
|
@@ -24,7 +24,7 @@ def setup_logger():
|
|
|
24
24
|
return logging.getLogger("lecrapaud" if PYTHON_ENV != "Worker" else "")
|
|
25
25
|
|
|
26
26
|
print(
|
|
27
|
-
f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
|
|
27
|
+
f"Setting up logger lecrapaud with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
|
|
28
28
|
)
|
|
29
29
|
# ------------------------------------------------------------------ #
|
|
30
30
|
# Real configuration happens only on the FIRST call #
|
|
@@ -61,7 +61,10 @@ def setup_logger():
|
|
|
61
61
|
return logger
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
if LECRAPAUD_LOCAL:
|
|
65
|
+
logger = setup_logger()
|
|
66
|
+
else:
|
|
67
|
+
logger = logging.getLogger(__name__)
|
|
65
68
|
|
|
66
69
|
|
|
67
70
|
def get_df_name(obj, namespace):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: lecrapaud
|
|
3
|
-
Version: 0.10.2
|
|
3
|
+
Version: 0.11.1
|
|
4
4
|
Summary: Framework for machine and deep learning, with regression, classification and time series analysis
|
|
5
5
|
License: Apache License
|
|
6
6
|
Author: Pierre H. Gallet
|
|
@@ -8,43 +8,27 @@ Requires-Python: ==3.12.*
|
|
|
8
8
|
Classifier: License :: Other/Proprietary License
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
-
Requires-Dist: backoff (>=2.2.1)
|
|
12
11
|
Requires-Dist: category-encoders (>=2.8.1)
|
|
13
|
-
Requires-Dist: celery (>=5.5.
|
|
14
|
-
Requires-Dist: curl-cffi (>=0.11.1)
|
|
15
|
-
Requires-Dist: deep-translator (>=1.11.4)
|
|
16
|
-
Requires-Dist: degiro-connector (>=3.0.26)
|
|
17
|
-
Requires-Dist: fake-useragent (>=2.1.0)
|
|
12
|
+
Requires-Dist: celery (>=5.5.3)
|
|
18
13
|
Requires-Dist: ftfy (>=6.3.1)
|
|
19
|
-
Requires-Dist:
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist: keras (>=3.9.0)
|
|
22
|
-
Requires-Dist: keras-tcn (>=3.1.2)
|
|
14
|
+
Requires-Dist: joblib (>=1.5.1)
|
|
15
|
+
Requires-Dist: keras (>=3.10.0)
|
|
23
16
|
Requires-Dist: lightgbm (>=4.6.0)
|
|
24
|
-
Requires-Dist: matplotlib (>=3.10.
|
|
17
|
+
Requires-Dist: matplotlib (>=3.10.3)
|
|
25
18
|
Requires-Dist: mlxtend (>=0.23.4)
|
|
26
19
|
Requires-Dist: numpy (>=2.1.3)
|
|
27
|
-
Requires-Dist: openai (>=1.
|
|
28
|
-
Requires-Dist: pandas (>=2.
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Requires-Dist: playwright (>=1.52.0)
|
|
31
|
-
Requires-Dist: pydantic (>=2.10.6)
|
|
32
|
-
Requires-Dist: python-dotenv (>=1.0.1)
|
|
33
|
-
Requires-Dist: pytz (>=2025.1)
|
|
34
|
-
Requires-Dist: ratelimit (>=2.2.1)
|
|
20
|
+
Requires-Dist: openai (>=1.88.0)
|
|
21
|
+
Requires-Dist: pandas (>=2.3.0)
|
|
22
|
+
Requires-Dist: python-dotenv (>=1.1.0)
|
|
35
23
|
Requires-Dist: scikit-learn (>=1.6.1)
|
|
36
|
-
Requires-Dist: scipy (>=1.15.
|
|
24
|
+
Requires-Dist: scipy (>=1.15.3)
|
|
37
25
|
Requires-Dist: seaborn (>=0.13.2)
|
|
38
|
-
Requires-Dist:
|
|
39
|
-
Requires-Dist:
|
|
40
|
-
Requires-Dist: tensorboardx (>=2.6.2.2)
|
|
26
|
+
Requires-Dist: sqlalchemy (>=2.0.41)
|
|
27
|
+
Requires-Dist: tensorboardx (>=2.6.4)
|
|
41
28
|
Requires-Dist: tensorflow (>=2.19.0)
|
|
42
|
-
Requires-Dist: tf-keras (>=2.19.0)
|
|
43
29
|
Requires-Dist: tiktoken (>=0.9.0)
|
|
44
30
|
Requires-Dist: tqdm (>=4.67.1)
|
|
45
|
-
Requires-Dist: xgboost (>=3.0.
|
|
46
|
-
Requires-Dist: yahoo-fin (>=0.8.9.1)
|
|
47
|
-
Requires-Dist: yfinance (>=0.2.55)
|
|
31
|
+
Requires-Dist: xgboost (>=3.0.2)
|
|
48
32
|
Description-Content-Type: text/markdown
|
|
49
33
|
|
|
50
34
|
<div align="center">
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
|
|
2
|
-
lecrapaud/api.py,sha256=
|
|
3
|
-
lecrapaud/config.py,sha256=
|
|
2
|
+
lecrapaud/api.py,sha256=wrMc3TaP5qCzGvmN0QsYKxUt2ZPzK3z4nmnetQo23io,16645
|
|
3
|
+
lecrapaud/config.py,sha256=eYnrktVq457xMIMGcUSilJdNxCsaGP_gRAlzCSwd6Vo,1047
|
|
4
4
|
lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
|
|
5
5
|
lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
|
|
6
6
|
lecrapaud/db/alembic/env.py,sha256=rseEi8oR_eKXYYW3UwOKiCMuDEwT4lxsT7llySOUpgk,2305
|
|
@@ -29,16 +29,16 @@ lecrapaud/feature_selection.py,sha256=u3TWq3G5Xh3geQevGDOZEt_rl_m6-K_CR7SttFtpwK
|
|
|
29
29
|
lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
|
|
30
30
|
lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
|
|
31
31
|
lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
|
|
32
|
-
lecrapaud/jobs/scheduler.py,sha256=
|
|
33
|
-
lecrapaud/jobs/tasks.py,sha256=
|
|
34
|
-
lecrapaud/
|
|
32
|
+
lecrapaud/jobs/scheduler.py,sha256=OKXhb_gxE1-R7D1HyPns88iIS31Wd4gRqEzk4EqS0J4,774
|
|
33
|
+
lecrapaud/jobs/tasks.py,sha256=sbD2_IT45DE4yQQbR6DVb9xv5x06rYDtUvSK8exYxes,332
|
|
34
|
+
lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQTjqg,6676
|
|
35
|
+
lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
|
|
36
|
+
lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
|
|
37
|
+
lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
|
|
38
|
+
lecrapaud/model_selection.py,sha256=PQGEWVWN-4ZeHCqrmXBpHgq1QZi_1nOOeu5gazXGDLQ,60487
|
|
35
39
|
lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
|
|
36
|
-
lecrapaud/
|
|
37
|
-
lecrapaud
|
|
38
|
-
lecrapaud
|
|
39
|
-
lecrapaud/
|
|
40
|
-
lecrapaud/
|
|
41
|
-
lecrapaud-0.10.2.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
|
|
42
|
-
lecrapaud-0.10.2.dist-info/METADATA,sha256=WBdFSR8XKbLy8KhvJJWi48ylh7vP1nVKk-h9ga0OApU,11624
|
|
43
|
-
lecrapaud-0.10.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
44
|
-
lecrapaud-0.10.2.dist-info/RECORD,,
|
|
40
|
+
lecrapaud/utils.py,sha256=MUgDoJ31GOF8WRLn_WLzDbHw7OTKxq_ldnZT6dpxdQo,8295
|
|
41
|
+
lecrapaud-0.11.1.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
|
|
42
|
+
lecrapaud-0.11.1.dist-info/METADATA,sha256=YDEIQa4j_87wQqkW4SKzeomDWgUjKCFdEYn55nO41MI,11017
|
|
43
|
+
lecrapaud-0.11.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
44
|
+
lecrapaud-0.11.1.dist-info/RECORD,,
|
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"cells": [
|
|
3
|
-
{
|
|
4
|
-
"cell_type": "code",
|
|
5
|
-
"execution_count": 2,
|
|
6
|
-
"metadata": {},
|
|
7
|
-
"outputs": [],
|
|
8
|
-
"source": [
|
|
9
|
-
"# import autosklearn.classification\n",
|
|
10
|
-
"import sklearn.datasets\n",
|
|
11
|
-
"import sklearn.metrics\n",
|
|
12
|
-
"from pprint import pprint\n",
|
|
13
|
-
"from tabpfn import TabPFNClassifier\n",
|
|
14
|
-
"import numpy as np\n",
|
|
15
|
-
"from pathlib import Path\n",
|
|
16
|
-
"import pandas as pd\n",
|
|
17
|
-
"import time\n",
|
|
18
|
-
"from sklearn.metrics import accuracy_score\n",
|
|
19
|
-
"from sklearn.datasets import load_breast_cancer\n",
|
|
20
|
-
"from sklearn.model_selection import train_test_split"
|
|
21
|
-
]
|
|
22
|
-
},
|
|
23
|
-
{
|
|
24
|
-
"cell_type": "code",
|
|
25
|
-
"execution_count": null,
|
|
26
|
-
"metadata": {},
|
|
27
|
-
"outputs": [],
|
|
28
|
-
"source": [
|
|
29
|
-
"X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
|
|
30
|
-
"X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
|
|
31
|
-
" X, y, random_state=1\n",
|
|
32
|
-
")"
|
|
33
|
-
]
|
|
34
|
-
},
|
|
35
|
-
{
|
|
36
|
-
"cell_type": "code",
|
|
37
|
-
"execution_count": null,
|
|
38
|
-
"metadata": {},
|
|
39
|
-
"outputs": [],
|
|
40
|
-
"source": [
|
|
41
|
-
"automl = autosklearn.classification.AutoSklearnClassifier(\n",
|
|
42
|
-
" time_left_for_this_task=120,\n",
|
|
43
|
-
" per_run_time_limit=30,\n",
|
|
44
|
-
" tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
|
|
45
|
-
" include={\n",
|
|
46
|
-
" \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
|
|
47
|
-
" \"feature_preprocessor\": [\n",
|
|
48
|
-
" \"no_preprocessing\",\n",
|
|
49
|
-
" \"polynomial\",\n",
|
|
50
|
-
" \"select_percentile_classification\",\n",
|
|
51
|
-
" ],\n",
|
|
52
|
-
" },\n",
|
|
53
|
-
" ensemble_kwargs={\"ensemble_size\": 1},\n",
|
|
54
|
-
")\n",
|
|
55
|
-
"automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
|
|
56
|
-
]
|
|
57
|
-
},
|
|
58
|
-
{
|
|
59
|
-
"cell_type": "code",
|
|
60
|
-
"execution_count": null,
|
|
61
|
-
"metadata": {},
|
|
62
|
-
"outputs": [],
|
|
63
|
-
"source": [
|
|
64
|
-
"pprint(automl.show_models(), indent=4)"
|
|
65
|
-
]
|
|
66
|
-
},
|
|
67
|
-
{
|
|
68
|
-
"cell_type": "code",
|
|
69
|
-
"execution_count": null,
|
|
70
|
-
"metadata": {},
|
|
71
|
-
"outputs": [],
|
|
72
|
-
"source": [
|
|
73
|
-
"predictions = automl.predict(X_test)\n",
|
|
74
|
-
"print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
|
|
75
|
-
]
|
|
76
|
-
},
|
|
77
|
-
{
|
|
78
|
-
"cell_type": "code",
|
|
79
|
-
"execution_count": null,
|
|
80
|
-
"metadata": {},
|
|
81
|
-
"outputs": [],
|
|
82
|
-
"source": [
|
|
83
|
-
"# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
|
|
84
|
-
"# more ensemble members are slower, but more accurate\n",
|
|
85
|
-
"classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
|
|
86
|
-
]
|
|
87
|
-
},
|
|
88
|
-
{
|
|
89
|
-
"cell_type": "code",
|
|
90
|
-
"execution_count": null,
|
|
91
|
-
"metadata": {},
|
|
92
|
-
"outputs": [],
|
|
93
|
-
"source": [
|
|
94
|
-
"start = time.time()\n",
|
|
95
|
-
"classifier.fit(X_train, y_train)\n",
|
|
96
|
-
"y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
|
|
97
|
-
"print(\n",
|
|
98
|
-
" \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
|
|
99
|
-
")"
|
|
100
|
-
]
|
|
101
|
-
},
|
|
102
|
-
{
|
|
103
|
-
"cell_type": "code",
|
|
104
|
-
"execution_count": null,
|
|
105
|
-
"metadata": {},
|
|
106
|
-
"outputs": [],
|
|
107
|
-
"source": [
|
|
108
|
-
"# We also offer the `predict_proba` interface\n",
|
|
109
|
-
"classifier.predict_proba(X_test).shape"
|
|
110
|
-
]
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
"cell_type": "code",
|
|
114
|
-
"execution_count": null,
|
|
115
|
-
"metadata": {},
|
|
116
|
-
"outputs": [],
|
|
117
|
-
"source": [
|
|
118
|
-
"out_table = pd.DataFrame(X_test.copy().astype(str))\n",
|
|
119
|
-
"out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
|
|
120
|
-
"out_table"
|
|
121
|
-
]
|
|
122
|
-
}
|
|
123
|
-
],
|
|
124
|
-
"metadata": {
|
|
125
|
-
"kernelspec": {
|
|
126
|
-
"display_name": ".venv",
|
|
127
|
-
"language": "python",
|
|
128
|
-
"name": "python3"
|
|
129
|
-
},
|
|
130
|
-
"language_info": {
|
|
131
|
-
"codemirror_mode": {
|
|
132
|
-
"name": "ipython",
|
|
133
|
-
"version": 3
|
|
134
|
-
},
|
|
135
|
-
"file_extension": ".py",
|
|
136
|
-
"mimetype": "text/x-python",
|
|
137
|
-
"name": "python",
|
|
138
|
-
"nbconvert_exporter": "python",
|
|
139
|
-
"pygments_lexer": "ipython3",
|
|
140
|
-
"version": "3.12.8"
|
|
141
|
-
}
|
|
142
|
-
},
|
|
143
|
-
"nbformat": 4,
|
|
144
|
-
"nbformat_minor": 2
|
|
145
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|