lecrapaud-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +271 -0
- lecrapaud/config.py +25 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +181 -0
- lecrapaud/db/models/dataset.py +129 -0
- lecrapaud/db/models/feature.py +45 -0
- lecrapaud/db/models/feature_selection.py +125 -0
- lecrapaud/db/models/feature_selection_rank.py +79 -0
- lecrapaud/db/models/model.py +40 -0
- lecrapaud/db/models/model_selection.py +63 -0
- lecrapaud/db/models/model_training.py +62 -0
- lecrapaud/db/models/score.py +65 -0
- lecrapaud/db/models/target.py +67 -0
- lecrapaud/db/session.py +45 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +846 -0
- lecrapaud/feature_selection.py +1167 -0
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1671 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/preprocessing.py +984 -0
- lecrapaud/search_space.py +848 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +239 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.1.0.dist-info/LICENSE +201 -0
- lecrapaud-0.1.0.dist-info/METADATA +105 -0
- lecrapaud-0.1.0.dist-info/RECORD +63 -0
- lecrapaud-0.1.0.dist-info/WHEEL +4 -0

lecrapaud/speed_tests/tests.ipynb
ADDED

@@ -0,0 +1,145 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import autosklearn.classification\n",
+    "import sklearn.datasets\n",
+    "import sklearn.metrics\n",
+    "from pprint import pprint\n",
+    "from tabpfn import TabPFNClassifier\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.datasets import load_breast_cancer\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
+    "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
+    "    X, y, random_state=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "automl = autosklearn.classification.AutoSklearnClassifier(\n",
+    "    time_left_for_this_task=120,\n",
+    "    per_run_time_limit=30,\n",
+    "    tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
+    "    include={\n",
+    "        \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
+    "        \"feature_preprocessor\": [\n",
+    "            \"no_preprocessing\",\n",
+    "            \"polynomial\",\n",
+    "            \"select_percentile_classification\",\n",
+    "        ],\n",
+    "    },\n",
+    "    ensemble_kwargs={\"ensemble_size\": 1},\n",
+    ")\n",
+    "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint(automl.show_models(), indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = automl.predict(X_test)\n",
+    "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
+    "# more ensemble members are slower, but more accurate\n",
+    "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "classifier.fit(X_train, y_train)\n",
+    "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
+    "print(\n",
+    "    \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We also offer the `predict_proba` interface\n",
+    "classifier.predict_proba(X_test).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
+    "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
+    "out_table"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
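
Note that the auto-sklearn cells in this notebook rely on the `import autosklearn.classification` line that is commented out in the first cell, so only the TabPFN cells run as shipped. For reference, those TabPFN cells condense to the standalone sketch below; it assumes the tabpfn 0.1.x API the notebook uses (`N_ensemble_configurations`, `return_winning_probability`) and falls back to CPU when no GPU is present, which is an addition over the notebook's hard-coded `device="cuda"`.

# Minimal sketch of the TabPFN flow exercised by tests.ipynb (assumes tabpfn 0.1.x).
import time

import torch
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Use the GPU when available, otherwise CPU (the notebook assumes CUDA).
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = TabPFNClassifier(device=device, N_ensemble_configurations=4)

start = time.time()
classifier.fit(X_train, y_train)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
print("Prediction time:", time.time() - start, "Accuracy:", accuracy_score(y_test, y_eval))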

lecrapaud/speed_tests/trash.py
ADDED

@@ -0,0 +1,37 @@
+# def _get_weekly_return(y_true, y_pred):
+#     df = pd.concat([y_true, y_pred, stock_data[['YEARWEEK', 'STOCK', 'TARGET_1']]], join='inner', axis=1)
+#     df['PRED'] += 1
+#     df['TARGET'] += 1
+#     return df[['YEARWEEK', 'STOCK', 'PRED', 'TARGET']].groupby(['YEARWEEK', 'STOCK']).prod().reset_index()
+
+# def _calc_spread_return_per_week(df, portfolio_size):
+#     return (df.sort_values('PRED', ascending=False)['TARGET_1'][:portfolio_size] - 1).mean()
+
+# def sharpe_ratio_weekly(y_true, y_pred, portfolio_size:int=10):
+#     df = _get_weekly_return(y_true, y_pred)
+#     buf = df.groupby('YEARWEEK').apply(_calc_spread_return_per_week, portfolio_size)
+#     sharpe_ratio = (buf.mean() * 52) / (buf.std() * np.sqrt(52))
+#     buf += 1
+#     cumulated_roi = buf.prod() - 1
+#     cagr = buf.prod() ** (1 / (buf.shape[0]/52) ) - 1
+#     return sharpe_ratio, cumulated_roi, cagr
+
+
+def sharpe_ratio_daily(y_true, y_pred, portfolio_size: int = 10):
+    df = pd.concat(
+        [y_true, y_pred, stock_data[["DATE", "TARGET_1"]]], join="inner", axis=1
+    )
+
+    def _calc_spread_return_per_day(df: pd.DataFrame, portfolio_size: int):
+        # print(df.sort_values('PRED', ascending=False)[['PRED', 'TARGET', 'TARGET_1']].head(10))
+        return (
+            df.sort_values("PRED", ascending=False)["TARGET_1"].iloc[:portfolio_size]
+        ).mean()
+
+    buf = df.groupby("DATE").apply(_calc_spread_return_per_day, portfolio_size)
+
+    sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
+    buf += 1
+    cumulated_roi = buf.prod() - 1
+    cagr = buf.prod() ** (1 / (buf.shape[0] / 252)) - 1
+    return sharpe_ratio, cumulated_roi, cagr
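
As its name suggests, this file is scratch code: `sharpe_ratio_daily` reads a module-level `stock_data` frame and uses `pd`/`np` without importing them. A self-contained sketch of the same annualisation arithmetic (252 trading days), assuming a single input frame that already carries the `DATE`, `PRED`, and `TARGET_1` columns named above, would be:

import numpy as np
import pandas as pd


def sharpe_ratio_daily_standalone(df: pd.DataFrame, portfolio_size: int = 10):
    """Same arithmetic as sharpe_ratio_daily, without the module-level stock_data.

    Expects one row per (DATE, asset) with the model score in PRED and the
    realised daily return in TARGET_1.
    """

    def _spread_return_per_day(day: pd.DataFrame) -> float:
        # Average realised return of the top-`portfolio_size` predictions that day.
        return day.sort_values("PRED", ascending=False)["TARGET_1"].iloc[:portfolio_size].mean()

    daily = df.groupby("DATE").apply(_spread_return_per_day)

    # Annualise with 252 trading days, as in the original.
    sharpe_ratio = (daily.mean() * 252) / (daily.std() * np.sqrt(252))
    growth = daily + 1
    cumulated_roi = growth.prod() - 1
    cagr = growth.prod() ** (1 / (len(growth) / 252)) - 1
    return sharpe_ratio, cumulated_roi, cagr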
lecrapaud/training.py
ADDED

@@ -0,0 +1,239 @@
+import logging
+import joblib
+from pathlib import Path
+import os
+
+from lecrapaud.experiment import create_dataset
+from lecrapaud.feature_engineering import (
+    feature_engineering,
+    encode_categorical_features,
+    add_pca_features,
+    summarize_dataframe,
+)
+from lecrapaud.feature_selection import (
+    feature_selection,
+    train_val_test_split,
+    train_val_test_split_time_series,
+    scale_data,
+    reshape_time_series,
+)
+from lecrapaud.model_selection import model_selection
+from lecrapaud.search_space import all_models
+from lecrapaud.directory_management import tmp_dir
+from lecrapaud.db import Dataset
+from lecrapaud.utils import logger
+from lecrapaud.config import PYTHON_ENV
+
+
+# Parameters
+columns_date = ["DATE"]
+columns_te_groupby = [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
+columns_te_target = ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"] + [
+    f"{ind}_{p}"
+    for p in [9, 14, 21, 50]
+    for ind in [
+        "CUMUL_RET",
+        "SMA",
+        "EMA",
+        "VOLATILITY",
+        "ATR",
+        "ADX",
+        "%K",
+        "RSI",
+        "MFI",
+    ]
+]
+target_clf = [2, 4, 6, 8, 9, 10, 11]
+column_ordinal = ["STOCK"]
+column_binary = ["SECTOR", "SUBINDUSTRY", "LOCATION"]
+columns_pca = []
+target_numbers = range(1, 15)
+date_column = "DATE"
+group_column = "STOCK"
+
+
+def run_training(
+    get_data_function: function,
+    get_data_params: dict = None,
+    time_series: bool = False,
+    dataset_id=None,
+    years_of_data=2,
+    list_of_groups=None,
+    percentile=15,
+    corr_threshold=80,
+    max_features=20,
+    max_timesteps=120,
+    targets_numbers=range(1, 15),
+    models_idx=range(len(all_models)),
+    number_of_trials=20,
+    perform_hyperoptimization=True,
+    perform_crossval=False,
+    clean_dir=False,
+    preserve_model=False,
+    session_name="test",
+):
+    logging.captureWarnings(True)
+
+    if any(all_models[i].get("recurrent") for i in models_idx) and not time_series:
+        ValueError(
+            "You need to set time_series to true to use recurrent model, or remove recurrent models from models_idx chosen"
+        )
+
+    if dataset_id is None:
+        # Get the data
+        logger.info("Getting data...")
+        data = get_data_function(**get_data_params)
+
+        # # preprocess & feature engineering => Should be in get_data_function
+        # logger.info("Preprocessing...")
+        # preprocessed_data = preprocessing(data, for_training=True, save_as_csv=True)
+
+        logger.info(f"Feature engineering for {session_name}...")
+        data_for_training = feature_engineering(
+            data,
+            columns_date=columns_date,
+            columns_te_groupby=columns_te_groupby,
+            columns_te_target=columns_te_target,
+        )
+
+        # Split
+        if time_series:
+            train, val, test = train_val_test_split_time_series(data_for_training)
+        else:
+            train, val, test = train_val_test_split(
+                data, stratify_col=f"target_{target_numbers[0]}"
+            )  # TODO: only stratifying first target for now
+
+        # Create Dataset / Experiment (TODO: should be defined sooner)
+        dataset = create_dataset(
+            train, val, test, corr_threshold, percentile, max_features
+        )
+        dataset_dir = dataset.path
+        dataset_id = dataset.id
+        data_dir = f"{dataset_dir}/data"
+        preprocessing_dir = f"{dataset_dir}/preprocessing"
+        os.makedirs(data_dir, exist_ok=True)
+        os.makedirs(preprocessing_dir, exist_ok=True)
+
+        # PCA
+        train, pcas = add_pca_features(train, columns_pca)
+        val, _ = add_pca_features(val, columns_pca, pcas=pcas)
+        test, _ = add_pca_features(test, columns_pca, pcas=pcas)
+
+        if PYTHON_ENV != "Test":
+            joblib.dump(pcas, f"{preprocessing_dir}/pca.pkl")
+
+        # Encoding
+        train, transformer = encode_categorical_features(
+            train, column_ordinal=column_ordinal, column_binary=column_binary
+        )
+        val, _ = encode_categorical_features(
+            val,
+            column_ordinal=column_ordinal,
+            column_binary=column_binary,
+            transformer=transformer,
+        )
+        test, _ = encode_categorical_features(
+            test,
+            column_ordinal=column_ordinal,
+            column_binary=column_binary,
+            transformer=transformer,
+        )
+
+        if PYTHON_ENV != "Test":
+            joblib.dump(data_for_training, f"{data_dir}/full.pkl")
+            joblib.dump(transformer, f"{preprocessing_dir}/column_transformer.pkl")
+            summary = summarize_dataframe(train)
+            summary.to_csv(f"{dataset_dir}/feature_summary.csv", index=False)
+
+        # feature selection
+        logger.info("Feature Selection...")
+        for target_number in targets_numbers:
+            feature_selection(
+                dataset_id=dataset_id,
+                train=train,
+                target_number=target_number,
+                single_process=True,
+            )
+
+        dataset = Dataset.get(dataset_id)
+        all_features = dataset.get_all_features()
+        columns_to_keep = all_features + [f"TARGET_{i}" for i in target_numbers]
+
+        duplicates = [
+            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+        ]
+        if duplicates:
+            raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
+
+        train = train[columns_to_keep]
+        val = val[columns_to_keep]
+        test = test[columns_to_keep]
+
+        # save data
+        if PYTHON_ENV != "Test":
+            joblib.dump(train[columns_to_keep], f"{data_dir}/train.pkl")
+            joblib.dump(val[columns_to_keep], f"{data_dir}/val.pkl")
+            joblib.dump(test[columns_to_keep], f"{data_dir}/test.pkl")
+
+        # scaling features
+        if any(t not in target_clf for t in target_numbers) and any(
+            all_models[i].get("need_scaling") for i in models_idx
+        ):
+            logger.info("Scaling features...")
+            train_scaled, scaler_x, scalers_y = scale_data(
+                train, save_dir=preprocessing_dir
+            )
+            val_scaled, _, _ = scale_data(
+                val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+            )
+            test_scaled, _, _ = scale_data(
+                test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+            )
+        else:
+            train_scaled = None
+            val_scaled = None
+            test_scaled = None
+
+        # save data
+        if PYTHON_ENV != "Test":
+            joblib.dump(train_scaled, f"{data_dir}/train_scaled.pkl")
+            joblib.dump(val_scaled, f"{data_dir}/val_scaled.pkl")
+            joblib.dump(test_scaled, f"{data_dir}/test_scaled.pkl")
+
+        data = {
+            "train": train,
+            "val": val,
+            "test": test,
+            "train_scaled": train_scaled,
+            "val_scaled": val_scaled,
+            "test_scaled": test_scaled,
+            "scalers_y": scalers_y,
+        }
+
+    # reshape data for time series
+    reshaped_data = None
+    if any(all_models[i].get("recurrent") for i in models_idx) and time_series:
+        # reshaping data for recurrent models
+        logger.info("Reshaping data for recurrent models...")
+        reshaped_data = reshape_time_series(
+            train_scaled, val_scaled, test_scaled, all_features, timesteps=max_timesteps
+        )
+
+    # model selection and hyperoptimization
+    logger.info("Model Selection and Hyperoptimization...")
+    for target_number in target_numbers:
+        model_selection(
+            dataset_id=dataset_id,
+            models_idx=models_idx,
+            target_number=target_number,
+            session_name=session_name,
+            perform_hyperoptimization=perform_hyperoptimization,
+            perform_crossval=perform_crossval,
+            number_of_trials=number_of_trials,
+            plot=False,
+            clean_dir=clean_dir,
+            preserve_model=preserve_model,
+            reshaped_data=reshaped_data,
+            data=(data or None),
+        )
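
For orientation, a hedged usage sketch of the pipeline above. The loader name, file path, and column layout are hypothetical, not part of the package; the only constraint the code above imposes is that the frame feeds `feature_engineering` and the module-level column lists (DATE, STOCK, SECTOR, and so on). Also note that the `function` annotation on `get_data_function` is not a Python builtin, so the sketch simply passes a plain callable.

# Hypothetical driver for lecrapaud.training.run_training
# (the CSV path and its columns are illustrative, not documented by the package).
import pandas as pd

from lecrapaud.training import run_training


def load_my_frame(path: str) -> pd.DataFrame:
    # Expected to return the raw frame that feature_engineering and the
    # TARGET_* columns are built from; the exact contract is not shown in this diff.
    return pd.read_csv(path, parse_dates=["DATE"])


run_training(
    get_data_function=load_my_frame,
    get_data_params={"path": "data/stocks.csv"},
    time_series=True,
    targets_numbers=range(1, 3),  # feature selection restricted to the first two targets
    models_idx=[0, 1],            # first two entries of lecrapaud.search_space.all_models
    number_of_trials=5,
    perform_hyperoptimization=True,
    session_name="demo",
)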
lecrapaud/utils.py
ADDED

@@ -0,0 +1,246 @@
+import pandas as pd
+import logging
+from logging.handlers import RotatingFileHandler
+import shutil
+import os
+import subprocess
+from datetime import datetime, date
+from ftfy import fix_text
+import unicodedata
+import re
+import string
+
+from lecrapaud.directory_management import logger_dir
+from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
+
+_LOGGER_ALREADY_CONFIGURED = False
+
+
+def setup_logger():
+
+    global _LOGGER_ALREADY_CONFIGURED
+    if _LOGGER_ALREADY_CONFIGURED:  # ← bail out if done before
+
+        return logging.getLogger("stock" if PYTHON_ENV != "Worker" else "")
+
+    print(
+        f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
+    )
+    # ------------------------------------------------------------------ #
+    # Real configuration happens only on the FIRST call                   #
+    # ------------------------------------------------------------------ #
+    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s"
+    datefmt = "%Y-%m-%d %H:%M:%S"
+    logging.basicConfig(format=fmt, datefmt=datefmt)  # root format
+    formatter = logging.Formatter(fmt, datefmt=datefmt)
+
+    logger = logging.getLogger("" if PYTHON_ENV == "Worker" else "stock")
+
+    log_level = getattr(logging, LOGGING_LEVEL.upper(), logging.INFO)
+    logger.setLevel(log_level)
+
+    # pick a file according to environment
+    env_file = {
+        "Development": "dev.log",
+        "Production": "prod.log",
+        "Test": "test.log",
+        "Worker": "worker.log",
+    }.get(PYTHON_ENV, "app.log")
+
+    file_handler = RotatingFileHandler(
+        f"{logger_dir}/{env_file}",
+        maxBytes=5 * 1024 * 1024,
+        backupCount=3,
+    )
+    file_handler.setFormatter(formatter)
+    file_handler.setLevel(log_level)
+    logger.addHandler(file_handler)
+
+    _LOGGER_ALREADY_CONFIGURED = True
+    return logger
+
+
+logger = setup_logger()
+
+
+def get_df_name(obj, namespace):
+    return [name for name in namespace if namespace[name] is obj][0]
+
+
+def pprint(item):
+    with pd.option_context("display.max_rows", None):
+        logger.info(item)
+
+
+def object_to_dict(obj):
+    if isinstance(obj, dict):
+        return {k: object_to_dict(v) for k, v in obj.items()}
+    elif hasattr(obj, "__dict__"):
+        return {k: object_to_dict(v) for k, v in obj.__dict__.items()}
+    elif isinstance(obj, list):
+        return [object_to_dict(i) for i in obj]
+    else:
+        return obj
+
+
+def copy_any(src, dst):
+    if os.path.isdir(src):
+        # Copy folder using copytree
+        shutil.copytree(src, dst)
+    else:
+        # Copy file using copy2 (which preserves metadata)
+        shutil.copy2(src, dst)
+
+
+def contains_best(folder_path):
+    # Iterate over all files and folders in the specified directory
+    for root, dirs, files in os.walk(folder_path):
+        # Check each file and folder name for '.best' or '.keras'
+        for name in files + dirs:
+            if ".best" in name or ".keras" in name:
+                return True
+    return False
+
+
+def get_folder_sizes(directory=os.path.expanduser("~")):
+    folder_sizes = {}
+
+    for folder in os.listdir(directory):
+        folder_path = os.path.join(directory, folder)
+        if os.path.isdir(folder_path):
+            try:
+                size = (
+                    subprocess.check_output(["du", "-sk", folder_path])
+                    .split()[0]
+                    .decode("utf-8")
+                )
+                folder_sizes[folder] = int(size)
+            except subprocess.CalledProcessError:
+                logger.info(f"Skipping {folder_path}: Permission Denied")
+
+    sorted_folders = sorted(folder_sizes.items(), key=lambda x: x[1], reverse=True)
+    logger.info(f"{'Folder':<50}{'Size (MB)':>10}")
+    logger.info("=" * 60)
+    for folder, size in sorted_folders:
+        logger.info(f"{folder:<50}{size / (1024*1024):>10.2f}")
+
+
+def create_cron_job(
+    script_path,
+    venv_path,
+    log_file,
+    pythonpath,
+    cwd,
+    job_frequency="* * * * *",
+    cron_name="My Custom Cron Job",
+):
+    """
+    Creates a cron job to run a Python script with a virtual environment, logging output, and setting PYTHONPATH and CWD.
+
+    Parameters:
+    - script_path (str): Path to the Python script to run.
+    - venv_path (str): Path to the virtual environment's Python interpreter.
+    - log_file (str): Path to the log file for output.
+    - pythonpath (str): Value for the PYTHONPATH environment variable.
+    - cwd (str): Working directory from which the script should run.
+    - job_frequency (str): Cron timing syntax (default is every minute).
+    - cron_name (str): Name to identify the cron job.
+    """
+    # Construct the cron command
+    cron_command = (
+        f"{job_frequency} /bin/zsh -c 'pgrep -fl python | grep -q {os.path.basename(script_path)} "
+        f'|| (echo -e "Cron job {cron_name} started at $(date)" >> {log_file} && cd {cwd} && '
+        f"PYTHONPATH={pythonpath} {venv_path}/bin/python {script_path} >> {log_file} 2>&1)'"
+    )
+
+    # Check existing cron jobs and remove any with the same comment
+    subprocess.run(f"(crontab -l | grep -v '{cron_name}') | crontab -", shell=True)
+
+    # Add the new cron job with the comment
+    full_cron_job = f"{cron_command} # {cron_name}\n"
+    subprocess.run(f'(crontab -l; echo "{full_cron_job}") | crontab -', shell=True)
+    logger.info(f"Cron job created: {full_cron_job}")
+
+
+def remove_all_cron_jobs():
+    """
+    Removes all cron jobs for the current user.
+    """
+    try:
+        # Clear the user's crontab
+        subprocess.run("crontab -r", shell=True, check=True)
+        logger.info("All cron jobs have been removed successfully.")
+    except subprocess.CalledProcessError:
+        logger.info(
+            "Failed to remove cron jobs. There may not be any cron jobs to remove, or there could be a permissions issue."
+        )
+
+
+def serialize_timestamp(dict: dict):
+    def convert(obj):
+        if isinstance(obj, (datetime, date, pd.Timestamp)):
+            return obj.isoformat()
+
+        return obj
+
+    return [{k: convert(v) for k, v in item.items()} for item in dict]
+
+
+def remove_accents(text: str) -> str:
+    """
+    Cleans the text of:
+    - Broken Unicode
+    - Accents
+    - Control characters (including \x00, \u0000, etc.)
+    - Escape sequences
+    - Non-printable characters
+    - Excessive punctuation (like ........ or !!!!)
+    """
+
+    # Step 1: Fix mojibake and broken Unicode
+    text = fix_text(text)
+
+    # Step 2 bis: Normalize accents
+    text = unicodedata.normalize("NFKD", text)
+    text = text.encode("ASCII", "ignore").decode("utf8")
+
+    # Step 3: Remove known weird tokens
+    text = text.replace("<|endoftext|>", "")
+    text = text.replace("\u0000", "").replace("\x00", "")
+
+    # Step 4: Remove raw control characters (e.g., \x1f)
+    text = "".join(c for c in text if unicodedata.category(c)[0] != "C" or c == "\n")
+
+    # Step 5: Remove literal escape sequences like \xNN
+    text = re.sub(r"\\x[0-9a-fA-F]{2}", "", text)
+
+    # Step 6: Remove non-printable characters
+    printable = set(string.printable)
+    text = "".join(c for c in text if c in printable)
+
+    # Step 7: Collapse repeated punctuation (e.g., ........ → .)
+    text = re.sub(r"([!?.])\1{2,}", r"\1", text)  # !!!!!! → !
+    text = re.sub(r"([-—])\1{1,}", r"\1", text)  # ------ → -
+    text = re.sub(r"([,.]){4,}", r"\1", text)  # ...... → .
+
+    return text.strip()
+
+
+def serialize_for_json(obj):
+    """
+    Recursively convert any object into a JSON-serializable structure.
+    Classes and class instances are converted to readable strings like 'ClassName()'.
+    """
+    if isinstance(obj, (str, int, float, bool, type(None))):
+        return obj
+    elif isinstance(obj, dict):
+        return {str(k): serialize_for_json(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple, set)):
+        return [serialize_for_json(v) for v in obj]
+    elif isinstance(obj, type):
+        # A class/type object like int, str, etc.
+        return obj.__name__
+    elif hasattr(obj, "__class__"):
+        return f"{obj.__class__.__name__}()"
+    else:
+        return str(obj)
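
A short, hedged example of the text and serialization helpers above; the input values are made up, and the expected outputs follow from the code as published.

from lecrapaud.utils import remove_accents, serialize_for_json

# remove_accents: strips accents and collapses repeated punctuation.
print(remove_accents("Résumé!!!"))  # -> "Resume!"

# serialize_for_json: types become their names, instances become "ClassName()",
# tuples/sets become lists.
print(serialize_for_json({"model": int, "scaler": object(), "params": (1, 2.5, None)}))
# -> {'model': 'int', 'scaler': 'object()', 'params': [1, 2.5, None]}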