lecrapaud 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/__init__.py +0 -0
- lecrapaud/config.py +16 -0
- lecrapaud/db/__init__.py +0 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/crud.py +179 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +6 -0
- lecrapaud/db/models/dataset.py +124 -0
- lecrapaud/db/models/feature.py +46 -0
- lecrapaud/db/models/feature_selection.py +126 -0
- lecrapaud/db/models/feature_selection_rank.py +80 -0
- lecrapaud/db/models/model.py +41 -0
- lecrapaud/db/models/model_selection.py +56 -0
- lecrapaud/db/models/model_training.py +54 -0
- lecrapaud/db/models/score.py +62 -0
- lecrapaud/db/models/target.py +59 -0
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +58 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/feature_engineering.py +1119 -0
- lecrapaud/feature_selection.py +1229 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1571 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/search_space.py +844 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +151 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.4.0.dist-info/LICENSE +201 -0
- lecrapaud-0.4.0.dist-info/METADATA +103 -0
- lecrapaud-0.4.0.dist-info/RECORD +60 -0
- lecrapaud-0.4.0.dist-info/WHEEL +4 -0
lecrapaud/speed_tests/tests.ipynb
ADDED
@@ -0,0 +1,145 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import autosklearn.classification\n",
+    "import sklearn.datasets\n",
+    "import sklearn.metrics\n",
+    "from pprint import pprint\n",
+    "from tabpfn import TabPFNClassifier\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.datasets import load_breast_cancer\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
+    "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
+    "    X, y, random_state=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "automl = autosklearn.classification.AutoSklearnClassifier(\n",
+    "    time_left_for_this_task=120,\n",
+    "    per_run_time_limit=30,\n",
+    "    tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
+    "    include={\n",
+    "        \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
+    "        \"feature_preprocessor\": [\n",
+    "            \"no_preprocessing\",\n",
+    "            \"polynomial\",\n",
+    "            \"select_percentile_classification\",\n",
+    "        ],\n",
+    "    },\n",
+    "    ensemble_kwargs={\"ensemble_size\": 1},\n",
+    ")\n",
+    "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint(automl.show_models(), indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = automl.predict(X_test)\n",
+    "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
+    "# more ensemble members are slower, but more accurate\n",
+    "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "classifier.fit(X_train, y_train)\n",
+    "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
+    "print(\n",
+    "    \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We also offer the `predict_proba` interface\n",
+    "classifier.predict_proba(X_test).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
+    "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
+    "out_table"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
lecrapaud/speed_tests/trash.py
ADDED
@@ -0,0 +1,37 @@
+# def _get_weekly_return(y_true, y_pred):
+#     df = pd.concat([y_true, y_pred, stock_data[['YEARWEEK', 'STOCK', 'TARGET_1']]], join='inner', axis=1)
+#     df['PRED'] += 1
+#     df['TARGET'] += 1
+#     return df[['YEARWEEK', 'STOCK', 'PRED', 'TARGET']].groupby(['YEARWEEK', 'STOCK']).prod().reset_index()
+
+# def _calc_spread_return_per_week(df, portfolio_size):
+#     return (df.sort_values('PRED', ascending=False)['TARGET_1'][:portfolio_size] - 1).mean()
+
+# def sharpe_ratio_weekly(y_true, y_pred, portfolio_size:int=10):
+#     df = _get_weekly_return(y_true, y_pred)
+#     buf = df.groupby('YEARWEEK').apply(_calc_spread_return_per_week, portfolio_size)
+#     sharpe_ratio = (buf.mean() * 52) / (buf.std() * np.sqrt(52))
+#     buf += 1
+#     cumulated_roi = buf.prod() - 1
+#     cagr = buf.prod() ** (1 / (buf.shape[0]/52) ) - 1
+#     return sharpe_ratio, cumulated_roi, cagr
+
+
+def sharpe_ratio_daily(y_true, y_pred, portfolio_size: int = 10):
+    df = pd.concat(
+        [y_true, y_pred, stock_data[["DATE", "TARGET_1"]]], join="inner", axis=1
+    )
+
+    def _calc_spread_return_per_day(df: pd.DataFrame, portfolio_size: int):
+        # print(df.sort_values('PRED', ascending=False)[['PRED', 'TARGET', 'TARGET_1']].head(10))
+        return (
+            df.sort_values("PRED", ascending=False)["TARGET_1"].iloc[:portfolio_size]
+        ).mean()
+
+    buf = df.groupby("DATE").apply(_calc_spread_return_per_day, portfolio_size)
+
+    sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
+    buf += 1
+    cumulated_roi = buf.prod() - 1
+    cagr = buf.prod() ** (1 / (buf.shape[0] / 252)) - 1
+    return sharpe_ratio, cumulated_roi, cagr
lecrapaud/training.py
ADDED
@@ -0,0 +1,151 @@
+import logging
+import joblib
+from pathlib import Path
+import os
+from src.utils import logger
+
+from src.feature_engineering import feature_engineering
+from src.feature_selection import (
+    create_sets_from_data,
+    feature_selection,
+    scale_data,
+    reshape_time_series,
+)
+from src.model_selection import model_selection, test_hardware
+from src.data_sourcing import get_filtered_data
+from src.constants import stock_list_3, stock_list_1
+from src.search_space import ml_models, dl_recurrent_models
+from src.directory_management import tmp_dir
+from src.db.models import Dataset
+from src.config import PYTHON_ENV
+
+
+def run_training(
+    dataset_id=None,
+    years_of_data=2,
+    list_of_groups=stock_list_1,
+    percentile=15,
+    corr_threshold=80,
+    max_features=20,
+    max_timesteps=120,
+    targets_numbers=range(1, 15),
+    models_idx=range(len(ml_models)),
+    number_of_trials=20,
+    perform_hyperoptimization=True,
+    perform_crossval=False,
+    clean_dir=False,
+    preserve_model=False,
+    session_name="test",
+):
+    logging.captureWarnings(True)
+
+    if dataset_id is None:
+        # Get the data
+        logger.info("Getting data...")
+        data = get_filtered_data(
+            years_of_data=years_of_data,
+            list_of_groups=list_of_groups,
+        )
+
+        # preprocess & feature engineering
+        logger.info("Preprocessing...")
+        data_for_training = feature_engineering(
+            data, for_training=True, save_as_csv=True
+        )
+
+        # train / val / test sets
+        train, val, test, dataset = create_sets_from_data(
+            data_for_training,
+            percentile=percentile,
+            corr_threshold=corr_threshold,
+            max_features=max_features,
+        )
+        dataset_dir = dataset.path
+        dataset_id = dataset.id
+        train_data_dir = f"{dataset_dir}/data"
+        os.makedirs(train_data_dir, exist_ok=True)
+        preprocessing_dir = f"{dataset_dir}/preprocessing"
+
+        # feature selection
+        logger.info("Feature Selection...")
+        for target_number in targets_numbers:
+            feature_selection(
+                dataset_id=dataset_id,
+                train=train,
+                target_number=target_number,
+                single_process=True,
+            )
+
+        dataset = Dataset.get(dataset_id)
+        all_features = dataset.get_all_features()
+        columns_to_keep = all_features + [f"TARGET_{i}" for i in range(1, 15)]
+        logger.info(columns_to_keep)
+        duplicates = [
+            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+        ]
+
+        if duplicates:
+            raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
+
+        train = train[columns_to_keep]
+        val = val[columns_to_keep]
+        test = test[columns_to_keep]
+
+        if PYTHON_ENV != "Test":
+            joblib.dump(train[columns_to_keep], f"{train_data_dir}/train.pkl")
+            joblib.dump(val[columns_to_keep], f"{train_data_dir}/val.pkl")
+            joblib.dump(test[columns_to_keep], f"{train_data_dir}/test.pkl")
+
+        # scaling features
+        logger.info("Scaling features...")
+        train_scaled, scaler_x, scalers_y = scale_data(
+            train, save_dir=preprocessing_dir
+        )
+        val_scaled, _, _ = scale_data(
+            val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+        )
+        test_scaled, _, _ = scale_data(
+            test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+        )
+
+        if PYTHON_ENV != "Test":
+            joblib.dump(train_scaled, f"{train_data_dir}/train_scaled.pkl")
+            joblib.dump(val_scaled, f"{train_data_dir}/val_scaled.pkl")
+            joblib.dump(test_scaled, f"{train_data_dir}/test_scaled.pkl")
+
+        data = {
+            "train": train,
+            "val": val,
+            "test": test,
+            "train_scaled": train_scaled,
+            "val_scaled": val_scaled,
+            "test_scaled": test_scaled,
+            "scalers_y": scalers_y,
+        }
+
+    list_models = ml_models + dl_recurrent_models
+    reshaped_data = None
+    if any(list_models[i].get("recurrent") for i in models_idx):
+        # reshaping data for recurrent models
+        logger.info("Reshaping data for recurrent models...")
+        reshaped_data = reshape_time_series(
+            train_scaled, val_scaled, test_scaled, all_features, timesteps=max_timesteps
+        )
+
+    # model selection and hyperoptimization
+    logger.info("Model Selection and Hyperoptimization...")
+    for target_number in targets_numbers:
+        model_selection(
+            dataset_id=dataset_id,
+            models_idx=models_idx,
+            target_number=target_number,
+            session_name=session_name,
+            perform_hyperoptimization=perform_hyperoptimization,
+            perform_crossval=perform_crossval,
+            number_of_trials=number_of_trials,
+            plot=False,
+            clean_dir=clean_dir,
+            preserve_model=preserve_model,
+            reshaped_data=reshaped_data,
+            data=(data or None),
+        )
lecrapaud/utils.py
ADDED
@@ -0,0 +1,246 @@
+import pandas as pd
+import logging
+from logging.handlers import RotatingFileHandler
+import shutil
+import os
+import subprocess
+from datetime import datetime, date
+from ftfy import fix_text
+import unicodedata
+import re
+import string
+
+from src.directory_management import logger_dir
+from src.config import LOGGING_LEVEL, PYTHON_ENV
+
+_LOGGER_ALREADY_CONFIGURED = False
+
+
+def setup_logger():
+
+    global _LOGGER_ALREADY_CONFIGURED
+    if _LOGGER_ALREADY_CONFIGURED:  # ← bail out if done before
+
+        return logging.getLogger("stock" if PYTHON_ENV != "Worker" else "")
+
+    print(
+        f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
+    )
+    # ------------------------------------------------------------------ #
+    # Real configuration happens only on the FIRST call                  #
+    # ------------------------------------------------------------------ #
+    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s"
+    datefmt = "%Y-%m-%d %H:%M:%S"
+    logging.basicConfig(format=fmt, datefmt=datefmt)  # root format
+    formatter = logging.Formatter(fmt, datefmt=datefmt)
+
+    logger = logging.getLogger("" if PYTHON_ENV == "Worker" else "stock")
+
+    log_level = getattr(logging, LOGGING_LEVEL.upper(), logging.INFO)
+    logger.setLevel(log_level)
+
+    # pick a file according to environment
+    env_file = {
+        "Development": "dev.log",
+        "Production": "prod.log",
+        "Test": "test.log",
+        "Worker": "worker.log",
+    }.get(PYTHON_ENV, "app.log")
+
+    file_handler = RotatingFileHandler(
+        f"{logger_dir}/{env_file}",
+        maxBytes=5 * 1024 * 1024,
+        backupCount=3,
+    )
+    file_handler.setFormatter(formatter)
+    file_handler.setLevel(log_level)
+    logger.addHandler(file_handler)
+
+    _LOGGER_ALREADY_CONFIGURED = True
+    return logger
+
+
+logger = setup_logger()
+
+
+def get_df_name(obj, namespace):
+    return [name for name in namespace if namespace[name] is obj][0]
+
+
+def pprint(item):
+    with pd.option_context("display.max_rows", None):
+        logger.info(item)
+
+
+def object_to_dict(obj):
+    if isinstance(obj, dict):
+        return {k: object_to_dict(v) for k, v in obj.items()}
+    elif hasattr(obj, "__dict__"):
+        return {k: object_to_dict(v) for k, v in obj.__dict__.items()}
+    elif isinstance(obj, list):
+        return [object_to_dict(i) for i in obj]
+    else:
+        return obj
+
+
+def copy_any(src, dst):
+    if os.path.isdir(src):
+        # Copy folder using copytree
+        shutil.copytree(src, dst)
+    else:
+        # Copy file using copy2 (which preserves metadata)
+        shutil.copy2(src, dst)
+
+
+def contains_best(folder_path):
+    # Iterate over all files and folders in the specified directory
+    for root, dirs, files in os.walk(folder_path):
+        # Check each file and folder name for '.best' or '.keras'
+        for name in files + dirs:
+            if ".best" in name or ".keras" in name:
+                return True
+    return False
+
+
+def get_folder_sizes(directory=os.path.expanduser("~")):
+    folder_sizes = {}
+
+    for folder in os.listdir(directory):
+        folder_path = os.path.join(directory, folder)
+        if os.path.isdir(folder_path):
+            try:
+                size = (
+                    subprocess.check_output(["du", "-sk", folder_path])
+                    .split()[0]
+                    .decode("utf-8")
+                )
+                folder_sizes[folder] = int(size)
+            except subprocess.CalledProcessError:
+                logger.info(f"Skipping {folder_path}: Permission Denied")
+
+    sorted_folders = sorted(folder_sizes.items(), key=lambda x: x[1], reverse=True)
+    logger.info(f"{'Folder':<50}{'Size (MB)':>10}")
+    logger.info("=" * 60)
+    for folder, size in sorted_folders:
+        logger.info(f"{folder:<50}{size / (1024*1024):>10.2f}")
+
+
+def create_cron_job(
+    script_path,
+    venv_path,
+    log_file,
+    pythonpath,
+    cwd,
+    job_frequency="* * * * *",
+    cron_name="My Custom Cron Job",
+):
+    """
+    Creates a cron job to run a Python script with a virtual environment, logging output, and setting PYTHONPATH and CWD.
+
+    Parameters:
+    - script_path (str): Path to the Python script to run.
+    - venv_path (str): Path to the virtual environment's Python interpreter.
+    - log_file (str): Path to the log file for output.
+    - pythonpath (str): Value for the PYTHONPATH environment variable.
+    - cwd (str): Working directory from which the script should run.
+    - job_frequency (str): Cron timing syntax (default is every minute).
+    - cron_name (str): Name to identify the cron job.
+    """
+    # Construct the cron command
+    cron_command = (
+        f"{job_frequency} /bin/zsh -c 'pgrep -fl python | grep -q {os.path.basename(script_path)} "
+        f'|| (echo -e "Cron job {cron_name} started at $(date)" >> {log_file} && cd {cwd} && '
+        f"PYTHONPATH={pythonpath} {venv_path}/bin/python {script_path} >> {log_file} 2>&1)'"
+    )
+
+    # Check existing cron jobs and remove any with the same comment
+    subprocess.run(f"(crontab -l | grep -v '{cron_name}') | crontab -", shell=True)
+
+    # Add the new cron job with the comment
+    full_cron_job = f"{cron_command} # {cron_name}\n"
+    subprocess.run(f'(crontab -l; echo "{full_cron_job}") | crontab -', shell=True)
+    logger.info(f"Cron job created: {full_cron_job}")
+
+
+def remove_all_cron_jobs():
+    """
+    Removes all cron jobs for the current user.
+    """
+    try:
+        # Clear the user's crontab
+        subprocess.run("crontab -r", shell=True, check=True)
+        logger.info("All cron jobs have been removed successfully.")
+    except subprocess.CalledProcessError:
+        logger.info(
+            "Failed to remove cron jobs. There may not be any cron jobs to remove, or there could be a permissions issue."
+        )
+
+
+def serialize_timestamp(dict: dict):
+    def convert(obj):
+        if isinstance(obj, (datetime, date, pd.Timestamp)):
+            return obj.isoformat()
+
+        return obj
+
+    return [{k: convert(v) for k, v in item.items()} for item in dict]
+
+
+def remove_accents(text: str) -> str:
+    """
+    Cleans the text of:
+    - Broken Unicode
+    - Accents
+    - Control characters (including \x00, \u0000, etc.)
+    - Escape sequences
+    - Non-printable characters
+    - Excessive punctuation (like ........ or !!!!)
+    """
+
+    # Step 1: Fix mojibake and broken Unicode
+    text = fix_text(text)
+
+    # Step 2 bis: Normalize accents
+    text = unicodedata.normalize("NFKD", text)
+    text = text.encode("ASCII", "ignore").decode("utf8")
+
+    # Step 3: Remove known weird tokens
+    text = text.replace("<|endoftext|>", "")
+    text = text.replace("\u0000", "").replace("\x00", "")
+
+    # Step 4: Remove raw control characters (e.g., \x1f)
+    text = "".join(c for c in text if unicodedata.category(c)[0] != "C" or c == "\n")
+
+    # Step 5: Remove literal escape sequences like \xNN
+    text = re.sub(r"\\x[0-9a-fA-F]{2}", "", text)
+
+    # Step 6: Remove non-printable characters
+    printable = set(string.printable)
+    text = "".join(c for c in text if c in printable)
+
+    # Step 7: Collapse repeated punctuation (e.g., ........ → .)
+    text = re.sub(r"([!?.])\1{2,}", r"\1", text)  # !!!!!! → !
+    text = re.sub(r"([-—])\1{1,}", r"\1", text)  # ------ → -
+    text = re.sub(r"([,.]){4,}", r"\1", text)  # ...... → .
+
+    return text.strip()
+
+
+def serialize_for_json(obj):
+    """
+    Recursively convert any object into a JSON-serializable structure.
+    Classes and class instances are converted to readable strings like 'ClassName()'.
+    """
+    if isinstance(obj, (str, int, float, bool, type(None))):
+        return obj
+    elif isinstance(obj, dict):
+        return {str(k): serialize_for_json(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple, set)):
+        return [serialize_for_json(v) for v in obj]
+    elif isinstance(obj, type):
+        # A class/type object like int, str, etc.
+        return obj.__name__
+    elif hasattr(obj, "__class__"):
+        return f"{obj.__class__.__name__}()"
+    else:
+        return str(obj)