lecrapaud 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.

Potentially problematic release.

This version of lecrapaud might be problematic.

Files changed (63)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +271 -0
  3. lecrapaud/config.py +25 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/README +1 -0
  6. lecrapaud/db/alembic/env.py +78 -0
  7. lecrapaud/db/alembic/script.py.mako +26 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  9. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  10. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  15. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  21. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  22. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  23. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  24. lecrapaud/db/models/__init__.py +11 -0
  25. lecrapaud/db/models/base.py +181 -0
  26. lecrapaud/db/models/dataset.py +129 -0
  27. lecrapaud/db/models/feature.py +45 -0
  28. lecrapaud/db/models/feature_selection.py +125 -0
  29. lecrapaud/db/models/feature_selection_rank.py +79 -0
  30. lecrapaud/db/models/model.py +40 -0
  31. lecrapaud/db/models/model_selection.py +63 -0
  32. lecrapaud/db/models/model_training.py +62 -0
  33. lecrapaud/db/models/score.py +65 -0
  34. lecrapaud/db/models/target.py +67 -0
  35. lecrapaud/db/session.py +45 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/experiment.py +64 -0
  38. lecrapaud/feature_engineering.py +846 -0
  39. lecrapaud/feature_selection.py +1167 -0
  40. lecrapaud/integrations/openai_integration.py +225 -0
  41. lecrapaud/jobs/__init__.py +13 -0
  42. lecrapaud/jobs/config.py +17 -0
  43. lecrapaud/jobs/scheduler.py +36 -0
  44. lecrapaud/jobs/tasks.py +57 -0
  45. lecrapaud/model_selection.py +1671 -0
  46. lecrapaud/predictions.py +292 -0
  47. lecrapaud/preprocessing.py +984 -0
  48. lecrapaud/search_space.py +848 -0
  49. lecrapaud/services/__init__.py +0 -0
  50. lecrapaud/services/embedding_categorical.py +71 -0
  51. lecrapaud/services/indicators.py +309 -0
  52. lecrapaud/speed_tests/experiments.py +139 -0
  53. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  54. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  55. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  56. lecrapaud/speed_tests/tests.ipynb +145 -0
  57. lecrapaud/speed_tests/trash.py +37 -0
  58. lecrapaud/training.py +239 -0
  59. lecrapaud/utils.py +246 -0
  60. lecrapaud-0.1.0.dist-info/LICENSE +201 -0
  61. lecrapaud-0.1.0.dist-info/METADATA +105 -0
  62. lecrapaud-0.1.0.dist-info/RECORD +63 -0
  63. lecrapaud-0.1.0.dist-info/WHEEL +4 -0
lecrapaud/speed_tests/tests.ipynb ADDED
@@ -0,0 +1,145 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import autosklearn.classification\n",
+ "import sklearn.datasets\n",
+ "import sklearn.metrics\n",
+ "from pprint import pprint\n",
+ "from tabpfn import TabPFNClassifier\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "import pandas as pd\n",
+ "import time\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.datasets import load_breast_cancer\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
+ "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
+ " X, y, random_state=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "automl = autosklearn.classification.AutoSklearnClassifier(\n",
+ " time_left_for_this_task=120,\n",
+ " per_run_time_limit=30,\n",
+ " tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
+ " include={\n",
+ " \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
+ " \"feature_preprocessor\": [\n",
+ " \"no_preprocessing\",\n",
+ " \"polynomial\",\n",
+ " \"select_percentile_classification\",\n",
+ " ],\n",
+ " },\n",
+ " ensemble_kwargs={\"ensemble_size\": 1},\n",
+ ")\n",
+ "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pprint(automl.show_models(), indent=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = automl.predict(X_test)\n",
+ "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
+ "# more ensemble members are slower, but more accurate\n",
+ "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "classifier.fit(X_train, y_train)\n",
+ "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
+ "print(\n",
+ " \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We also offer the `predict_proba` interface\n",
+ "classifier.predict_proba(X_test).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
+ "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
+ "out_table"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
lecrapaud/speed_tests/trash.py ADDED
@@ -0,0 +1,37 @@
+ # def _get_weekly_return(y_true, y_pred):
+ # df = pd.concat([y_true, y_pred, stock_data[['YEARWEEK', 'STOCK', 'TARGET_1']]], join='inner', axis=1)
+ # df['PRED'] += 1
+ # df['TARGET'] += 1
+ # return df[['YEARWEEK', 'STOCK', 'PRED', 'TARGET']].groupby(['YEARWEEK', 'STOCK']).prod().reset_index()
+
+ # def _calc_spread_return_per_week(df, portfolio_size):
+ # return (df.sort_values('PRED', ascending=False)['TARGET_1'][:portfolio_size] - 1).mean()
+
+ # def sharpe_ratio_weekly(y_true, y_pred, portfolio_size:int=10):
+ # df = _get_weekly_return(y_true, y_pred)
+ # buf = df.groupby('YEARWEEK').apply(_calc_spread_return_per_week, portfolio_size)
+ # sharpe_ratio = (buf.mean() * 52) / (buf.std() * np.sqrt(52))
+ # buf += 1
+ # cumulated_roi = buf.prod() - 1
+ # cagr = buf.prod() ** (1 / (buf.shape[0]/52) ) - 1
+ # return sharpe_ratio, cumulated_roi, cagr
+
+
+ def sharpe_ratio_daily(y_true, y_pred, portfolio_size: int = 10):
+     df = pd.concat(
+         [y_true, y_pred, stock_data[["DATE", "TARGET_1"]]], join="inner", axis=1
+     )
+
+     def _calc_spread_return_per_day(df: pd.DataFrame, portfolio_size: int):
+         # print(df.sort_values('PRED', ascending=False)[['PRED', 'TARGET', 'TARGET_1']].head(10))
+         return (
+             df.sort_values("PRED", ascending=False)["TARGET_1"].iloc[:portfolio_size]
+         ).mean()
+
+     buf = df.groupby("DATE").apply(_calc_spread_return_per_day, portfolio_size)
+
+     sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
+     buf += 1
+     cumulated_roi = buf.prod() - 1
+     cagr = buf.prod() ** (1 / (buf.shape[0] / 252)) - 1
+     return sharpe_ratio, cumulated_roi, cagr
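The annualisation used by sharpe_ratio_daily above can be checked on synthetic data. The following is a minimal sketch, not code from the package, assuming 252 trading days per year and normally distributed daily spread returns standing in for buf:

import numpy as np
import pandas as pd

# Synthetic stand-in for buf, the per-day spread return of the top-ranked portfolio
rng = np.random.default_rng(0)
buf = pd.Series(rng.normal(0.0005, 0.01, size=504))  # roughly two years of trading days

# Same formulas as above: annualised Sharpe ratio, cumulative ROI, and CAGR
sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
growth = buf + 1
cumulated_roi = growth.prod() - 1
cagr = growth.prod() ** (1 / (growth.shape[0] / 252)) - 1
print(sharpe_ratio, cumulated_roi, cagr)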
lecrapaud/training.py ADDED
@@ -0,0 +1,239 @@
+ import logging
+ import joblib
+ from pathlib import Path
+ import os
+
+ from lecrapaud.experiment import create_dataset
+ from lecrapaud.feature_engineering import (
+     feature_engineering,
+     encode_categorical_features,
+     add_pca_features,
+     summarize_dataframe,
+ )
+ from lecrapaud.feature_selection import (
+     feature_selection,
+     train_val_test_split,
+     train_val_test_split_time_series,
+     scale_data,
+     reshape_time_series,
+ )
+ from lecrapaud.model_selection import model_selection
+ from lecrapaud.search_space import all_models
+ from lecrapaud.directory_management import tmp_dir
+ from lecrapaud.db import Dataset
+ from lecrapaud.utils import logger
+ from lecrapaud.config import PYTHON_ENV
+
+
+ # Parameters
+ columns_date = ["DATE"]
+ columns_te_groupby = [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
+ columns_te_target = ["RET", "VOLUME", "RESIDUAL_RET", "RELATIVE_VOLUME"] + [
+     f"{ind}_{p}"
+     for p in [9, 14, 21, 50]
+     for ind in [
+         "CUMUL_RET",
+         "SMA",
+         "EMA",
+         "VOLATILITY",
+         "ATR",
+         "ADX",
+         "%K",
+         "RSI",
+         "MFI",
+     ]
+ ]
+ target_clf = [2, 4, 6, 8, 9, 10, 11]
+ column_ordinal = ["STOCK"]
+ column_binary = ["SECTOR", "SUBINDUSTRY", "LOCATION"]
+ columns_pca = []
+ target_numbers = range(1, 15)
+ date_column = "DATE"
+ group_column = "STOCK"
+
+
+ def run_training(
+     get_data_function: callable,
+     get_data_params: dict = None,
+     time_series: bool = False,
+     dataset_id=None,
+     years_of_data=2,
+     list_of_groups=None,
+     percentile=15,
+     corr_threshold=80,
+     max_features=20,
+     max_timesteps=120,
+     targets_numbers=range(1, 15),
+     models_idx=range(len(all_models)),
+     number_of_trials=20,
+     perform_hyperoptimization=True,
+     perform_crossval=False,
+     clean_dir=False,
+     preserve_model=False,
+     session_name="test",
+ ):
+     logging.captureWarnings(True)
+
+     if any(all_models[i].get("recurrent") for i in models_idx) and not time_series:
+         raise ValueError(
+             "You need to set time_series to true to use recurrent model, or remove recurrent models from models_idx chosen"
+         )
+
+     if dataset_id is None:
+         # Get the data
+         logger.info("Getting data...")
+         data = get_data_function(**get_data_params)
+
+         # # preprocess & feature engineering => Should be in get_data_function
+         # logger.info("Preprocessing...")
+         # preprocessed_data = preprocessing(data, for_training=True, save_as_csv=True)
+
+         logger.info(f"Feature engineering for {session_name}...")
+         data_for_training = feature_engineering(
+             data,
+             columns_date=columns_date,
+             columns_te_groupby=columns_te_groupby,
+             columns_te_target=columns_te_target,
+         )
+
+         # Split
+         if time_series:
+             train, val, test = train_val_test_split_time_series(data_for_training)
+         else:
+             train, val, test = train_val_test_split(
+                 data, stratify_col=f"target_{target_numbers[0]}"
+             ) # TODO: only stratifying first target for now
+
+         # Create Dataset / Experiment (TODO: should be defined sooner)
+         dataset = create_dataset(
+             train, val, test, corr_threshold, percentile, max_features
+         )
+         dataset_dir = dataset.path
+         dataset_id = dataset.id
+         data_dir = f"{dataset_dir}/data"
+         preprocessing_dir = f"{dataset_dir}/preprocessing"
+         os.makedirs(data_dir, exist_ok=True)
+         os.makedirs(preprocessing_dir, exist_ok=True)
+
+         # PCA
+         train, pcas = add_pca_features(train, columns_pca)
+         val, _ = add_pca_features(val, columns_pca, pcas=pcas)
+         test, _ = add_pca_features(test, columns_pca, pcas=pcas)
+
+         if PYTHON_ENV != "Test":
+             joblib.dump(pcas, f"{preprocessing_dir}/pca.pkl")
+
+         # Encoding
+         train, transformer = encode_categorical_features(
+             train, column_ordinal=column_ordinal, column_binary=column_binary
+         )
+         val, _ = encode_categorical_features(
+             val,
+             column_ordinal=column_ordinal,
+             column_binary=column_binary,
+             transformer=transformer,
+         )
+         test, _ = encode_categorical_features(
+             test,
+             column_ordinal=column_ordinal,
+             column_binary=column_binary,
+             transformer=transformer,
+         )
+
+         if PYTHON_ENV != "Test":
+             joblib.dump(data_for_training, f"{data_dir}/full.pkl")
+             joblib.dump(transformer, f"{preprocessing_dir}/column_transformer.pkl")
+             summary = summarize_dataframe(train)
+             summary.to_csv(f"{dataset_dir}/feature_summary.csv", index=False)
+
+         # feature selection
+         logger.info("Feature Selection...")
+         for target_number in targets_numbers:
+             feature_selection(
+                 dataset_id=dataset_id,
+                 train=train,
+                 target_number=target_number,
+                 single_process=True,
+             )
+
+     dataset = Dataset.get(dataset_id)
+     all_features = dataset.get_all_features()
+     columns_to_keep = all_features + [f"TARGET_{i}" for i in target_numbers]
+
+     duplicates = [
+         col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+     ]
+     if duplicates:
+         raise ValueError(f"Duplicates detected in columns_to_keep: {duplicates}")
+
+     train = train[columns_to_keep]
+     val = val[columns_to_keep]
+     test = test[columns_to_keep]
+
+     # save data
+     if PYTHON_ENV != "Test":
+         joblib.dump(train[columns_to_keep], f"{data_dir}/train.pkl")
+         joblib.dump(val[columns_to_keep], f"{data_dir}/val.pkl")
+         joblib.dump(test[columns_to_keep], f"{data_dir}/test.pkl")
+
+     # scaling features
+     if any(t not in target_clf for t in target_numbers) and any(
+         all_models[i].get("need_scaling") for i in models_idx
+     ):
+         logger.info("Scaling features...")
+         train_scaled, scaler_x, scalers_y = scale_data(
+             train, save_dir=preprocessing_dir
+         )
+         val_scaled, _, _ = scale_data(
+             val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+         )
+         test_scaled, _, _ = scale_data(
+             test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+         )
+     else:
+         train_scaled = None
+         val_scaled = None
+         test_scaled = None
+
+     # save data
+     if PYTHON_ENV != "Test":
+         joblib.dump(train_scaled, f"{data_dir}/train_scaled.pkl")
+         joblib.dump(val_scaled, f"{data_dir}/val_scaled.pkl")
+         joblib.dump(test_scaled, f"{data_dir}/test_scaled.pkl")
+
+     data = {
+         "train": train,
+         "val": val,
+         "test": test,
+         "train_scaled": train_scaled,
+         "val_scaled": val_scaled,
+         "test_scaled": test_scaled,
+         "scalers_y": scalers_y,
+     }
+
+     # reshape data for time series
+     reshaped_data = None
+     if any(all_models[i].get("recurrent") for i in models_idx) and time_series:
+         # reshaping data for recurrent models
+         logger.info("Reshaping data for recurrent models...")
+         reshaped_data = reshape_time_series(
+             train_scaled, val_scaled, test_scaled, all_features, timesteps=max_timesteps
+         )
+
+     # model selection and hyperoptimization
+     logger.info("Model Selection and Hyperoptimization...")
+     for target_number in target_numbers:
+         model_selection(
+             dataset_id=dataset_id,
+             models_idx=models_idx,
+             target_number=target_number,
+             session_name=session_name,
+             perform_hyperoptimization=perform_hyperoptimization,
+             perform_crossval=perform_crossval,
+             number_of_trials=number_of_trials,
+             plot=False,
+             clean_dir=clean_dir,
+             preserve_model=preserve_model,
+             reshaped_data=reshaped_data,
+             data=(data or None),
+         )
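A hedged usage sketch of run_training above, assuming lecrapaud and its database are installed and configured; load_prices, prices.parquet, and the column set are hypothetical placeholders rather than documented package usage:

import pandas as pd

from lecrapaud.training import run_training


def load_prices(years_of_data: int = 2) -> pd.DataFrame:
    # Hypothetical loader: the pipeline above expects columns such as DATE, STOCK,
    # SECTOR, SUBINDUSTRY, LOCATION, RET, VOLUME and the TARGET_* columns.
    return pd.read_parquet("prices.parquet")  # placeholder path


run_training(
    get_data_function=load_prices,
    get_data_params={"years_of_data": 2},
    time_series=True,
    targets_numbers=range(1, 3),
    models_idx=range(3),
    number_of_trials=10,
    session_name="demo",
)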
lecrapaud/utils.py ADDED
@@ -0,0 +1,246 @@
+ import pandas as pd
+ import logging
+ from logging.handlers import RotatingFileHandler
+ import shutil
+ import os
+ import subprocess
+ from datetime import datetime, date
+ from ftfy import fix_text
+ import unicodedata
+ import re
+ import string
+
+ from lecrapaud.directory_management import logger_dir
+ from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
+
+ _LOGGER_ALREADY_CONFIGURED = False
+
+
+ def setup_logger():
+
+     global _LOGGER_ALREADY_CONFIGURED
+     if _LOGGER_ALREADY_CONFIGURED: # ← bail out if done before
+
+         return logging.getLogger("stock" if PYTHON_ENV != "Worker" else "")
+
+     print(
+         f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
+     )
+     # ------------------------------------------------------------------ #
+     # Real configuration happens only on the FIRST call                  #
+     # ------------------------------------------------------------------ #
+     fmt = "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s"
+     datefmt = "%Y-%m-%d %H:%M:%S"
+     logging.basicConfig(format=fmt, datefmt=datefmt) # root format
+     formatter = logging.Formatter(fmt, datefmt=datefmt)
+
+     logger = logging.getLogger("" if PYTHON_ENV == "Worker" else "stock")
+
+     log_level = getattr(logging, LOGGING_LEVEL.upper(), logging.INFO)
+     logger.setLevel(log_level)
+
+     # pick a file according to environment
+     env_file = {
+         "Development": "dev.log",
+         "Production": "prod.log",
+         "Test": "test.log",
+         "Worker": "worker.log",
+     }.get(PYTHON_ENV, "app.log")
+
+     file_handler = RotatingFileHandler(
+         f"{logger_dir}/{env_file}",
+         maxBytes=5 * 1024 * 1024,
+         backupCount=3,
+     )
+     file_handler.setFormatter(formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)
+
+     _LOGGER_ALREADY_CONFIGURED = True
+     return logger
+
+
+ logger = setup_logger()
+
+
+ def get_df_name(obj, namespace):
+     return [name for name in namespace if namespace[name] is obj][0]
+
+
+ def pprint(item):
+     with pd.option_context("display.max_rows", None):
+         logger.info(item)
+
+
+ def object_to_dict(obj):
+     if isinstance(obj, dict):
+         return {k: object_to_dict(v) for k, v in obj.items()}
+     elif hasattr(obj, "__dict__"):
+         return {k: object_to_dict(v) for k, v in obj.__dict__.items()}
+     elif isinstance(obj, list):
+         return [object_to_dict(i) for i in obj]
+     else:
+         return obj
+
+
+ def copy_any(src, dst):
+     if os.path.isdir(src):
+         # Copy folder using copytree
+         shutil.copytree(src, dst)
+     else:
+         # Copy file using copy2 (which preserves metadata)
+         shutil.copy2(src, dst)
+
+
+ def contains_best(folder_path):
+     # Iterate over all files and folders in the specified directory
+     for root, dirs, files in os.walk(folder_path):
+         # Check each file and folder name for '.best' or '.keras'
+         for name in files + dirs:
+             if ".best" in name or ".keras" in name:
+                 return True
+     return False
+
+
+ def get_folder_sizes(directory=os.path.expanduser("~")):
+     folder_sizes = {}
+
+     for folder in os.listdir(directory):
+         folder_path = os.path.join(directory, folder)
+         if os.path.isdir(folder_path):
+             try:
+                 size = (
+                     subprocess.check_output(["du", "-sk", folder_path])
+                     .split()[0]
+                     .decode("utf-8")
+                 )
+                 folder_sizes[folder] = int(size)
+             except subprocess.CalledProcessError:
+                 logger.info(f"Skipping {folder_path}: Permission Denied")
+
+     sorted_folders = sorted(folder_sizes.items(), key=lambda x: x[1], reverse=True)
+     logger.info(f"{'Folder':<50}{'Size (MB)':>10}")
+     logger.info("=" * 60)
+     for folder, size in sorted_folders:
+         logger.info(f"{folder:<50}{size / (1024*1024):>10.2f}")
+
+
+ def create_cron_job(
+     script_path,
+     venv_path,
+     log_file,
+     pythonpath,
+     cwd,
+     job_frequency="* * * * *",
+     cron_name="My Custom Cron Job",
+ ):
+     """
+     Creates a cron job to run a Python script with a virtual environment, logging output, and setting PYTHONPATH and CWD.
+
+     Parameters:
+     - script_path (str): Path to the Python script to run.
+     - venv_path (str): Path to the virtual environment's Python interpreter.
+     - log_file (str): Path to the log file for output.
+     - pythonpath (str): Value for the PYTHONPATH environment variable.
+     - cwd (str): Working directory from which the script should run.
+     - job_frequency (str): Cron timing syntax (default is every minute).
+     - cron_name (str): Name to identify the cron job.
+     """
+     # Construct the cron command
+     cron_command = (
+         f"{job_frequency} /bin/zsh -c 'pgrep -fl python | grep -q {os.path.basename(script_path)} "
+         f'|| (echo -e "Cron job {cron_name} started at $(date)" >> {log_file} && cd {cwd} && '
+         f"PYTHONPATH={pythonpath} {venv_path}/bin/python {script_path} >> {log_file} 2>&1)'"
+     )
+
+     # Check existing cron jobs and remove any with the same comment
+     subprocess.run(f"(crontab -l | grep -v '{cron_name}') | crontab -", shell=True)
+
+     # Add the new cron job with the comment
+     full_cron_job = f"{cron_command} # {cron_name}\n"
+     subprocess.run(f'(crontab -l; echo "{full_cron_job}") | crontab -', shell=True)
+     logger.info(f"Cron job created: {full_cron_job}")
+
+
+ def remove_all_cron_jobs():
+     """
+     Removes all cron jobs for the current user.
+     """
+     try:
+         # Clear the user's crontab
+         subprocess.run("crontab -r", shell=True, check=True)
+         logger.info("All cron jobs have been removed successfully.")
+     except subprocess.CalledProcessError:
+         logger.info(
+             "Failed to remove cron jobs. There may not be any cron jobs to remove, or there could be a permissions issue."
+         )
+
+
+ def serialize_timestamp(dict: dict):
+     def convert(obj):
+         if isinstance(obj, (datetime, date, pd.Timestamp)):
+             return obj.isoformat()
+
+         return obj
+
+     return [{k: convert(v) for k, v in item.items()} for item in dict]
+
+
+ def remove_accents(text: str) -> str:
+     """
+     Cleans the text of:
+     - Broken Unicode
+     - Accents
+     - Control characters (including \x00, \u0000, etc.)
+     - Escape sequences
+     - Non-printable characters
+     - Excessive punctuation (like ........ or !!!!)
+     """
+
+     # Step 1: Fix mojibake and broken Unicode
+     text = fix_text(text)
+
+     # Step 2 bis: Normalize accents
+     text = unicodedata.normalize("NFKD", text)
+     text = text.encode("ASCII", "ignore").decode("utf8")
+
+     # Step 3: Remove known weird tokens
+     text = text.replace("<|endoftext|>", "")
+     text = text.replace("\u0000", "").replace("\x00", "")
+
+     # Step 4: Remove raw control characters (e.g., \x1f)
+     text = "".join(c for c in text if unicodedata.category(c)[0] != "C" or c == "\n")
+
+     # Step 5: Remove literal escape sequences like \xNN
+     text = re.sub(r"\\x[0-9a-fA-F]{2}", "", text)
+
+     # Step 6: Remove non-printable characters
+     printable = set(string.printable)
+     text = "".join(c for c in text if c in printable)
+
+     # Step 7: Collapse repeated punctuation (e.g., ........ → .)
+     text = re.sub(r"([!?.])\1{2,}", r"\1", text) # !!!!!! → !
+     text = re.sub(r"([-—])\1{1,}", r"\1", text) # ------ → -
+     text = re.sub(r"([,.]){4,}", r"\1", text) # ...... → .
+
+     return text.strip()
+
+
+ def serialize_for_json(obj):
+     """
+     Recursively convert any object into a JSON-serializable structure.
+     Classes and class instances are converted to readable strings like 'ClassName()'.
+     """
+     if isinstance(obj, (str, int, float, bool, type(None))):
+         return obj
+     elif isinstance(obj, dict):
+         return {str(k): serialize_for_json(v) for k, v in obj.items()}
+     elif isinstance(obj, (list, tuple, set)):
+         return [serialize_for_json(v) for v in obj]
+     elif isinstance(obj, type):
+         # A class/type object like int, str, etc.
+         return obj.__name__
+     elif hasattr(obj, "__class__"):
+         return f"{obj.__class__.__name__}()"
+     else:
+         return str(obj)
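For the side-effect-free helpers in utils.py, a short usage sketch, assuming the lecrapaud package is installed and importable; the expected results follow from the code shown above:

from datetime import datetime

from lecrapaud.utils import remove_accents, serialize_for_json, serialize_timestamp

print(remove_accents("Café déjà vu!!!"))  # "Cafe deja vu!"
print(serialize_for_json({"model": int, "scaler": None}))  # {"model": "int", "scaler": None}
print(serialize_timestamp([{"at": datetime(2025, 6, 17, 16, 52)}]))  # [{"at": "2025-06-17T16:52:00"}]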