lecrapaud 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (60)
  1. lecrapaud/__init__.py +0 -0
  2. lecrapaud/config.py +16 -0
  3. lecrapaud/db/__init__.py +0 -0
  4. lecrapaud/db/alembic/README +1 -0
  5. lecrapaud/db/alembic/env.py +78 -0
  6. lecrapaud/db/alembic/script.py.mako +26 -0
  7. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  9. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  10. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  15. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  21. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  22. lecrapaud/db/crud.py +179 -0
  23. lecrapaud/db/models/__init__.py +11 -0
  24. lecrapaud/db/models/base.py +6 -0
  25. lecrapaud/db/models/dataset.py +124 -0
  26. lecrapaud/db/models/feature.py +46 -0
  27. lecrapaud/db/models/feature_selection.py +126 -0
  28. lecrapaud/db/models/feature_selection_rank.py +80 -0
  29. lecrapaud/db/models/model.py +41 -0
  30. lecrapaud/db/models/model_selection.py +56 -0
  31. lecrapaud/db/models/model_training.py +54 -0
  32. lecrapaud/db/models/score.py +62 -0
  33. lecrapaud/db/models/target.py +59 -0
  34. lecrapaud/db/services.py +0 -0
  35. lecrapaud/db/setup.py +58 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/feature_engineering.py +1119 -0
  38. lecrapaud/feature_selection.py +1229 -0
  39. lecrapaud/jobs/__init__.py +13 -0
  40. lecrapaud/jobs/config.py +17 -0
  41. lecrapaud/jobs/scheduler.py +36 -0
  42. lecrapaud/jobs/tasks.py +57 -0
  43. lecrapaud/model_selection.py +1571 -0
  44. lecrapaud/predictions.py +292 -0
  45. lecrapaud/search_space.py +844 -0
  46. lecrapaud/services/__init__.py +0 -0
  47. lecrapaud/services/embedding_categorical.py +71 -0
  48. lecrapaud/services/indicators.py +309 -0
  49. lecrapaud/speed_tests/experiments.py +139 -0
  50. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  51. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  52. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  53. lecrapaud/speed_tests/tests.ipynb +145 -0
  54. lecrapaud/speed_tests/trash.py +37 -0
  55. lecrapaud/training.py +151 -0
  56. lecrapaud/utils.py +246 -0
  57. lecrapaud-0.4.0.dist-info/LICENSE +201 -0
  58. lecrapaud-0.4.0.dist-info/METADATA +103 -0
  59. lecrapaud-0.4.0.dist-info/RECORD +60 -0
  60. lecrapaud-0.4.0.dist-info/WHEEL +4 -0
lecrapaud/speed_tests/tests.ipynb ADDED
@@ -0,0 +1,145 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import autosklearn.classification\n",
+ "import sklearn.datasets\n",
+ "import sklearn.metrics\n",
+ "from pprint import pprint\n",
+ "from tabpfn import TabPFNClassifier\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "import pandas as pd\n",
+ "import time\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.datasets import load_breast_cancer\n",
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\n",
+ "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n",
+ "    X, y, random_state=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "automl = autosklearn.classification.AutoSklearnClassifier(\n",
+ "    time_left_for_this_task=120,\n",
+ "    per_run_time_limit=30,\n",
+ "    tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n",
+ "    include={\n",
+ "        \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n",
+ "        \"feature_preprocessor\": [\n",
+ "            \"no_preprocessing\",\n",
+ "            \"polynomial\",\n",
+ "            \"select_percentile_classification\",\n",
+ "        ],\n",
+ "    },\n",
+ "    ensemble_kwargs={\"ensemble_size\": 1},\n",
+ ")\n",
+ "automl.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pprint(automl.show_models(), indent=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = automl.predict(X_test)\n",
+ "print(\"Accuracy score:\", sklearn.metrics.accuracy_score(y_test, predictions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# N_ensemble_configurations defines how many estimators are averaged, it is bounded by #features * #classes\n",
+ "# more ensemble members are slower, but more accurate\n",
+ "classifier = TabPFNClassifier(device=\"cuda\", N_ensemble_configurations=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start = time.time()\n",
+ "classifier.fit(X_train, y_train)\n",
+ "y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)\n",
+ "print(\n",
+ "    \"Prediction time: \", time.time() - start, \"Accuracy\", accuracy_score(y_test, y_eval)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We also offer the `predict_proba` interface\n",
+ "classifier.predict_proba(X_test).shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out_table = pd.DataFrame(X_test.copy().astype(str))\n",
+ "out_table[\"prediction\"] = [f\"{y_e} (p={p_e:.2f})\" for y_e, p_e in zip(y_eval, p_eval)]\n",
+ "out_table"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
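
The notebook above benchmarks TabPFN against auto-sklearn on scikit-learn's breast-cancer dataset, but the autosklearn import is commented out in its first cell, so the auto-sklearn cells fail as shipped. For reference, a minimal self-contained sketch of the TabPFN half, assuming the TabPFN 1.x API used in the notebook (N_ensemble_configurations, return_winning_probability) and substituting device="cpu" for machines without CUDA:

import time

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNClassifier

# Same data and split as the notebook.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# N_ensemble_configurations averages several estimator configurations;
# more members are slower but usually more accurate (bounded by #features * #classes).
classifier = TabPFNClassifier(device="cpu", N_ensemble_configurations=4)

start = time.time()
classifier.fit(X_train, y_train)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
print("Prediction time:", time.time() - start, "Accuracy:", accuracy_score(y_test, y_eval))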
lecrapaud/speed_tests/trash.py ADDED
@@ -0,0 +1,37 @@
+ # def _get_weekly_return(y_true, y_pred):
+ #     df = pd.concat([y_true, y_pred, stock_data[['YEARWEEK', 'STOCK', 'TARGET_1']]], join='inner', axis=1)
+ #     df['PRED'] += 1
+ #     df['TARGET'] += 1
+ #     return df[['YEARWEEK', 'STOCK', 'PRED', 'TARGET']].groupby(['YEARWEEK', 'STOCK']).prod().reset_index()
+
+ # def _calc_spread_return_per_week(df, portfolio_size):
+ #     return (df.sort_values('PRED', ascending=False)['TARGET_1'][:portfolio_size] - 1).mean()
+
+ # def sharpe_ratio_weekly(y_true, y_pred, portfolio_size:int=10):
+ #     df = _get_weekly_return(y_true, y_pred)
+ #     buf = df.groupby('YEARWEEK').apply(_calc_spread_return_per_week, portfolio_size)
+ #     sharpe_ratio = (buf.mean() * 52) / (buf.std() * np.sqrt(52))
+ #     buf += 1
+ #     cumulated_roi = buf.prod() - 1
+ #     cagr = buf.prod() ** (1 / (buf.shape[0]/52)) - 1
+ #     return sharpe_ratio, cumulated_roi, cagr
+
+
+ def sharpe_ratio_daily(y_true, y_pred, portfolio_size: int = 10):
+     df = pd.concat(
+         [y_true, y_pred, stock_data[["DATE", "TARGET_1"]]], join="inner", axis=1
+     )
+
+     def _calc_spread_return_per_day(df: pd.DataFrame, portfolio_size: int):
+         # print(df.sort_values('PRED', ascending=False)[['PRED', 'TARGET', 'TARGET_1']].head(10))
+         return (
+             df.sort_values("PRED", ascending=False)["TARGET_1"].iloc[:portfolio_size]
+         ).mean()
+
+     buf = df.groupby("DATE").apply(_calc_spread_return_per_day, portfolio_size)
+
+     sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
+     buf += 1
+     cumulated_roi = buf.prod() - 1
+     cagr = buf.prod() ** (1 / (buf.shape[0] / 252)) - 1
+     return sharpe_ratio, cumulated_roi, cagr
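
As shipped, sharpe_ratio_daily cannot run: trash.py has no imports and references a module-level stock_data DataFrame that the file never defines. The annualization arithmetic it implements is standard, though: for daily spread returns r over n days, Sharpe = (mean(r) * 252) / (std(r) * sqrt(252)), cumulative ROI = prod(1 + r) - 1, and CAGR = prod(1 + r) ** (252 / n) - 1. A self-contained sketch of that calculation on synthetic returns (the series here is illustrative, not package data):

import numpy as np
import pandas as pd

# Hypothetical daily spread returns standing in for the per-day portfolio
# returns that sharpe_ratio_daily derives from stock_data.
rng = np.random.default_rng(0)
buf = pd.Series(rng.normal(0.0005, 0.01, size=504))  # roughly two years of trading days

sharpe_ratio = (buf.mean() * 252) / (buf.std() * np.sqrt(252))
growth = (buf + 1).prod()
cumulated_roi = growth - 1
cagr = growth ** (1 / (buf.shape[0] / 252)) - 1
print(sharpe_ratio, cumulated_roi, cagr)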
lecrapaud/training.py ADDED
@@ -0,0 +1,151 @@
+ import logging
+ import joblib
+ from pathlib import Path
+ import os
+ from src.utils import logger
+
+ from src.feature_engineering import feature_engineering
+ from src.feature_selection import (
+     create_sets_from_data,
+     feature_selection,
+     scale_data,
+     reshape_time_series,
+ )
+ from src.model_selection import model_selection, test_hardware
+ from src.data_sourcing import get_filtered_data
+ from src.constants import stock_list_3, stock_list_1
+ from src.search_space import ml_models, dl_recurrent_models
+ from src.directory_management import tmp_dir
+ from src.db.models import Dataset
+ from src.config import PYTHON_ENV
+
+
+ def run_training(
+     dataset_id=None,
+     years_of_data=2,
+     list_of_groups=stock_list_1,
+     percentile=15,
+     corr_threshold=80,
+     max_features=20,
+     max_timesteps=120,
+     targets_numbers=range(1, 15),
+     models_idx=range(len(ml_models)),
+     number_of_trials=20,
+     perform_hyperoptimization=True,
+     perform_crossval=False,
+     clean_dir=False,
+     preserve_model=False,
+     session_name="test",
+ ):
+     logging.captureWarnings(True)
+
+     if dataset_id is None:
+         # Get the data
+         logger.info("Getting data...")
+         data = get_filtered_data(
+             years_of_data=years_of_data,
+             list_of_groups=list_of_groups,
+         )
+
+         # preprocess & feature engineering
+         logger.info("Preprocessing...")
+         data_for_training = feature_engineering(
+             data, for_training=True, save_as_csv=True
+         )
+
+         # train / val / test sets
+         train, val, test, dataset = create_sets_from_data(
+             data_for_training,
+             percentile=percentile,
+             corr_threshold=corr_threshold,
+             max_features=max_features,
+         )
+         dataset_dir = dataset.path
+         dataset_id = dataset.id
+         train_data_dir = f"{dataset_dir}/data"
+         os.makedirs(train_data_dir, exist_ok=True)
+         preprocessing_dir = f"{dataset_dir}/preprocessing"
+
+         # feature selection
+         logger.info("Feature Selection...")
+         for target_number in targets_numbers:
+             feature_selection(
+                 dataset_id=dataset_id,
+                 train=train,
+                 target_number=target_number,
+                 single_process=True,
+             )
+
+     dataset = Dataset.get(dataset_id)
+     all_features = dataset.get_all_features()
+     columns_to_keep = all_features + [f"TARGET_{i}" for i in range(1, 15)]
+     logger.info(columns_to_keep)
+     duplicates = [
+         col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+     ]
+
+     if duplicates:
+         raise ValueError(f"Duplicates detected in columns_to_keep: {duplicates}")
+
+     train = train[columns_to_keep]
+     val = val[columns_to_keep]
+     test = test[columns_to_keep]
+
+     if PYTHON_ENV != "Test":
+         joblib.dump(train[columns_to_keep], f"{train_data_dir}/train.pkl")
+         joblib.dump(val[columns_to_keep], f"{train_data_dir}/val.pkl")
+         joblib.dump(test[columns_to_keep], f"{train_data_dir}/test.pkl")
+
+     # scaling features
+     logger.info("Scaling features...")
+     train_scaled, scaler_x, scalers_y = scale_data(
+         train, save_dir=preprocessing_dir
+     )
+     val_scaled, _, _ = scale_data(
+         val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+     )
+     test_scaled, _, _ = scale_data(
+         test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
+     )
+
+     if PYTHON_ENV != "Test":
+         joblib.dump(train_scaled, f"{train_data_dir}/train_scaled.pkl")
+         joblib.dump(val_scaled, f"{train_data_dir}/val_scaled.pkl")
+         joblib.dump(test_scaled, f"{train_data_dir}/test_scaled.pkl")
+
+     data = {
+         "train": train,
+         "val": val,
+         "test": test,
+         "train_scaled": train_scaled,
+         "val_scaled": val_scaled,
+         "test_scaled": test_scaled,
+         "scalers_y": scalers_y,
+     }
+
+     list_models = ml_models + dl_recurrent_models
+     reshaped_data = None
+     if any(list_models[i].get("recurrent") for i in models_idx):
+         # reshaping data for recurrent models
+         logger.info("Reshaping data for recurrent models...")
+         reshaped_data = reshape_time_series(
+             train_scaled, val_scaled, test_scaled, all_features, timesteps=max_timesteps
+         )
+
+     # model selection and hyperoptimization
+     logger.info("Model Selection and Hyperoptimization...")
+     for target_number in targets_numbers:
+         model_selection(
+             dataset_id=dataset_id,
+             models_idx=models_idx,
+             target_number=target_number,
+             session_name=session_name,
+             perform_hyperoptimization=perform_hyperoptimization,
+             perform_crossval=perform_crossval,
+             number_of_trials=number_of_trials,
+             plot=False,
+             clean_dir=clean_dir,
+             preserve_model=preserve_model,
+             reshaped_data=reshaped_data,
+             data=(data or None),
+         )
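
training.py chains the full pipeline: data sourcing, feature engineering, train/val/test splits, per-target feature selection, scaling, optional time-series reshaping for recurrent models, and per-target model selection. Note that every import targets src.*, while the wheel installs the code under lecrapaud/, so the module cannot be imported from the installed package without adjusting those paths. A hypothetical invocation sketch, assuming the imports are repointed at lecrapaud.* and a database plus data source are configured; the argument values are illustrative only:

from lecrapaud.training import run_training  # assumes the src.* imports are fixed

# Build a dataset from two years of data, then select features and models
# for the first target only, with a small hyperoptimization budget.
run_training(
    years_of_data=2,
    targets_numbers=range(1, 2),
    models_idx=range(3),
    number_of_trials=10,
    session_name="demo",
)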
lecrapaud/utils.py ADDED
@@ -0,0 +1,246 @@
+ import pandas as pd
+ import logging
+ from logging.handlers import RotatingFileHandler
+ import shutil
+ import os
+ import subprocess
+ from datetime import datetime, date
+ from ftfy import fix_text
+ import unicodedata
+ import re
+ import string
+
+ from src.directory_management import logger_dir
+ from src.config import LOGGING_LEVEL, PYTHON_ENV
+
+ _LOGGER_ALREADY_CONFIGURED = False
+
+
+ def setup_logger():
+
+     global _LOGGER_ALREADY_CONFIGURED
+     if _LOGGER_ALREADY_CONFIGURED:  # ← bail out if done before
+
+         return logging.getLogger("stock" if PYTHON_ENV != "Worker" else "")
+
+     print(
+         f"Setting up logger with PYTHON_ENV {PYTHON_ENV} and LOGGING_LEVEL {LOGGING_LEVEL}"
+     )
+     # ------------------------------------------------------------------ #
+     # Real configuration happens only on the FIRST call                   #
+     # ------------------------------------------------------------------ #
+     fmt = "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s"
+     datefmt = "%Y-%m-%d %H:%M:%S"
+     logging.basicConfig(format=fmt, datefmt=datefmt)  # root format
+     formatter = logging.Formatter(fmt, datefmt=datefmt)
+
+     logger = logging.getLogger("" if PYTHON_ENV == "Worker" else "stock")
+
+     log_level = getattr(logging, LOGGING_LEVEL.upper(), logging.INFO)
+     logger.setLevel(log_level)
+
+     # pick a file according to environment
+     env_file = {
+         "Development": "dev.log",
+         "Production": "prod.log",
+         "Test": "test.log",
+         "Worker": "worker.log",
+     }.get(PYTHON_ENV, "app.log")
+
+     file_handler = RotatingFileHandler(
+         f"{logger_dir}/{env_file}",
+         maxBytes=5 * 1024 * 1024,
+         backupCount=3,
+     )
+     file_handler.setFormatter(formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)
+
+     _LOGGER_ALREADY_CONFIGURED = True
+     return logger
+
+
+ logger = setup_logger()
+
+
+ def get_df_name(obj, namespace):
+     return [name for name in namespace if namespace[name] is obj][0]
+
+
+ def pprint(item):
+     with pd.option_context("display.max_rows", None):
+         logger.info(item)
+
+
+ def object_to_dict(obj):
+     if isinstance(obj, dict):
+         return {k: object_to_dict(v) for k, v in obj.items()}
+     elif hasattr(obj, "__dict__"):
+         return {k: object_to_dict(v) for k, v in obj.__dict__.items()}
+     elif isinstance(obj, list):
+         return [object_to_dict(i) for i in obj]
+     else:
+         return obj
+
+
+ def copy_any(src, dst):
+     if os.path.isdir(src):
+         # Copy folder using copytree
+         shutil.copytree(src, dst)
+     else:
+         # Copy file using copy2 (which preserves metadata)
+         shutil.copy2(src, dst)
+
+
+ def contains_best(folder_path):
+     # Iterate over all files and folders in the specified directory
+     for root, dirs, files in os.walk(folder_path):
+         # Check each file and folder name for '.best' or '.keras'
+         for name in files + dirs:
+             if ".best" in name or ".keras" in name:
+                 return True
+     return False
+
+
+ def get_folder_sizes(directory=os.path.expanduser("~")):
+     folder_sizes = {}
+
+     for folder in os.listdir(directory):
+         folder_path = os.path.join(directory, folder)
+         if os.path.isdir(folder_path):
+             try:
+                 size = (
+                     subprocess.check_output(["du", "-sk", folder_path])
+                     .split()[0]
+                     .decode("utf-8")
+                 )
+                 folder_sizes[folder] = int(size)
+             except subprocess.CalledProcessError:
+                 logger.info(f"Skipping {folder_path}: Permission Denied")
+
+     sorted_folders = sorted(folder_sizes.items(), key=lambda x: x[1], reverse=True)
+     logger.info(f"{'Folder':<50}{'Size (MB)':>10}")
+     logger.info("=" * 60)
+     for folder, size in sorted_folders:
+         logger.info(f"{folder:<50}{size / (1024*1024):>10.2f}")
+
+
+ def create_cron_job(
+     script_path,
+     venv_path,
+     log_file,
+     pythonpath,
+     cwd,
+     job_frequency="* * * * *",
+     cron_name="My Custom Cron Job",
+ ):
+     """
+     Creates a cron job to run a Python script with a virtual environment, logging output, and setting PYTHONPATH and CWD.
+
+     Parameters:
+     - script_path (str): Path to the Python script to run.
+     - venv_path (str): Path to the virtual environment's Python interpreter.
+     - log_file (str): Path to the log file for output.
+     - pythonpath (str): Value for the PYTHONPATH environment variable.
+     - cwd (str): Working directory from which the script should run.
+     - job_frequency (str): Cron timing syntax (default is every minute).
+     - cron_name (str): Name to identify the cron job.
+     """
+     # Construct the cron command
+     cron_command = (
+         f"{job_frequency} /bin/zsh -c 'pgrep -fl python | grep -q {os.path.basename(script_path)} "
+         f'|| (echo -e "Cron job {cron_name} started at $(date)" >> {log_file} && cd {cwd} && '
+         f"PYTHONPATH={pythonpath} {venv_path}/bin/python {script_path} >> {log_file} 2>&1)'"
+     )
+
+     # Check existing cron jobs and remove any with the same comment
+     subprocess.run(f"(crontab -l | grep -v '{cron_name}') | crontab -", shell=True)
+
+     # Add the new cron job with the comment
+     full_cron_job = f"{cron_command} # {cron_name}\n"
+     subprocess.run(f'(crontab -l; echo "{full_cron_job}") | crontab -', shell=True)
+     logger.info(f"Cron job created: {full_cron_job}")
+
+
+ def remove_all_cron_jobs():
+     """
+     Removes all cron jobs for the current user.
+     """
+     try:
+         # Clear the user's crontab
+         subprocess.run("crontab -r", shell=True, check=True)
+         logger.info("All cron jobs have been removed successfully.")
+     except subprocess.CalledProcessError:
+         logger.info(
+             "Failed to remove cron jobs. There may not be any cron jobs to remove, or there could be a permissions issue."
+         )
+
+
+ def serialize_timestamp(dict: dict):
+     def convert(obj):
+         if isinstance(obj, (datetime, date, pd.Timestamp)):
+             return obj.isoformat()
+
+         return obj
+
+     return [{k: convert(v) for k, v in item.items()} for item in dict]
+
+
+ def remove_accents(text: str) -> str:
+     """
+     Cleans the text of:
+     - Broken Unicode
+     - Accents
+     - Control characters (including \x00, \u0000, etc.)
+     - Escape sequences
+     - Non-printable characters
+     - Excessive punctuation (like ........ or !!!!)
+     """
+
+     # Step 1: Fix mojibake and broken Unicode
+     text = fix_text(text)
+
+     # Step 2: Normalize accents
+     text = unicodedata.normalize("NFKD", text)
+     text = text.encode("ASCII", "ignore").decode("utf8")
+
+     # Step 3: Remove known weird tokens
+     text = text.replace("<|endoftext|>", "")
+     text = text.replace("\u0000", "").replace("\x00", "")
+
+     # Step 4: Remove raw control characters (e.g., \x1f)
+     text = "".join(c for c in text if unicodedata.category(c)[0] != "C" or c == "\n")
+
+     # Step 5: Remove literal escape sequences like \xNN
+     text = re.sub(r"\\x[0-9a-fA-F]{2}", "", text)
+
+     # Step 6: Remove non-printable characters
+     printable = set(string.printable)
+     text = "".join(c for c in text if c in printable)
+
+     # Step 7: Collapse repeated punctuation (e.g., ........ → .)
+     text = re.sub(r"([!?.])\1{2,}", r"\1", text)  # !!!!!! → !
+     text = re.sub(r"([-—])\1{1,}", r"\1", text)  # ------ → -
+     text = re.sub(r"([,.]){4,}", r"\1", text)  # ...... → .
+
+     return text.strip()
+
+
+ def serialize_for_json(obj):
+     """
+     Recursively convert any object into a JSON-serializable structure.
+     Classes and class instances are converted to readable strings like 'ClassName()'.
+     """
+     if isinstance(obj, (str, int, float, bool, type(None))):
+         return obj
+     elif isinstance(obj, dict):
+         return {str(k): serialize_for_json(v) for k, v in obj.items()}
+     elif isinstance(obj, (list, tuple, set)):
+         return [serialize_for_json(v) for v in obj]
+     elif isinstance(obj, type):
+         # A class/type object like int, str, etc.
+         return obj.__name__
+     elif hasattr(obj, "__class__"):
+         return f"{obj.__class__.__name__}()"
+     else:
+         return str(obj)
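
Beyond the logger setup, utils.py bundles filesystem helpers, crontab management, and text cleaning. The pure helpers can be exercised in isolation; a short sketch, assuming the module's src.* imports resolve (importing it configures the logger as a side effect), that ftfy is installed, and using a hypothetical fixed import path:

from datetime import datetime

from lecrapaud.utils import remove_accents, serialize_for_json  # hypothetical fixed path

# Accents are stripped, the null byte dropped, and repeated punctuation collapsed.
print(remove_accents("Café!!!\x00 déjà vu......"))  # -> "Cafe! deja vu."

# Types serialize to their names, sets to lists, instances to 'ClassName()'.
print(serialize_for_json({"cls": datetime, "vals": {1, 2}, "now": datetime(2025, 1, 1)}))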