lecrapaud 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +68 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.2.dist-info/METADATA +177 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.2.dist-info}/LICENSE +0 -0
lecrapaud/predictions.py DELETED
@@ -1,292 +0,0 @@
- import keras
- import pickle
- import pandas as pd
- from pathlib import Path
- import joblib
- from datetime import timedelta, datetime
- import logging
-
- from src.search_space import ml_models, dl_recurrent_models
- from src.data_sourcing import get_filtered_data
- from src.feature_engineering import feature_engineering
- from src.feature_selection import (
-     encode_categorical_features,
-     reshape_df,
-     TARGETS_CLF,
-     reshape_time_series,
- )
- from src.model_selection import predict, evaluate
- from src.utils import logger
- from src.db.models import Dataset
- from src.config import LOGGING_LEVEL
-
- MODELS_LIST = ml_models + dl_recurrent_models
-
-
- def run_prediction(
-     dataset_id: str,
-     targets_numbers: list[int],
-     test: bool = True,
-     date: datetime = None,
-     verbose: int = 0,
- ):
-     """Function to run prediction on several TARGETS using best models"""
-     if verbose == 0:
-         logger.setLevel(logging.WARNING)
-
-     logger.warning("Running prediction...")
-
-     dataset = Dataset.get(dataset_id)
-     dataset_dir = dataset.path
-     preprocessing_dir = f"{dataset_dir}/preprocessing"
-     list_of_groups = dataset.list_of_groups
-
-     features_dict = {}
-     scaler_y_dict = {}
-     model_dict = {}
-     threshold_dict = {}
-     for target_number in targets_numbers:
-         (
-             model_dict[target_number],
-             threshold_dict[target_number],
-             features_dict[target_number],
-             scaler_y_dict[target_number],
-             all_features,
-             scaler_x,
-         ) = load_model(dataset, target_number)
-
-     # get data for backtesting
-     if test:
-         train_data_dir = f"{dataset_dir}/data"
-         data_for_pred = joblib.load(f"{train_data_dir}/test.pkl")
-         data_for_pred_scaled = joblib.load(f"{train_data_dir}/test_scaled.pkl")
-
-         if any(
-             config["recurrent"]
-             for config in MODELS_LIST
-             if config["model_name"] in model_dict.values()
-         ):
-             train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
-             val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
-             test_scaled = joblib.load(f"{train_data_dir}/test_scaled.pkl")
-             reshaped_data = reshape_time_series(
-                 train_scaled, val_scaled, test_scaled, all_features, timesteps=120
-             )
-             data_for_pred_reshaped = reshaped_data["x_train_reshaped"]
-
-         most_recent_data = joblib.load(f"{train_data_dir}/full.pkl")
-         most_recent_data = most_recent_data.loc[data_for_pred.index]
-
-         scores_clf = []
-         scores_reg = []
-     # get data for predicting future
-     else:
-         # TODO: if date is a bit more older, need more than 0 years
-         most_recent_data = get_filtered_data(
-             years_of_data=0, list_of_groups=list_of_groups
-         )
-
-         most_recent_data = feature_engineering(
-             most_recent_data, for_training=False, save_as_csv=True
-         )
-
-         data_for_pred = encode_categorical_features(
-             most_recent_data, save_dir=preprocessing_dir, fit=False
-         )
-
-         data_for_pred_scaled = pd.DataFrame(
-             scaler_x.transform(data_for_pred[all_features]),
-             columns=list(data_for_pred[all_features].columns),
-             index=data_for_pred.index,
-         )
-
-         # TODO: don't we need to have 120 days of data for each stock?
-         if any(
-             config["recurrent"]
-             for config in MODELS_LIST
-             if config["model_name"] in model_dict.values()
-         ):
-             # Count number of rows per stock
-             counts = data_for_pred["STOCK"].value_counts()
-
-             # Find stocks with insufficient history
-             insufficient_stocks = counts[counts < 120]
-
-             if not insufficient_stocks.empty:
-                 raise ValueError(
-                     f"Insufficient history for stocks: {', '.join(insufficient_stocks.index)}"
-                 )
-
-             data_for_pred_reshaped = reshape_df(
-                 data_for_pred_scaled[all_features], data_for_pred["STOCK"], 120
-             )
-
-     # make prediction
-     for target_number in targets_numbers:
-
-         # Prepare variables and data
-         target_type = "classification" if target_number in TARGETS_CLF else "regression"
-         features = features_dict[target_number]
-         model = model_dict[target_number]
-         threshold = threshold_dict[target_number]
-
-         config = [
-             config for config in MODELS_LIST if config["model_name"] == model.model_name
-         ]
-         if config is None or len(config) == 0:
-             Exception(f"Model {model.model_name} was not found in search space.")
-         else:
-             config = config[0]
-
-         need_scaling = config["need_scaling"] and target_type == "regression"
-         if config["recurrent"]:
-             features_idx = [i for i, e in enumerate(all_features) if e in set(features)]
-             x_pred = data_for_pred_reshaped[:, :, features_idx]
-         else:
-             x_pred = (
-                 data_for_pred_scaled[features]
-                 if need_scaling
-                 else data_for_pred[features]
-             )
-
-         # Predict
-         y_pred = predict(model, x_pred, target_type, config, threshold)
-
-         # Fix for recurrent model because x_val has no index as it is a 3D np array
-         if config["recurrent"]:
-             y_pred.index = (
-                 most_recent_data.index
-             )  # TODO: not sure this will work for old dataset not aligned with data_for_training for test use case (done, this is why we decode the test set)
-
-         # Unscale prediction
-         if need_scaling or config["recurrent"]:
-             scaler_y = scaler_y_dict[target_number]
-             y_pred = pd.Series(
-                 scaler_y.inverse_transform(y_pred.values.reshape(-1, 1)).flatten(),
-                 index=most_recent_data.index,
-             )
-             y_pred.name = "PRED"
-
-         # Evaluate if test
-         if test:
-             prediction = pd.concat(
-                 [most_recent_data[f"TARGET_{target_number}"], y_pred], axis=1
-             )
-             prediction.rename(
-                 columns={f"TARGET_{target_number}": "TARGET"}, inplace=True
-             )
-             score = evaluate(prediction, target_type)
-             score["TARGET"] = f"TARGET_{target_number}"
-             (
-                 scores_clf.append(score)
-                 if target_type == "classification"
-                 else scores_reg.append(score)
-             )
-
-         if isinstance(y_pred, pd.DataFrame):
-             y_pred.rename(
-                 columns={"PRED": f"TARGET_{target_number}_PRED"}, inplace=True
-             )
-             most_recent_data = pd.concat(
-                 [most_recent_data, y_pred[f"TARGET_{target_number}_PRED"]], axis=1
-             )
-
-         else:
-             y_pred.name = f"TARGET_{target_number}_PRED"
-             most_recent_data = pd.concat([most_recent_data, y_pred], axis=1)
-
-     # return result either for test set or for tomorrow prediction
-     result = most_recent_data
-
-     if verbose == 0:
-         logger.setLevel(LOGGING_LEVEL)
-
-     if test:
-         logger.info("Test results on test set")
-         scores_reg = pd.DataFrame(scores_reg).set_index("TARGET")
-         scores_clf = pd.DataFrame(scores_clf).set_index("TARGET")
-         return result, scores_reg, scores_clf, prediction
-     elif date:
-         date = date.replace(hour=0, minute=0, second=0, microsecond=0)
-         tomorrow = date + timedelta(days=1)
-         logger.info(f"Prediction for : {tomorrow.date()}")
-         result = result[result["DATE"] == date]
-         return result, None, None, None
-     else:
-         date = datetime.today()
-         max_date = result["DATE"].max()
-         if max_date.date() != date.date():
-             logger.info(
-                 f"The maximum date found in the dataset is {max_date} and not {date}"
-             )
-         tomorrow = max_date + timedelta(days=1)
-         logger.info(f"Prediction for tomorrow : {tomorrow.date()}")
-
-         # Filter the DataFrame for the last date
-         filtered_result = result[result["DATE"] == max_date]
-
-         return filtered_result, None, None, None
-
-
- # Helpers
- def load_model(dataset: Dataset, target_number: int):
-     dataset_dir = dataset.path
-     training_target_dir = f"{dataset_dir}/TARGET_{target_number}"
-     preprocessing_dir = f"{dataset_dir}/preprocessing"
-
-     # Search for files that contain '.best' or '.keras' in the name
-     scores_tracking = pd.read_csv(f"{training_target_dir}/scores_tracking.csv")
-     training_target_dir = Path(training_target_dir)
-     best_files = list(training_target_dir.glob("*.best*")) + list(
-         training_target_dir.glob("*.keras*")
-     )
-     threshold = (
-         scores_tracking["THRESHOLD"].values[0]
-         if "THRESHOLD" in scores_tracking.columns
-         else None
-     )
-
-     # If any files are found, try loading the first one (or process as needed)
-     if best_files:
-         file_path = best_files[0]  # Assuming you want to open the first matching file
-         try:
-             # Attempt to load the file as a scikit-learn, XGBoost, or LightGBM model (Pickle format)
-             model = joblib.load(file_path)
-             logger.info(f"Loaded model {model.model_name} and threshold {threshold}")
-         except (pickle.UnpicklingError, EOFError):
-             # If it's not a pickle file, try loading it as a Keras model
-             try:
-                 # Attempt to load the file as a Keras model
-                 model = keras.models.load_model(file_path)
-                 logger.info(
-                     f"Loaded model {model.model_name} and threshold {threshold}"
-                 )
-             except Exception as e:
-                 raise FileNotFoundError(
-                     f"Model could not be loaded from path: {file_path}: {e}"
-                 )
-     else:
-         raise FileNotFoundError(
-             f"No files with '.best' or '.keras' found in the specified folder: {training_target_dir}"
-         )
-
-     if dataset.name == "data_28_X_X":
-         features = joblib.load(
-             f"{preprocessing_dir}/features_{target_number}.pkl"
-         )  # we keep this for backward compatibility
-     else:
-         features = dataset.get_features(target_number)
-
-     scaler_y = None
-     if target_number not in TARGETS_CLF:
-         scaler_y = joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
-
-     if dataset.name == "data_28_X_X":
-         all_features = joblib.load(
-             f"{preprocessing_dir}/all_features.pkl"
-         )  # we keep this for backward compatibility
-     else:
-         all_features = dataset.get_all_features()
-     scaler_x = joblib.load(f"{preprocessing_dir}/scaler_x.pkl")
-
-     return model, threshold, features, scaler_y, all_features, scaler_x
lecrapaud/training.py DELETED
@@ -1,151 +0,0 @@
- import logging
- import joblib
- from pathlib import Path
- import os
- from src.utils import logger
-
- from src.feature_engineering import feature_engineering
- from src.feature_selection import (
-     create_sets_from_data,
-     feature_selection,
-     scale_data,
-     reshape_time_series,
- )
- from src.model_selection import model_selection, test_hardware
- from src.data_sourcing import get_filtered_data
- from src.constants import stock_list_3, stock_list_1
- from src.search_space import ml_models, dl_recurrent_models
- from src.directory_management import tmp_dir
- from src.db.models import Dataset
- from src.config import PYTHON_ENV
-
-
- def run_training(
-     dataset_id=None,
-     years_of_data=2,
-     list_of_groups=stock_list_1,
-     percentile=15,
-     corr_threshold=80,
-     max_features=20,
-     max_timesteps=120,
-     targets_numbers=range(1, 15),
-     models_idx=range(len(ml_models)),
-     number_of_trials=20,
-     perform_hyperoptimization=True,
-     perform_crossval=False,
-     clean_dir=False,
-     preserve_model=False,
-     session_name="test",
- ):
-     logging.captureWarnings(True)
-
-     if dataset_id is None:
-         # Get the data
-         logger.info("Getting data...")
-         data = get_filtered_data(
-             years_of_data=years_of_data,
-             list_of_groups=list_of_groups,
-         )
-
-         # preprocess & feature engineering
-         logger.info("Preprocessing...")
-         data_for_training = feature_engineering(
-             data, for_training=True, save_as_csv=True
-         )
-
-         # train / val / test sets
-         train, val, test, dataset = create_sets_from_data(
-             data_for_training,
-             percentile=percentile,
-             corr_threshold=corr_threshold,
-             max_features=max_features,
-         )
-         dataset_dir = dataset.path
-         dataset_id = dataset.id
-         train_data_dir = f"{dataset_dir}/data"
-         os.makedirs(train_data_dir, exist_ok=True)
-         preprocessing_dir = f"{dataset_dir}/preprocessing"
-
-     # feature selection
-     logger.info("Feature Selection...")
-     for target_number in targets_numbers:
-         feature_selection(
-             dataset_id=dataset_id,
-             train=train,
-             target_number=target_number,
-             single_process=True,
-         )
-
-     dataset = Dataset.get(dataset_id)
-     all_features = dataset.get_all_features()
-     columns_to_keep = all_features + [f"TARGET_{i}" for i in range(1, 15)]
-     logger.info(columns_to_keep)
-     duplicates = [
-         col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
-     ]
-
-     if duplicates:
-         raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
-
-     train = train[columns_to_keep]
-     val = val[columns_to_keep]
-     test = test[columns_to_keep]
-
-     if PYTHON_ENV != "Test":
-         joblib.dump(train[columns_to_keep], f"{train_data_dir}/train.pkl")
-         joblib.dump(val[columns_to_keep], f"{train_data_dir}/val.pkl")
-         joblib.dump(test[columns_to_keep], f"{train_data_dir}/test.pkl")
-
-     # scaling features
-     logger.info("Scaling features...")
-     train_scaled, scaler_x, scalers_y = scale_data(
-         train, save_dir=preprocessing_dir
-     )
-     val_scaled, _, _ = scale_data(
-         val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
-     )
-     test_scaled, _, _ = scale_data(
-         test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y
-     )
-
-     if PYTHON_ENV != "Test":
-         joblib.dump(train_scaled, f"{train_data_dir}/train_scaled.pkl")
-         joblib.dump(val_scaled, f"{train_data_dir}/val_scaled.pkl")
-         joblib.dump(test_scaled, f"{train_data_dir}/test_scaled.pkl")
-
-     data = {
-         "train": train,
-         "val": val,
-         "test": test,
-         "train_scaled": train_scaled,
-         "val_scaled": val_scaled,
-         "test_scaled": test_scaled,
-         "scalers_y": scalers_y,
-     }
-
-     list_models = ml_models + dl_recurrent_models
-     reshaped_data = None
-     if any(list_models[i].get("recurrent") for i in models_idx):
-         # reshaping data for recurrent models
-         logger.info("Reshaping data for recurrent models...")
-         reshaped_data = reshape_time_series(
-             train_scaled, val_scaled, test_scaled, all_features, timesteps=max_timesteps
-         )
-
-     # model selection and hyperoptimization
-     logger.info("Model Selection and Hyperoptimization...")
-     for target_number in targets_numbers:
-         model_selection(
-             dataset_id=dataset_id,
-             models_idx=models_idx,
-             target_number=target_number,
-             session_name=session_name,
-             perform_hyperoptimization=perform_hyperoptimization,
-             perform_crossval=perform_crossval,
-             number_of_trials=number_of_trials,
-             plot=False,
-             clean_dir=clean_dir,
-             preserve_model=preserve_model,
-             reshaped_data=reshaped_data,
-             data=(data or None),
-         )
lecrapaud-0.4.0.dist-info/METADATA DELETED
@@ -1,103 +0,0 @@
- Metadata-Version: 2.3
- Name: lecrapaud
- Version: 0.4.0
- Summary: Framework for machine and deep learning, with regression, classification and time series analysis
- License: Apache License
- Author: Pierre H. Gallet
- Requires-Python: ==3.12.*
- Classifier: License :: Other/Proprietary License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: backoff (>=2.2.1)
- Requires-Dist: category-encoders (>=2.8.1)
- Requires-Dist: celery (>=5.5.1)
- Requires-Dist: curl-cffi (>=0.11.1)
- Requires-Dist: deep-translator (>=1.11.4)
- Requires-Dist: degiro-connector (>=3.0.26)
- Requires-Dist: fake-useragent (>=2.1.0)
- Requires-Dist: ftfy (>=6.3.1)
- Requires-Dist: honeybadger (>=0.21)
- Requires-Dist: joblib (>=1.4.2)
- Requires-Dist: keras (>=3.9.0)
- Requires-Dist: keras-tcn (>=3.1.2)
- Requires-Dist: lightgbm (>=4.6.0)
- Requires-Dist: matplotlib (>=3.10.1)
- Requires-Dist: mlxtend (>=0.23.4)
- Requires-Dist: numpy (>=2.1.3)
- Requires-Dist: pandas (>=2.2.3)
- Requires-Dist: pandas-market-calendars (>=4.6.1)
- Requires-Dist: playwright (>=1.52.0)
- Requires-Dist: pydantic (>=2.10.6)
- Requires-Dist: python-dotenv (>=1.0.1)
- Requires-Dist: pytz (>=2025.1)
- Requires-Dist: ratelimit (>=2.2.1)
- Requires-Dist: scikit-learn (>=1.6.1)
- Requires-Dist: scipy (>=1.15.2)
- Requires-Dist: seaborn (>=0.13.2)
- Requires-Dist: sentence-transformers (>=3.4.1)
- Requires-Dist: sqlalchemy (>=2.0.39)
- Requires-Dist: tensorboardx (>=2.6.2.2)
- Requires-Dist: tensorflow (>=2.19.0)
- Requires-Dist: tf-keras (>=2.19.0)
- Requires-Dist: tqdm (>=4.67.1)
- Requires-Dist: xgboost (>=3.0.0)
- Requires-Dist: yahoo-fin (>=0.8.9.1)
- Requires-Dist: yfinance (>=0.2.55)
- Description-Content-Type: text/markdown
-
- # Stock
-
- ## Overview
-
- ## Project description
-
- ## Quick Start
-
- 1. Create environment
-
- ```sh
- $ pip install virtualenv
- $ python -m venv .venv
- $ source .venv/bin/activate
- ```
-
- 2. Install dependencies
-
- ```sh
- $ pip install -r requirements.txt
- $ pip freeze > requirements.txt
- ```
-
- 3. Deactivate virtualenv (if needed)
-
- ```sh
- $ deactivate
- ```
-
- ## Reminders for Github usage
-
- 1. Creating Github repository
-
- ```sh
- $ brew install gh
- $ gh auth login
- $ gh repo create
- ```
-
- 2. Initializing git and first commit to distant repository
-
- ```sh
- $ git init
- $ git add .
- $ git commit -m 'first commit'
- $ git remote add origin <YOUR_REPO_URL>
- $ git push -u origin master
- ```
-
- 3. use conventional commits
- https://www.conventionalcommits.org/en/v1.0.0/#summary
-
-
- ***
-
- Pierre Gallet © 2024