lecrapaud 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lecrapaud might be problematic.

Files changed (64)
  1. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/PKG-INFO +4 -4
  2. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/README.md +3 -3
  3. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/api.py +8 -2
  4. lecrapaud-0.3.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +50 -0
  5. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/session.py +11 -0
  6. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/experiment.py +1 -1
  7. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/feature_engineering.py +11 -12
  8. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/feature_selection.py +29 -48
  9. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/model_selection.py +59 -56
  10. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/utils.py +1 -1
  11. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/pyproject.toml +1 -1
  12. lecrapaud-0.2.1/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +0 -38
  13. lecrapaud-0.2.1/lecrapaud/predictions.py +0 -292
  14. lecrapaud-0.2.1/lecrapaud/preprocessing.py +0 -984
  15. lecrapaud-0.2.1/lecrapaud/training.py +0 -239
  16. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/LICENSE +0 -0
  17. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/__init__.py +0 -0
  18. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/config.py +0 -0
  19. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/__init__.py +0 -0
  20. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/README +0 -0
  21. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/env.py +0 -0
  22. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  23. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +0 -0
  24. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +0 -0
  25. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +0 -0
  26. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +0 -0
  27. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +0 -0
  28. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +0 -0
  29. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +0 -0
  30. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +0 -0
  31. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +0 -0
  32. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +0 -0
  33. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +0 -0
  34. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +0 -0
  35. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +0 -0
  36. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +0 -0
  37. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +0 -0
  38. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/__init__.py +0 -0
  39. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/base.py +0 -0
  40. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/dataset.py +0 -0
  41. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/feature.py +0 -0
  42. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection.py +0 -0
  43. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  44. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/model.py +0 -0
  45. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/model_selection.py +0 -0
  46. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/model_training.py +0 -0
  47. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/score.py +0 -0
  48. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/models/target.py +0 -0
  49. lecrapaud-0.2.1/lecrapaud/directory_management.py → lecrapaud-0.3.0/lecrapaud/directories.py +0 -0
  50. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/integrations/openai_integration.py +0 -0
  51. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/jobs/__init__.py +0 -0
  52. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/jobs/config.py +0 -0
  53. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/jobs/scheduler.py +0 -0
  54. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/jobs/tasks.py +0 -0
  55. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/search_space.py +0 -0
  56. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/services/__init__.py +0 -0
  57. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/services/embedding_categorical.py +0 -0
  58. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/services/indicators.py +0 -0
  59. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/experiments.py +0 -0
  60. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-bilstm.ipynb +0 -0
  61. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-resnet.ipynb +0 -0
  62. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-transformers.ipynb +0 -0
  63. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/tests.ipynb +0 -0
  64. {lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/speed_tests/trash.py +0 -0
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: lecrapaud
- Version: 0.2.1
+ Version: 0.3.0
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
  License: Apache License
  Author: Pierre H. Gallet
@@ -49,7 +49,7 @@ Description-Content-Type: text/markdown

  <div align="center">

- <div style="font-size:50rem;">🐸</div>
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>

  ## Welcome to LeCrapaud

@@ -73,13 +73,13 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
  ## ⚡ Quick Start


- ### ⚙️ Install the package
+ ### Install the package

  ```sh
  pip install lecrapaud
  ```

- ### 🛠️ How it works
+ ### How it works

  This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).

{lecrapaud-0.2.1 → lecrapaud-0.3.0}/README.md
@@ -1,6 +1,6 @@
  <div align="center">

- <div style="font-size:50rem;">🐸</div>
+ <img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>

  ## Welcome to LeCrapaud

@@ -24,13 +24,13 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
  ## ⚡ Quick Start


- ### ⚙️ Install the package
+ ### Install the package

  ```sh
  pip install lecrapaud
  ```

- ### 🛠️ How it works
+ ### How it works

  This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).

{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/api.py
@@ -32,6 +32,8 @@ experiment.model_selection(data) : return best_model
  import joblib
  import pandas as pd
+ import logging
+ from lecrapaud.utils import logger
  from lecrapaud.db.session import init_db
  from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
  from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine
@@ -103,7 +105,12 @@ class Experiment:
          std_data, reshaped_data = self.preprocess_model(train, val, test)
          self.model_selection(std_data, reshaped_data)

-     def predict(self, new_data):
+     def predict(self, new_data, verbose: int = 0):
+         if verbose == 0:
+             logger.setLevel(logging.WARNING)
+
+         logger.warning("Running prediction...")
+
          data = self.feature_engineering(
              data=new_data,
              for_training=False,
@@ -127,7 +134,6 @@ class Experiment:
          else:
              features = self.dataset.get_features(target_number)
          model = ModelEngine(path=training_target_dir)
-         model.load()

          # getting data
          if model.recurrent:
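`Experiment.predict` gains a `verbose` flag: at the default of 0, the library logger is capped at WARNING for the call, so prediction runs quietly. A minimal usage sketch (the helper function and data are illustrative; only `predict` and its `verbose` parameter come from this diff):

```python
import pandas as pd

from lecrapaud.api import Experiment  # Experiment is defined in lecrapaud/api.py


def run_inference(experiment: Experiment, new_data: pd.DataFrame):
    # verbose=0 (the default) silences log records below WARNING
    quiet_preds = experiment.predict(new_data)
    # any non-zero value leaves the configured log level untouched
    verbose_preds = experiment.predict(new_data, verbose=1)
    return quiet_preds, verbose_preds
```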
lecrapaud-0.3.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py (new file)
@@ -0,0 +1,50 @@
+ """make_nullablee
+
+ Revision ID: 52b809a34371
+ Revises: 339927587383
+ Create Date: 2025-05-31 18:34:58.962966
+
+ """
+
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+ from sqlalchemy.dialects import mysql
+
+ # revision identifiers, used by Alembic.
+ revision: str = "52b809a34371"
+ down_revision: Union[str, None] = "339927587383"
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     op.alter_column(
+         "investment_runs",
+         "initial_portfolio",
+         existing_type=mysql.JSON(),
+         nullable=True,
+     )
+     op.create_foreign_key(
+         None,
+         "portfolios",
+         "investment_runs",
+         ["investment_run_id"],
+         ["id"],
+         ondelete="CASCADE",
+     )
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     op.drop_constraint(None, "portfolios", type_="foreignkey")
+     op.alter_column(
+         "investment_runs",
+         "initial_portfolio",
+         existing_type=mysql.JSON(),
+         nullable=False,
+     )
+     # ### end Alembic commands ###
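For reference, the new revision can also be applied on its own through Alembic's Python API. A sketch assuming a project-local alembic.ini and a placeholder database URL; it mirrors what `init_db` now does automatically (see the session.py hunk below):

```python
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # assumes the packaged alembic.ini is at this path
cfg.set_main_option(
    "sqlalchemy.url", "mysql+pymysql://user:pass@localhost:3306/lecrapaud"  # placeholder URL
)
command.upgrade(cfg, "52b809a34371")  # this revision only; use "head" for all pending revisions
```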
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/db/session.py
@@ -4,6 +4,9 @@ from contextlib import contextmanager
  from sqlalchemy import create_engine, text
  from sqlalchemy.orm import sessionmaker
  from urllib.parse import urlparse
+ from alembic.config import Config
+ from alembic import command
+ import os

  from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_URI

@@ -39,6 +42,14 @@ def init_db(uri: str = None):
      # Step 4: Create session factory
      _SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_engine)

+     # Step 5: Apply Alembic migrations programmatically
+     project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+     alembic_cfg_path = os.path.join(project_root, "alembic.ini")
+
+     alembic_cfg = Config(alembic_cfg_path)
+     alembic_cfg.set_main_option("sqlalchemy.url", uri or os.getenv("DATABASE_URL"))
+     command.upgrade(alembic_cfg, "head")
+

  # Dependency to get a session instance
  @contextmanager
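`init_db` now upgrades the schema to the Alembic `head` on every initialization, so no separate migration step is needed. A minimal sketch (the MySQL URL is a placeholder; note that the migration step falls back to the `DATABASE_URL` environment variable when no URI is passed):

```python
import os

from lecrapaud.db.session import init_db

# either pass the URI explicitly...
init_db(uri="mysql+pymysql://user:pass@localhost:3306/lecrapaud")  # placeholder credentials

# ...or let the Alembic step pick it up from the environment, per the fallback above
os.environ["DATABASE_URL"] = "mysql+pymysql://user:pass@localhost:3306/lecrapaud"
init_db()
```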
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/experiment.py
@@ -5,7 +5,7 @@ from pathlib import Path
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

  # Internal
- from lecrapaud.directory_management import tmp_dir
+ from lecrapaud.directories import tmp_dir
  from lecrapaud.utils import logger
  from lecrapaud.config import PYTHON_ENV
  from lecrapaud.db import (
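This one-line change is part of a release-wide rename: `lecrapaud.directory_management` becomes `lecrapaud.directories` (file 49 in the list above), and the same import fix recurs in feature_selection.py, model_selection.py, and utils.py below. Downstream code that imported the old module needs the matching update:

```python
# lecrapaud 0.2.1
from lecrapaud.directory_management import tmp_dir

# lecrapaud 0.3.0
from lecrapaud.directories import tmp_dir
```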
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/feature_engineering.py
@@ -101,7 +101,7 @@ class FeatureEngineeringEngine:

      def run(self) -> pd.DataFrame:
          # drop columns
-         self.data = self.data.drop(columns=self.columns_drop)
+         self.data = self.data.drop(columns=self.columns_drop, errors="ignore")

          # convert object columns to numeric if possible
          self.data = convert_object_columns_that_are_numeric(self.data)
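The added `errors="ignore"` follows standard pandas semantics: labels missing from the frame are skipped instead of raising, so feature engineering no longer fails when a configured drop column is absent from the input. For example:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# df.drop(columns=["B", "MISSING"])  # raises KeyError: "['MISSING'] not found in axis"
trimmed = df.drop(columns=["B", "MISSING"], errors="ignore")  # drops "B", skips "MISSING"
```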
@@ -324,6 +324,8 @@ class PreprocessFeature:
          **kwargs,
      ):
          self.data = data
+         self.data.columns = self.data.columns.str.upper()
+
          self.dataset = dataset
          self.columns_pca = columns_pca
          self.columns_onehot = columns_onehot
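Column names are now uppercased once, on the way into `PreprocessFeature`, which is why the lowercase `target_` checks later in this file switch to `TARGET_` and the per-transform uppercasing further down is removed. The normalization amounts to:

```python
import pandas as pd

df = pd.DataFrame({"close": [101.2], "Target_1": [1]})
df.columns = df.columns.str.upper()  # same normalization as PreprocessFeature.__init__
assert list(df.columns) == ["CLOSE", "TARGET_1"]  # matches the f"TARGET_{n}" lookups below
```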
@@ -350,7 +352,7 @@
              self.train_val_test_split_time_series()
              if self.time_series
              else self.train_val_test_split(
-                 stratify_col=f"target_{self.target_numbers[0]}"
+                 stratify_col=f"TARGET_{self.target_numbers[0]}"
              )
          )  # TODO: only stratifying first target for now

@@ -359,8 +361,7 @@
          val, _ = self.add_pca_features(test, pcas=pcas)
          test, _ = self.add_pca_features(val, pcas=pcas)

-         if PYTHON_ENV != "Test":
-             joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+         joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")

          # Encoding
          train, transformer = self.encode_categorical_features(train)
@@ -373,11 +374,10 @@
              transformer=transformer,
          )

-         if PYTHON_ENV != "Test":
-             joblib.dump(self.data, f"{self.data_dir}/full.pkl")
-             joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
-             summary = summarize_dataframe(train)
-             summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
+         joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+         joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+         summary = summarize_dataframe(train)
+         summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)

          return train, val, test

@@ -579,8 +579,8 @@
          columns_ordinal: list[str] = self.columns_ordinal
          columns_frequency: list[str] = self.columns_frequency

-         X = df.loc[:, ~df.columns.str.contains("^target_")]
-         y = df.loc[:, df.columns.str.contains("^target_")]
+         X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+         y = df.loc[:, df.columns.str.contains("^TARGET_")]
          save_in_db = False

          all_columns = (
@@ -643,7 +643,6 @@

          # Try to convert columns to best possible dtypes
          X_transformed = X_transformed.convert_dtypes()
-         X_transformed.columns = X_transformed.columns.str.upper()

          # Insert features in db
          if save_in_db:
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/feature_selection.py
@@ -37,7 +37,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
  from scipy.stats import spearmanr, kendalltau

  # Internal
- from lecrapaud.directory_management import tmp_dir, clean_directory
+ from lecrapaud.directories import tmp_dir, clean_directory
  from lecrapaud.utils import logger
  from lecrapaud.config import PYTHON_ENV
  from lecrapaud.db import (
@@ -50,10 +50,6 @@ from lecrapaud.db import (
  from lecrapaud.db.session import get_db
  from lecrapaud.search_space import all_models

- # Variables for targets handling
- TARGETS_MCLF = [11]
- GROUPING_COLUMN = "STOCK"
-
  # Annoying Warnings
  warnings.filterwarnings("ignore", category=FutureWarning)

@@ -103,7 +99,7 @@ class FeatureSelectionEngine:
      # Main feature selection function
      def run(
          self,
-         single_process: bool = False,
+         single_process: bool = True,
      ):
          """Function to do feature selection with a range of different feature selection technics

@@ -114,10 +110,7 @@
          """
          target_number = self.target_number
          target_type = self.target_type
-         if PYTHON_ENV != "Test":
-             fs_dir_target = self.fs_dir_target
-         else:
-             fs_dir_target = None
+         fs_dir_target = self.fs_dir_target

          # Create the feature selection in db
          target = Target.find_by(name=f"TARGET_{target_number}")
@@ -162,7 +155,7 @@
          # handling categorical features (only if classification)
          self.X_categorical, self.X_numerical = get_features_by_types(self.X)

-         if target_type == "classification":
+         if target_type == "classification" and self.X_categorical.shape[1] > 0:
              feat_scores = self.select_categorical_features(
                  percentile=percentile, save_dir=fs_dir_target
              )
@@ -292,24 +285,22 @@
              f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
          )
          logger.debug(features_selected_by_every_methods)
-         if PYTHON_ENV != "Test":
-             pd.Series(features_selected_list).to_csv(
-                 f"{fs_dir_target}/features_before_corr.csv",
-                 index=True,
-                 header=True,
-                 index_label="ID",
-             )
+         pd.Series(features_selected_list).to_csv(
+             f"{fs_dir_target}/features_before_corr.csv",
+             index=True,
+             header=True,
+             index_label="ID",
+         )

          # removing correlated features
          self.X = self.X[features_selected_list]
          features, features_correlated = self.remove_correlated_features(corr_threshold)
-         if PYTHON_ENV != "Test":
-             pd.Series(features).to_csv(
-                 f"{fs_dir_target}/features_before_max.csv",
-                 index=True,
-                 header=True,
-                 index_label="ID",
-             )
+         pd.Series(features).to_csv(
+             f"{fs_dir_target}/features_before_max.csv",
+             index=True,
+             header=True,
+             index_label="ID",
+         )
          features = features[:max_features]

          # adding categorical features selected
@@ -337,8 +328,7 @@
          best_features_path = Path(
              f"{self.preprocessing_dir}/features_{target_number}.pkl"
          ).resolve()
-         if PYTHON_ENV != "Test":
-             joblib.dump(features, best_features_path)
+         joblib.dump(features, best_features_path)

          # save in db
          db_features = Feature.filter(name__in=features)
@@ -798,6 +788,7 @@ class PreprocessModel:

          self.dataset_dir = dataset.path
          self.data_dir = f"{self.dataset_dir}/data"
+         self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"

          self.all_features = dataset.get_all_features(
              date_column=date_column, group_column=group_column
@@ -819,31 +810,23 @@

      def run(self):
          # save data
-         if PYTHON_ENV != "Test":
-             joblib.dump(self.train, f"{self.data_dir}/train.pkl")
-             joblib.dump(self.val, f"{self.data_dir}/val.pkl")
-             joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-             preprocessing_dir = f"{self.dataset_dir}/preprocessing"
-         else:
-             preprocessing_dir = None
+         joblib.dump(self.train, f"{self.data_dir}/train.pkl")
+         joblib.dump(self.val, f"{self.data_dir}/val.pkl")
+         joblib.dump(self.test, f"{self.data_dir}/test.pkl")

          # scaling features
          if any(t not in self.target_clf for t in self.target_numbers) and any(
              all_models[i].get("need_scaling") for i in self.models_idx
          ):
              logger.info("Scaling features...")
-             train_scaled, scaler_x, scalers_y = self.scale_data(
-                 self.train, save_dir=preprocessing_dir
-             )
+             train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
              val_scaled, _, _ = self.scale_data(
                  self.val,
-                 save_dir=preprocessing_dir,
                  scaler_x=scaler_x,
                  scalers_y=scalers_y,
              )
              test_scaled, _, _ = self.scale_data(
                  self.test,
-                 save_dir=preprocessing_dir,
                  scaler_x=scaler_x,
                  scalers_y=scalers_y,
              )
@@ -853,10 +836,9 @@
              test_scaled = None

          # save data
-         if PYTHON_ENV != "Test":
-             joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
-             joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
-             joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
+         joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
+         joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
+         joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")

          data = {
              "train": self.train,
@@ -923,7 +905,6 @@
      def scale_data(
          self,
          df: pd.DataFrame,
-         save_dir: str,
          scaler_x=None,
          scalers_y: Optional[list] = None,
      ):
@@ -939,8 +920,7 @@
          X_scaled = pd.DataFrame(
              scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
          )
-         if save_dir:
-             joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
+         joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")

          # Determine which targets need to be scaled
          targets_numbers_to_scale = [
@@ -969,8 +949,9 @@
                  columns=y.columns,
                  index=y.index,
              )
-             if save_dir:
-                 joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
+             joblib.dump(
+                 scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
+             )

              scalers_y[f"scaler_y_{target_number}"] = scaler_y
              scaled_targets[target_number] = scaled_y
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/model_selection.py
@@ -65,7 +65,7 @@ from ray.air import session

  # Internal library
  from lecrapaud.search_space import all_models
- from lecrapaud.directory_management import clean_directory
+ from lecrapaud.directories import clean_directory
  from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
  from lecrapaud.config import PYTHON_ENV
  from lecrapaud.feature_selection import load_train_data
@@ -120,8 +120,9 @@ class ModelEngine:
          plot: bool = False,
          log_dir: str = None,
      ):
+         self.path = path
          if path:
-             self.load(path)
+             self.load()
          else:
              self.model_name = model_name
              self.target_type = target_type
@@ -134,6 +135,7 @@
                      f"Model {self.model_name} is not supported by this library."
                      f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
                  )
+             config = config[0]

              self.recurrent = config["recurrent"]
              self.need_scaling = config["need_scaling"]
@@ -147,7 +149,7 @@
              else:
                  self.scaler_y = None

-         self.path = path
+         self.threshold = None

      def fit(self, *args):
          if self.recurrent:
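`ModelEngine` now stores `path` and calls `load()` itself during construction, which is what allowed the explicit `model.load()` call in api.py above to be dropped. A sketch of the new calling convention (the path value is a placeholder, not a layout documented in this diff):

```python
from lecrapaud.model_selection import ModelEngine

# 0.2.1: construct, then load explicitly
#   model = ModelEngine(path=training_target_dir)
#   model.load()

# 0.3.0: passing a path triggers load() inside __init__
model = ModelEngine(path="training/TARGET_1")  # placeholder path
if model.recurrent:
    pass  # recurrent models expect the reshaped, sequence-style inputs
```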
@@ -629,9 +631,6 @@
          self.model_name = self._model.model_name
          self.target_type = self._model.target_type

-     def __getattr__(self, attr):
-         return getattr(self._model, attr)
-

  def trainable(
      params,
@@ -778,20 +777,20 @@ ModelSelectionEngine:
              raise ValueError("Please provide a dataset.")

          if self.data:
-             self.train = self.data["train"]
-             self.val = self.data["val"]
-             self.test = self.data["test"]
-             self.train_scaled = self.data["train_scaled"]
-             self.val_scaled = self.data["val_scaled"]
-             self.test_scaled = self.data["test_scaled"]
+             train = self.data["train"]
+             val = self.data["val"]
+             test = self.data["test"]
+             train_scaled = self.data["train_scaled"]
+             val_scaled = self.data["val_scaled"]
+             test_scaled = self.data["test_scaled"]
          else:
              (
-                 self.train,
-                 self.val,
-                 self.test,
-                 self.train_scaled,
-                 self.val_scaled,
-                 self.test_scaled,
+                 train,
+                 val,
+                 test,
+                 train_scaled,
+                 val_scaled,
+                 test_scaled,
              ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)

          if (
@@ -810,10 +809,12 @@
                  raise ValueError("reshaped_data is not provided.")

              logger.info("Loading reshaped data...")
-             self.x_train_reshaped = self.reshaped_data["x_train_reshaped"]
-             self.y_train_reshaped = self.reshaped_data["y_train_reshaped"]
-             self.x_val_reshaped = self.reshaped_data["x_val_reshaped"]
-             self.y_val_reshaped = self.reshaped_data["y_val_reshaped"]
+             x_train_reshaped = self.reshaped_data["x_train_reshaped"]
+             y_train_reshaped = self.reshaped_data["y_train_reshaped"]
+             x_val_reshaped = self.reshaped_data["x_val_reshaped"]
+             y_val_reshaped = self.reshaped_data["y_val_reshaped"]
+             x_test_reshaped = self.reshaped_data["x_test_reshaped"]
+             y_test_reshaped = self.reshaped_data["y_test_reshaped"]

          # create model selection in db
          target = Target.find_by(name=f"TARGET_{self.target_number}")
@@ -864,31 +865,33 @@
                  if e in set(self.features)
              ]
              # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
-             self.x_train = self.x_train_reshaped[:, :, features_idx]
-             self.y_train = self.y_train_reshaped[:, [self.target_number, 0]]
-             self.x_val = self.x_val_reshaped[:, :, features_idx]
-             self.y_val = self.y_val_reshaped[:, [self.target_number, 0]]
+             x_train = x_train_reshaped[:, :, features_idx]
+             y_train = y_train_reshaped[:, [self.target_number, 0]]
+             x_val = x_val_reshaped[:, :, features_idx]
+             y_val = y_val_reshaped[:, [self.target_number, 0]]
+             x_test = x_test_reshaped[:, :, features_idx]
+             y_test = y_test_reshaped[:, [self.target_number, 0]]
          else:
              config = config[self.target_type]

              if need_scaling and self.target_type == "regression":
-                 self.x_train = self.train_scaled[self.features]
-                 self.y_train = self.train_scaled[
-                     f"TARGET_{self.target_number}"
-                 ].rename("TARGET")
-                 self.x_val = self.val_scaled[self.features]
-                 self.y_val = self.val_scaled[f"TARGET_{self.target_number}"].rename(
+                 x_train = train_scaled[self.features]
+                 y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
                      "TARGET"
                  )
-             else:
-                 self.x_train = self.train[self.features]
-                 self.y_train = self.train[f"TARGET_{self.target_number}"].rename(
-                     "TARGET"
-                 )
-                 self.x_val = self.val[self.features]
-                 self.y_val = self.val[f"TARGET_{self.target_number}"].rename(
+                 x_val = val_scaled[self.features]
+                 y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
+                 x_test = test_scaled[self.features]
+                 y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
                      "TARGET"
                  )
+             else:
+                 x_train = train[self.features]
+                 y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
+                 x_val = val[self.features]
+                 y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
+                 x_test = test[self.features]
+                 y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")

          log_dir = get_log_dir(self.training_target_dir, model_name)
          # instantiate model
@@ -904,7 +907,7 @@
          start = time.time()
          # Tuning hyperparameters
          if perform_hyperopt:
-             best_params = self.hyperoptimize(model)
+             best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)

              # save best params
              best_params_file = f"{self.training_target_dir}/best_params.json"
@@ -929,8 +932,8 @@

          # Perform cross-validation of the best model on k-folds of train + val set
          if perform_crossval:
-             x_train_val = pd.concat([self.x_train, self.x_val, self.x_test], axis=0)
-             y_train_val = pd.concat([self.y_train, self.y_val, self.y_test], axis=0)
+             x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
+             y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
              n_splits = 4
              n_samples = len(x_train_val)
              test_size = int(n_samples / (n_splits + 4))
@@ -943,7 +946,7 @@
                  self.type_name = f"crossval_fold_{i}"

                  if self.time_series:
-                     date_series = self.train[self.date_column].copy()
+                     date_series = train[self.date_column].copy()

                      if need_scaling:
                          date_series = date_series.map(pd.Timestamp.fromordinal)
@@ -997,10 +1000,10 @@
              # Retrain on entire training set, but keep score on cross-validation folds
              best_score, best_model, best_pred = self.train_model(
                  params=best_params,
-                 x_train=pd.concat([self.x_train, self.x_val], axis=0),
-                 y_train=pd.concat([self.y_train, self.y_val], axis=0),
-                 x_val=self.x_test,
-                 y_val=self.y_test,
+                 x_train=pd.concat([x_train, x_val], axis=0),
+                 y_train=pd.concat([y_train, y_val], axis=0),
+                 x_val=x_test,
+                 y_val=y_test,
                  model=model,
              )
              best_score = cross_validation_mean_score
@@ -1009,10 +1012,10 @@
              self.type_name = "validation"
              best_score, best_model, best_pred = self.train_model(
                  params=best_params,
-                 x_train=pd.concat([self.x_train, self.x_val], axis=0),
-                 y_train=pd.concat([self.y_train, self.y_val], axis=0),
-                 x_val=self.x_test,
-                 y_val=self.y_test,
+                 x_train=pd.concat([x_train, x_val], axis=0),
+                 y_train=pd.concat([y_train, y_val], axis=0),
+                 x_val=x_test,
+                 y_val=y_test,
                  model=model,
              )

@@ -1114,7 +1117,7 @@

          logger.info(f"Best model overall is : {best_score_overall}")

-     def hyperoptimize(self, model: ModelEngine):
+     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
          self.type_name = "hyperopts"

          def collect_error_logs(training_target_dir: int, storage_path: str):
@@ -1140,10 +1143,10 @@
          tuner = Tuner(
              trainable=with_parameters(
                  trainable,
-                 x_train=self.x_train,
-                 y_train=self.y_train,
-                 x_val=self.x_val,
-                 y_val=self.y_val,
+                 x_train=x_train,
+                 y_train=y_train,
+                 x_val=x_val,
+                 y_val=y_val,
                  model_name=model.model_name,
                  target_type=self.target_type,
                  session_name=self.session_name,
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/lecrapaud/utils.py
@@ -10,7 +10,7 @@ import unicodedata
  import re
  import string

- from lecrapaud.directory_management import logger_dir
+ from lecrapaud.directories import logger_dir
  from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV

  _LOGGER_ALREADY_CONFIGURED = False
{lecrapaud-0.2.1 → lecrapaud-0.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "lecrapaud"
- version = "0.2.1"
+ version = "0.3.0"
  description = "Framework for machine and deep learning, with regression, classification and time series analysis"
  authors = [
      {name = "Pierre H. Gallet"}