lecrapaud 0.2.0__tar.gz → 0.3.0__tar.gz

This diff shows the content of publicly released package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of lecrapaud has been flagged as potentially problematic.
Files changed (64)
  1. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/PKG-INFO +27 -20
  2. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/README.md +26 -19
  3. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/api.py +8 -2
  4. lecrapaud-0.3.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +50 -0
  5. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/session.py +11 -0
  6. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/experiment.py +1 -1
  7. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/feature_engineering.py +11 -12
  8. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/feature_selection.py +29 -48
  9. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/model_selection.py +59 -59
  10. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/utils.py +1 -1
  11. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/pyproject.toml +1 -1
  12. lecrapaud-0.2.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +0 -38
  13. lecrapaud-0.2.0/lecrapaud/predictions.py +0 -292
  14. lecrapaud-0.2.0/lecrapaud/preprocessing.py +0 -984
  15. lecrapaud-0.2.0/lecrapaud/training.py +0 -239
  16. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/LICENSE +0 -0
  17. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/__init__.py +0 -0
  18. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/config.py +0 -0
  19. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/__init__.py +0 -0
  20. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/README +0 -0
  21. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/env.py +0 -0
  22. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/script.py.mako +0 -0
  23. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +0 -0
  24. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +0 -0
  25. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +0 -0
  26. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +0 -0
  27. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +0 -0
  28. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +0 -0
  29. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +0 -0
  30. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +0 -0
  31. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +0 -0
  32. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +0 -0
  33. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +0 -0
  34. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +0 -0
  35. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +0 -0
  36. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +0 -0
  37. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +0 -0
  38. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/__init__.py +0 -0
  39. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/base.py +0 -0
  40. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/dataset.py +0 -0
  41. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature.py +0 -0
  42. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection.py +0 -0
  43. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  44. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model.py +0 -0
  45. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model_selection.py +0 -0
  46. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/model_training.py +0 -0
  47. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/score.py +0 -0
  48. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/db/models/target.py +0 -0
  49. /lecrapaud-0.2.0/lecrapaud/directory_management.py → /lecrapaud-0.3.0/lecrapaud/directories.py +0 -0
  50. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/integrations/openai_integration.py +0 -0
  51. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/__init__.py +0 -0
  52. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/config.py +0 -0
  53. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/scheduler.py +0 -0
  54. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/jobs/tasks.py +0 -0
  55. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/search_space.py +0 -0
  56. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/__init__.py +0 -0
  57. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/embedding_categorical.py +0 -0
  58. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/services/indicators.py +0 -0
  59. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/experiments.py +0 -0
  60. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-bilstm.ipynb +0 -0
  61. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-resnet.ipynb +0 -0
  62. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/test-gpu-transformers.ipynb +0 -0
  63. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/tests.ipynb +0 -0
  64. {lecrapaud-0.2.0 → lecrapaud-0.3.0}/lecrapaud/speed_tests/trash.py +0 -0

--- lecrapaud-0.2.0/PKG-INFO
+++ lecrapaud-0.3.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: lecrapaud
-Version: 0.2.0
+Version: 0.3.0
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 Author: Pierre H. Gallet
@@ -49,7 +49,7 @@ Description-Content-Type: text/markdown
 
 <div align="center">
 
-# 🐸
+<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
 
 ## Welcome to LeCrapaud
 
@@ -72,34 +72,21 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
 
 ## ⚡ Quick Start
 
-1. Create environment
-
-```sh
-$ pip install virtualenv
-$ python -m venv .venv
-$ source .venv/bin/activate
-```
-
-2. Install dependencies
+### Install the package
 
 ```sh
-$ make install
+pip install lecrapaud
 ```
 
-3. Deactivate virtualenv (if needed)
-
-```sh
-$ deactivate
-```
-
-## 🛠️ How it works
+### How it works
 
 This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
 
 ### Typical workflow
 
 ```python
-from lecrapaud.api import LeCrapaud
+from lecrapaud import LeCrapaud
 
 # 1. Create the main app
 app = LeCrapaud()
@@ -159,6 +146,26 @@ $ git push -u origin master
 3. Use conventional commits
 https://www.conventionalcommits.org/en/v1.0.0/#summary
 
+4. Create environment
+
+```sh
+$ pip install virtualenv
+$ python -m venv .venv
+$ source .venv/bin/activate
+```
+
+5. Install dependencies
+
+```sh
+$ make install
+```
+
+6. Deactivate virtualenv (if needed)
+
+```sh
+$ deactivate
+```
+
 ---
 
-Pierre Gallet © 2024
+Pierre Gallet © 2025

--- lecrapaud-0.2.0/README.md
+++ lecrapaud-0.3.0/README.md
@@ -1,6 +1,6 @@
 <div align="center">
 
-# 🐸
+<img src="https://s3.amazonaws.com/pix.iemoji.com/images/emoji/apple/ios-12/256/frog-face.png" width=120 alt="crapaud"/>
 
 ## Welcome to LeCrapaud
 
@@ -23,34 +23,21 @@ LeCrapaud is a high-level Python library for end-to-end machine learning workflo
 
 ## ⚡ Quick Start
 
-1. Create environment
-
-```sh
-$ pip install virtualenv
-$ python -m venv .venv
-$ source .venv/bin/activate
-```
-
-2. Install dependencies
+### Install the package
 
 ```sh
-$ make install
+pip install lecrapaud
 ```
 
-3. Deactivate virtualenv (if needed)
-
-```sh
-$ deactivate
-```
-
-## 🛠️ How it works
+### How it works
 
 This package provides a high-level API to manage experiments for feature engineering, model selection, and prediction on tabular data (e.g. stock data).
 
 ### Typical workflow
 
 ```python
-from lecrapaud.api import LeCrapaud
+from lecrapaud import LeCrapaud
 
 # 1. Create the main app
 app = LeCrapaud()
@@ -110,6 +97,26 @@ $ git push -u origin master
 3. Use conventional commits
 https://www.conventionalcommits.org/en/v1.0.0/#summary
 
+4. Create environment
+
+```sh
+$ pip install virtualenv
+$ python -m venv .venv
+$ source .venv/bin/activate
+```
+
+5. Install dependencies
+
+```sh
+$ make install
+```
+
+6. Deactivate virtualenv (if needed)
+
+```sh
+$ deactivate
+```
+
 ---
 
-Pierre Gallet © 2024
+Pierre Gallet © 2025
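
Both the README and its PKG-INFO mirror now document the root-level import instead of `lecrapaud.api`. Since `lecrapaud/__init__.py` is unchanged in this release, the old path presumably still resolves and only the documented entry point moved; a minimal sketch of the new quick start:

```python
# Minimal sketch of the documented 0.3.0 entry point. The older
# `from lecrapaud.api import LeCrapaud` presumably still works, since
# api.py remains in the package; only the docs changed.
from lecrapaud import LeCrapaud

app = LeCrapaud()  # the main app object, as in the README workflow
```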
--- lecrapaud-0.2.0/lecrapaud/api.py
+++ lecrapaud-0.3.0/lecrapaud/api.py
@@ -32,6 +32,8 @@ experiment.model_selection(data) : return best_model
 
 import joblib
 import pandas as pd
+import logging
+from lecrapaud.utils import logger
 from lecrapaud.db.session import init_db
 from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
 from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine
@@ -103,7 +105,12 @@ class Experiment:
         std_data, reshaped_data = self.preprocess_model(train, val, test)
         self.model_selection(std_data, reshaped_data)
 
-    def predict(self, new_data):
+    def predict(self, new_data, verbose: int = 0):
+        if verbose == 0:
+            logger.setLevel(logging.WARNING)
+
+        logger.warning("Running prediction...")
+
         data = self.feature_engineering(
             data=new_data,
             for_training=False,
@@ -127,7 +134,6 @@
         else:
             features = self.dataset.get_features(target_number)
             model = ModelEngine(path=training_target_dir)
-            model.load()
 
         # getting data
         if model.recurrent:
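
The new `verbose` flag on `Experiment.predict` suppresses the package logger below WARNING when left at its default of 0. A self-contained sketch of the same pattern (using a generic demo logger, not lecrapaud's):

```python
# Generic demo of the logging pattern behind the new verbose flag.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

def predict(new_data, verbose: int = 0):
    if verbose == 0:
        # As in the diff: setLevel is a persistent, process-wide change,
        # so INFO/DEBUG output stays off for subsequent calls as well.
        logger.setLevel(logging.WARNING)
    logger.warning("Running prediction...")  # always emitted
    logger.info("engineering features...")   # emitted only while level <= INFO
    return new_data

predict([1, 2, 3], verbose=1)  # shows both messages
predict([1, 2, 3])             # shows only the warning
```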
--- /dev/null
+++ lecrapaud-0.3.0/lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py
@@ -0,0 +1,50 @@
+"""make_nullablee
+
+Revision ID: 52b809a34371
+Revises: 339927587383
+Create Date: 2025-05-31 18:34:58.962966
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+
+# revision identifiers, used by Alembic.
+revision: str = "52b809a34371"
+down_revision: Union[str, None] = "339927587383"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column(
+        "investment_runs",
+        "initial_portfolio",
+        existing_type=mysql.JSON(),
+        nullable=True,
+    )
+    op.create_foreign_key(
+        None,
+        "portfolios",
+        "investment_runs",
+        ["investment_run_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_constraint(None, "portfolios", type_="foreignkey")
+    op.alter_column(
+        "investment_runs",
+        "initial_portfolio",
+        existing_type=mysql.JSON(),
+        nullable=False,
+    )
+    # ### end Alembic commands ###
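
One caveat worth flagging in this autogenerated migration: `op.create_foreign_key(None, ...)` lets the database pick the constraint name, but `op.drop_constraint(None, ...)` in `downgrade()` then has no name to drop by, so the downgrade likely fails as written. A hedged sketch of the usual fix, with an invented constraint name:

```python
# Hypothetical rework with an explicit FK name so downgrade() can
# reference it. "fk_portfolios_investment_run_id" is illustrative,
# not a name used by the package.
from alembic import op

FK_NAME = "fk_portfolios_investment_run_id"

def upgrade() -> None:
    op.create_foreign_key(
        FK_NAME,  # explicit name instead of None
        "portfolios",
        "investment_runs",
        ["investment_run_id"],
        ["id"],
        ondelete="CASCADE",
    )

def downgrade() -> None:
    # With a named constraint, the drop is unambiguous.
    op.drop_constraint(FK_NAME, "portfolios", type_="foreignkey")
```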
--- lecrapaud-0.2.0/lecrapaud/db/session.py
+++ lecrapaud-0.3.0/lecrapaud/db/session.py
@@ -4,6 +4,9 @@ from contextlib import contextmanager
 from sqlalchemy import create_engine, text
 from sqlalchemy.orm import sessionmaker
 from urllib.parse import urlparse
+from alembic.config import Config
+from alembic import command
+import os
 
 from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_URI
 
@@ -39,6 +42,14 @@ def init_db(uri: str = None):
     # Step 4: Create session factory
     _SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_engine)
 
+    # Step 5: Apply Alembic migrations programmatically
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+    alembic_cfg_path = os.path.join(project_root, "alembic.ini")
+
+    alembic_cfg = Config(alembic_cfg_path)
+    alembic_cfg.set_main_option("sqlalchemy.url", uri or os.getenv("DATABASE_URL"))
+    command.upgrade(alembic_cfg, "head")
+
 
 # Dependency to get a session instance
 @contextmanager
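
With this change, `init_db()` upgrades the schema to head on every call. Note that it resolves `alembic.ini` two directories above `lecrapaud/db/`, i.e. the install root, so the file has to be present alongside the installed package. A standalone sketch of the same programmatic upgrade (the config path and the SQLite fallback URL are illustrative assumptions, not package defaults):

```python
# Standalone equivalent of the new "Step 5" in init_db().
import os

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # must point at a real Alembic config file
cfg.set_main_option(
    "sqlalchemy.url", os.getenv("DATABASE_URL", "sqlite:///demo.db")
)
command.upgrade(cfg, "head")  # apply every pending migration up to head
```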
--- lecrapaud-0.2.0/lecrapaud/experiment.py
+++ lecrapaud-0.3.0/lecrapaud/experiment.py
@@ -5,7 +5,7 @@ from pathlib import Path
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
 # Internal
-from lecrapaud.directory_management import tmp_dir
+from lecrapaud.directories import tmp_dir
 from lecrapaud.utils import logger
 from lecrapaud.config import PYTHON_ENV
 from lecrapaud.db import (

--- lecrapaud-0.2.0/lecrapaud/feature_engineering.py
+++ lecrapaud-0.3.0/lecrapaud/feature_engineering.py
@@ -101,7 +101,7 @@ class FeatureEngineeringEngine:
 
     def run(self) -> pd.DataFrame:
         # drop columns
-        self.data = self.data.drop(columns=self.columns_drop)
+        self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
 
         # convert object columns to numeric if possible
         self.data = convert_object_columns_that_are_numeric(self.data)
@@ -324,6 +324,8 @@ class PreprocessFeature:
         **kwargs,
     ):
         self.data = data
+        self.data.columns = self.data.columns.str.upper()
+
         self.dataset = dataset
         self.columns_pca = columns_pca
         self.columns_onehot = columns_onehot
@@ -350,7 +352,7 @@
             self.train_val_test_split_time_series()
             if self.time_series
             else self.train_val_test_split(
-                stratify_col=f"target_{self.target_numbers[0]}"
+                stratify_col=f"TARGET_{self.target_numbers[0]}"
             )
         )  # TODO: only stratifying first target for now
@@ -359,8 +361,7 @@
         val, _ = self.add_pca_features(test, pcas=pcas)
         test, _ = self.add_pca_features(val, pcas=pcas)
 
-        if PYTHON_ENV != "Test":
-            joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+        joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
 
         # Encoding
         train, transformer = self.encode_categorical_features(train)
@@ -373,11 +374,10 @@
             transformer=transformer,
         )
 
-        if PYTHON_ENV != "Test":
-            joblib.dump(self.data, f"{self.data_dir}/full.pkl")
-            joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
-            summary = summarize_dataframe(train)
-            summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
+        joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+        joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+        summary = summarize_dataframe(train)
+        summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
 
         return train, val, test
 
@@ -579,8 +579,8 @@
         columns_ordinal: list[str] = self.columns_ordinal
         columns_frequency: list[str] = self.columns_frequency
 
-        X = df.loc[:, ~df.columns.str.contains("^target_")]
-        y = df.loc[:, df.columns.str.contains("^target_")]
+        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+        y = df.loc[:, df.columns.str.contains("^TARGET_")]
         save_in_db = False
 
         all_columns = (
@@ -643,7 +643,6 @@
 
         # Try to convert columns to best possible dtypes
         X_transformed = X_transformed.convert_dtypes()
-        X_transformed.columns = X_transformed.columns.str.upper()
 
         # Insert features in db
         if save_in_db:
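
Several of these hunks follow one theme: column names are upper-cased once at ingestion (`str.upper()` in `PreprocessFeature.__init__`, replacing the late rename after transformation), so downstream code can match the single pattern `^TARGET_`, and `drop(..., errors="ignore")` keeps re-runs from failing when columns are already gone. A self-contained illustration with made-up column names:

```python
# Illustration of the naming scheme after this change; the column
# names are invented for the demo.
import pandas as pd

df = pd.DataFrame({"close": [1.0, 2.0], "target_1": [0, 1]})
df.columns = df.columns.str.upper()                # done once, up front

df = df.drop(columns=["VOLUME"], errors="ignore")  # no KeyError if absent

X = df.loc[:, ~df.columns.str.contains("^TARGET_")]  # feature columns
y = df.loc[:, df.columns.str.contains("^TARGET_")]   # target columns
print(list(X.columns), list(y.columns))  # ['CLOSE'] ['TARGET_1']
```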
--- lecrapaud-0.2.0/lecrapaud/feature_selection.py
+++ lecrapaud-0.3.0/lecrapaud/feature_selection.py
@@ -37,7 +37,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from scipy.stats import spearmanr, kendalltau
 
 # Internal
-from lecrapaud.directory_management import tmp_dir, clean_directory
+from lecrapaud.directories import tmp_dir, clean_directory
 from lecrapaud.utils import logger
 from lecrapaud.config import PYTHON_ENV
 from lecrapaud.db import (
@@ -50,10 +50,6 @@ from lecrapaud.db import (
 from lecrapaud.db.session import get_db
 from lecrapaud.search_space import all_models
 
-# Variables for targets handling
-TARGETS_MCLF = [11]
-GROUPING_COLUMN = "STOCK"
-
 # Annoying Warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 
@@ -103,7 +99,7 @@
     # Main feature selection function
     def run(
         self,
-        single_process: bool = False,
+        single_process: bool = True,
     ):
         """Function to do feature selection with a range of different feature selection technics
 
@@ -114,10 +110,7 @@
         """
         target_number = self.target_number
         target_type = self.target_type
-        if PYTHON_ENV != "Test":
-            fs_dir_target = self.fs_dir_target
-        else:
-            fs_dir_target = None
+        fs_dir_target = self.fs_dir_target
 
         # Create the feature selection in db
         target = Target.find_by(name=f"TARGET_{target_number}")
@@ -162,7 +155,7 @@
         # handling categorical features (only if classification)
         self.X_categorical, self.X_numerical = get_features_by_types(self.X)
 
-        if target_type == "classification":
+        if target_type == "classification" and self.X_categorical.shape[1] > 0:
             feat_scores = self.select_categorical_features(
                 percentile=percentile, save_dir=fs_dir_target
             )
@@ -292,24 +285,22 @@
             f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
         )
         logger.debug(features_selected_by_every_methods)
-        if PYTHON_ENV != "Test":
-            pd.Series(features_selected_list).to_csv(
-                f"{fs_dir_target}/features_before_corr.csv",
-                index=True,
-                header=True,
-                index_label="ID",
-            )
+        pd.Series(features_selected_list).to_csv(
+            f"{fs_dir_target}/features_before_corr.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )
 
         # removing correlated features
         self.X = self.X[features_selected_list]
         features, features_correlated = self.remove_correlated_features(corr_threshold)
-        if PYTHON_ENV != "Test":
-            pd.Series(features).to_csv(
-                f"{fs_dir_target}/features_before_max.csv",
-                index=True,
-                header=True,
-                index_label="ID",
-            )
+        pd.Series(features).to_csv(
+            f"{fs_dir_target}/features_before_max.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )
         features = features[:max_features]
 
         # adding categorical features selected
@@ -337,8 +328,7 @@
         best_features_path = Path(
             f"{self.preprocessing_dir}/features_{target_number}.pkl"
         ).resolve()
-        if PYTHON_ENV != "Test":
-            joblib.dump(features, best_features_path)
+        joblib.dump(features, best_features_path)
 
         # save in db
         db_features = Feature.filter(name__in=features)
@@ -798,6 +788,7 @@ class PreprocessModel:
 
         self.dataset_dir = dataset.path
         self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
 
         self.all_features = dataset.get_all_features(
             date_column=date_column, group_column=group_column
@@ -819,31 +810,23 @@
 
     def run(self):
         # save data
-        if PYTHON_ENV != "Test":
-            joblib.dump(self.train, f"{self.data_dir}/train.pkl")
-            joblib.dump(self.val, f"{self.data_dir}/val.pkl")
-            joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-            preprocessing_dir = f"{self.dataset_dir}/preprocessing"
-        else:
-            preprocessing_dir = None
+        joblib.dump(self.train, f"{self.data_dir}/train.pkl")
+        joblib.dump(self.val, f"{self.data_dir}/val.pkl")
+        joblib.dump(self.test, f"{self.data_dir}/test.pkl")
 
         # scaling features
         if any(t not in self.target_clf for t in self.target_numbers) and any(
             all_models[i].get("need_scaling") for i in self.models_idx
         ):
             logger.info("Scaling features...")
-            train_scaled, scaler_x, scalers_y = self.scale_data(
-                self.train, save_dir=preprocessing_dir
-            )
+            train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
             val_scaled, _, _ = self.scale_data(
                 self.val,
-                save_dir=preprocessing_dir,
                 scaler_x=scaler_x,
                 scalers_y=scalers_y,
             )
             test_scaled, _, _ = self.scale_data(
                 self.test,
-                save_dir=preprocessing_dir,
                 scaler_x=scaler_x,
                 scalers_y=scalers_y,
             )
@@ -853,10 +836,9 @@
             test_scaled = None
 
         # save data
-        if PYTHON_ENV != "Test":
-            joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
-            joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
-            joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
+        joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
+        joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
+        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
 
         data = {
             "train": self.train,
@@ -923,7 +905,6 @@
     def scale_data(
         self,
        df: pd.DataFrame,
-        save_dir: str,
         scaler_x=None,
         scalers_y: Optional[list] = None,
     ):
@@ -939,8 +920,7 @@
         X_scaled = pd.DataFrame(
             scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
         )
-        if save_dir:
-            joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
+        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
 
         # Determine which targets need to be scaled
         targets_numbers_to_scale = [
@@ -969,8 +949,9 @@
             columns=y.columns,
             index=y.index,
         )
-        if save_dir:
-            joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
+        joblib.dump(
+            scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
+        )
 
         scalers_y[f"scaler_y_{target_number}"] = scaler_y
         scaled_targets[target_number] = scaled_y
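
With `preprocessing_dir` now set in the constructor and the `save_dir` parameter dropped from `scale_data`, scalers are always persisted next to the dataset rather than skipped in test runs. A self-contained sketch of the fit/dump/load round-trip this enables at prediction time (the local `"scaler_x.pkl"` path stands in for `{preprocessing_dir}/scaler_x.pkl`; the data is invented):

```python
# Sketch of the scaler round-trip scale_data() now always performs:
# fit on train, persist with joblib, reload for inference.
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.DataFrame({"F1": [1.0, 2.0, 3.0], "F2": [10.0, 20.0, 30.0]})

scaler_x = StandardScaler()
train_scaled = pd.DataFrame(
    scaler_x.fit_transform(train), columns=list(train.columns), index=train.index
)
joblib.dump(scaler_x, "scaler_x.pkl")

# later, e.g. inside predict():
scaler_x = joblib.load("scaler_x.pkl")
new_scaled = scaler_x.transform(train)  # reuses the fitted mean/std, no refit
```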