lecrapaud 0.21.1__py3-none-any.whl → 0.21.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

lecrapaud/config.py CHANGED
@@ -34,5 +34,5 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
34
34
  LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
35
35
  LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
36
36
  LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
37
- "LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
37
+ "LECRAPAUD_OPTIMIZATION_BACKEND", "hyperopt"
38
38
  ).lower()
@@ -278,24 +278,32 @@ class FeatureSelectionEngine:
278
278
 
279
279
  features_selected_list = features_selected["features"].values.tolist()
280
280
 
281
- # Save ensemble features before correlation (aggregated features)
282
- logger.info("Saving ensemble features before correlation...")
283
- all_features_in_data = self.X.columns.tolist()
281
+ # Save ensemble features for all numerical features with global ranking
282
+ logger.info("Saving ensemble features with global ranking for all numerical features...")
283
+ numerical_features_in_data = self.X_numerical.columns.tolist()
284
284
  ensemble_rows = []
285
285
 
286
- # Add global rank for selected features
287
- features_selected_with_global_rank = features_selected.copy()
288
- features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
286
+ # Create global ranking for ALL numerical features (1 to n, no null values)
287
+ all_numerical_scores = pd.concat(results, axis=0)
288
+ all_numerical_scores = all_numerical_scores.groupby("features").agg({
289
+ "rank": "mean" # Average rank across all methods
290
+ }).reset_index()
291
+ all_numerical_scores.sort_values("rank", inplace=True)
292
+ all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
289
293
 
290
- for feature in all_features_in_data:
294
+ for feature in numerical_features_in_data:
291
295
  feature_id = feature_map.get(feature)
292
296
  if feature_id:
293
297
  is_selected = feature in features_selected_list
294
- global_rank = None
295
- if is_selected:
296
- global_rank = features_selected_with_global_rank[
297
- features_selected_with_global_rank["features"] == feature
298
+
299
+ # Get global rank (no null values - all features get a rank)
300
+ if feature in all_numerical_scores["features"].values:
301
+ global_rank = all_numerical_scores[
302
+ all_numerical_scores["features"] == feature
298
303
  ]["global_rank"].values[0]
304
+ else:
305
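+ # Fallback for features missing from results: rank them after every scored feature, offset by their position in the numerical column list (ranks stay unique but may not be contiguous)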
306
+ global_rank = len(all_numerical_scores) + numerical_features_in_data.index(feature) + 1
299
307
 
300
308
  ensemble_rows.append({
301
309
  "feature_selection_id": feature_selection.id,
@@ -353,28 +361,12 @@ class FeatureSelectionEngine:
353
361
  )
354
362
 
355
363
  # Final update for features after max limitation (final selection)
356
- logger.info("Finalizing ensemble features with categorical features...")
364
+ logger.info("Finalizing ensemble features...")
357
365
  for row in ensemble_rows:
358
366
  feature = Feature.get(row["feature_id"]).name
359
367
  if feature in features and row["support"] == 1:
360
368
  row["support"] = 2 # 2 = in final selection
361
369
 
362
- # Add categorical features to ensemble if not already present
363
- if target_type == "classification":
364
- for cat_feature in categorical_features_selected:
365
- feature_id = feature_map.get(cat_feature)
366
- if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
367
- ensemble_rows.append({
368
- "feature_selection_id": feature_selection.id,
369
- "feature_id": feature_id,
370
- "method": "ensemble",
371
- "score": None,
372
- "pvalue": None,
373
- "support": 2, # 2 = in final selection (categorical)
374
- "rank": None, # No rank for categorical features added at the end
375
- "training_time": 0,
376
- })
377
-
378
370
  # Re-save all ensemble data with updated support values
379
371
  FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
380
372
  logger.debug(
@@ -55,8 +55,7 @@ from tensorboardX import SummaryWriter
55
55
 
56
56
  # Optimization
57
57
  import ray
58
- from ray.tune import Tuner, TuneConfig, with_parameters
59
- from ray.train import RunConfig
58
+ from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
60
59
  from ray.tune.search.hyperopt import HyperOptSearch
61
60
  from ray.tune.search.bayesopt import BayesOptSearch
62
61
  from ray.tune.logger import TBXLoggerCallback
@@ -1357,8 +1356,12 @@ class ModelSelectionEngine:
1357
1356
  """Choose between Ray Tune and HyperOpt standalone based on configuration."""
1358
1357
  if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
1359
1358
  return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
1360
- else:
1359
+ elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
1361
1360
  return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
1361
+ else:
1362
+ raise ValueError(
1363
+ f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
1364
+ )
1362
1365
 
1363
1366
  def hyperoptimize_hyperopt(
1364
1367
  self, x_train, y_train, x_val, y_val, model: ModelEngine
@@ -1746,11 +1749,11 @@ def evaluate(
1746
1749
  y_pred_proba = (
1747
1750
  prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
1748
1751
  )
1749
- if num_classes > 2:
1750
- lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1751
- lb.fit(labels)
1752
- y_true_onhot = lb.transform(y_true)
1753
- y_pred_onehot = lb.transform(y_pred)
1752
+ # if num_classes > 2:
1753
+ # lb = LabelBinarizer(sparse_output=False) # Change to True for sparse matrix
1754
+ # lb.fit(labels)
1755
+ # y_true_onhot = lb.transform(y_true)
1756
+ # y_pred_onehot = lb.transform(y_pred)
1754
1757
 
1755
1758
  score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
1756
1759
  score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lecrapaud
3
- Version: 0.21.1
3
+ Version: 0.21.2
4
4
  Summary: Framework for machine and deep learning, with regression, classification and time series analysis
5
5
  License: Apache License
6
6
  License-File: LICENSE
@@ -218,7 +218,11 @@ context = {
218
218
  "val_size": 0.2,
219
219
  "test_size": 0.2,
220
220
  "pca_temporal": [
221
- {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
221
+ # Old format (still supported)
222
+ # {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
223
+ # New simplified format - automatically creates lag columns
224
+ {"name": "LAST_20_RET", "column": "RET", "lags": 20},
225
+ {"name": "LAST_10_VOL", "column": "VOLUME", "lags": 10},
222
226
  ],
223
227
  "pca_cross_sectional": [
224
228
  {
@@ -255,11 +259,20 @@ experiment = app.create_experiment(data=your_dataframe, **context)
255
259
 
256
260
  2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
257
261
 
258
- 3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
262
+ 3. **PCA Time Series**:
263
+ - For time series data, both `pca_cross_sectional` and `pca_temporal` automatically use an expanding window approach with periodic refresh (default: every 90 days) to prevent data leakage.
264
+ - The system fits PCA only on historical data (lookback window of 365 days by default) and avoids look-ahead bias.
265
+ - For panel data (e.g., multiple stocks), lag features are created per group when using the simplified `pca_temporal` format.
266
+ - Missing PCA values are handled with forward-fill followed by zero-fill to ensure compatibility with downstream models.
259
267
 
260
- 4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
268
+ 4. **PCA Temporal Simplified Format**:
269
+ - Old format: list every lag column manually, e.g. `{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ..., "RET_-20"]}`.
270
+ - Simplified format: give the base column and the number of lags, e.g. `{"name": "LAST_20_RET", "column": "RET", "lags": 20}`.
271
+ - The system creates the lag columns automatically; for panel data, lags are created per group using `group_column`.
261
272
 
262
- 5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
273
+ 5. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
274
+
275
+ 6. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
263
276
 
264
277
 
265
278
 
@@ -1,6 +1,6 @@
1
1
  lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
2
2
  lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
3
- lecrapaud/config.py,sha256=0NEg61QdLxQ97bVFDDXa6OwlWFEo_z8VIhX5KrD1ik0,1170
3
+ lecrapaud/config.py,sha256=7kwV9kpglFX79YC3fKcANawWJMYYi7SGaVShNsmO4EQ,1175
4
4
  lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
5
5
  lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
6
6
  lecrapaud/db/alembic/env.py,sha256=RvTTBa3bDVBxmDtapAfzUoeWBgmVQU3s9U6HmQCAP84,2421
@@ -31,7 +31,7 @@ lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
31
31
  lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
32
32
  lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
33
33
  lecrapaud/feature_engineering.py,sha256=SvGrJXv24rVgH0QE5mRwJITcCLfUqgbV2Ep68bBVnJs,58794
34
- lecrapaud/feature_selection.py,sha256=Q9xWVmZsvRjX9mJHB_PY_KLXsEAYNLX7txSe0cniY4A,47529
34
+ lecrapaud/feature_selection.py,sha256=vzL-eklVZl-tHIwqTy4Yg9kYpwOTCoM72IrFoJyDmg8,47203
35
35
  lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
36
36
  lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
37
37
  lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
@@ -41,10 +41,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
41
41
  lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
42
42
  lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
43
43
  lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
44
- lecrapaud/model_selection.py,sha256=o4_hOEp91_33HtMatVHU7YPc71KZ2hK7wucN63xqWkA,88017
44
+ lecrapaud/model_selection.py,sha256=VL-JQGY-dRsFgfuRob_-lt9bFyex_PyAUJsPHqr453c,88187
45
45
  lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
46
46
  lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
47
- lecrapaud-0.21.1.dist-info/METADATA,sha256=rKls8xvjhu9f72jTw2sjBYCmQPw-N02RSScSOjJ1E2g,14348
48
- lecrapaud-0.21.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
- lecrapaud-0.21.1.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
- lecrapaud-0.21.1.dist-info/RECORD,,
47
+ lecrapaud-0.21.2.dist-info/METADATA,sha256=5e2V3i21uDdh9fnrdc5MZUQ7EZMJgR8mRKnvGzqwmZw,15337
48
+ lecrapaud-0.21.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
49
+ lecrapaud-0.21.2.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
50
+ lecrapaud-0.21.2.dist-info/RECORD,,