lecrapaud 0.21.1__py3-none-any.whl → 0.21.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/config.py +1 -1
- lecrapaud/feature_selection.py +20 -28
- lecrapaud/model_selection.py +11 -8
- {lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/METADATA +18 -5
- {lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/RECORD +7 -7
- {lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/WHEEL +0 -0
- {lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/licenses/LICENSE +0 -0
lecrapaud/config.py
CHANGED
@@ -34,5 +34,5 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
 LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
 LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
-    "LECRAPAUD_OPTIMIZATION_BACKEND", "
+    "LECRAPAUD_OPTIMIZATION_BACKEND", "hyperopt"
 ).lower()
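For context on the config.py change above: with no environment override, the optimization backend now defaults to "hyperopt". A minimal sketch of how that default resolves, using only standard os.getenv behaviour (not lecrapaud-specific code):

import os

# Nothing set in the environment -> falls back to the new default "hyperopt".
backend = os.getenv("LECRAPAUD_OPTIMIZATION_BACKEND", "hyperopt").lower()
print(backend)  # hyperopt

# Exporting LECRAPAUD_OPTIMIZATION_BACKEND=ray before launch would switch this to "ray".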
lecrapaud/feature_selection.py
CHANGED
@@ -278,24 +278,32 @@ class FeatureSelectionEngine:
 
         features_selected_list = features_selected["features"].values.tolist()
 
-        # Save ensemble features
-        logger.info("Saving ensemble features
-
+        # Save ensemble features for all numerical features with global ranking
+        logger.info("Saving ensemble features with global ranking for all numerical features...")
+        numerical_features_in_data = self.X_numerical.columns.tolist()
         ensemble_rows = []
 
-        #
-
-
+        # Create global ranking for ALL numerical features (1 to n, no null values)
+        all_numerical_scores = pd.concat(results, axis=0)
+        all_numerical_scores = all_numerical_scores.groupby("features").agg({
+            "rank": "mean"  # Average rank across all methods
+        }).reset_index()
+        all_numerical_scores.sort_values("rank", inplace=True)
+        all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
 
-        for feature in
+        for feature in numerical_features_in_data:
             feature_id = feature_map.get(feature)
             if feature_id:
                 is_selected = feature in features_selected_list
-
-
-
-
+
+                # Get global rank (no null values - all features get a rank)
+                if feature in all_numerical_scores["features"].values:
+                    global_rank = all_numerical_scores[
+                        all_numerical_scores["features"] == feature
                     ]["global_rank"].values[0]
+                else:
+                    # Fallback: assign last rank + position for features not in results
+                    global_rank = len(all_numerical_scores) + numerical_features_in_data.index(feature) + 1
 
                 ensemble_rows.append({
                     "feature_selection_id": feature_selection.id,
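The hunk above builds a dense global ranking by averaging each feature's rank across selection methods. A toy, self-contained sketch of that aggregation with made-up feature names (not lecrapaud internals):

import pandas as pd

# One DataFrame of ranks per selection method, as in the `results` list above.
results = [
    pd.DataFrame({"features": ["x1", "x2", "x3"], "rank": [1, 2, 3]}),
    pd.DataFrame({"features": ["x1", "x2", "x3"], "rank": [2, 1, 3]}),
]

scores = pd.concat(results, axis=0)
scores = scores.groupby("features").agg({"rank": "mean"}).reset_index()  # average rank per feature
scores.sort_values("rank", inplace=True)
scores["global_rank"] = range(1, len(scores) + 1)  # dense 1..n ranking, no nulls
print(scores)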
@@ -353,28 +361,12 @@ class FeatureSelectionEngine:
         )
 
         # Final update for features after max limitation (final selection)
-        logger.info("Finalizing ensemble features
+        logger.info("Finalizing ensemble features...")
         for row in ensemble_rows:
             feature = Feature.get(row["feature_id"]).name
             if feature in features and row["support"] == 1:
                 row["support"] = 2  # 2 = in final selection
 
-        # Add categorical features to ensemble if not already present
-        if target_type == "classification":
-            for cat_feature in categorical_features_selected:
-                feature_id = feature_map.get(cat_feature)
-                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
-                    ensemble_rows.append({
-                        "feature_selection_id": feature_selection.id,
-                        "feature_id": feature_id,
-                        "method": "ensemble",
-                        "score": None,
-                        "pvalue": None,
-                        "support": 2,  # 2 = in final selection (categorical)
-                        "rank": None,  # No rank for categorical features added at the end
-                        "training_time": 0,
-                    })
-
         # Re-save all ensemble data with updated support values
         FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
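The second hunk keeps the support-flag promotion (2 = in the final selection; 1 appears to mean selected by the ensemble step) and drops the special-casing of categorical features. A tiny illustration of the promotion loop on made-up rows (hypothetical values, not the library's data model):

ensemble_rows = [
    {"feature_id": 1, "support": 1},  # ensemble-selected, will be promoted
    {"feature_id": 2, "support": 0},  # not selected, left unchanged
]
final_feature_ids = {1}

for row in ensemble_rows:
    if row["feature_id"] in final_feature_ids and row["support"] == 1:
        row["support"] = 2  # 2 = in final selection

print(ensemble_rows)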
lecrapaud/model_selection.py
CHANGED
@@ -55,8 +55,7 @@ from tensorboardX import SummaryWriter
 
 # Optimization
 import ray
-from ray.tune import Tuner, TuneConfig, with_parameters
-from ray.train import RunConfig
+from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
 from ray.tune.search.hyperopt import HyperOptSearch
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.logger import TBXLoggerCallback
@@ -1357,8 +1356,12 @@ class ModelSelectionEngine:
         """Choose between Ray Tune and HyperOpt standalone based on configuration."""
         if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
            return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
-
+        elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
            return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+        else:
+            raise ValueError(
+                f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
+            )
 
     def hyperoptimize_hyperopt(
         self, x_train, y_train, x_val, y_val, model: ModelEngine
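The dispatch hunk above now raises on unknown backend values instead of falling through. A standalone sketch of the same pattern, with simplified names rather than the ModelSelectionEngine API:

def pick_backend(backend: str) -> str:
    # Mirrors the new if/elif/else: only "hyperopt" and "ray" are valid values.
    if backend == "hyperopt":
        return "hyperopt standalone"
    elif backend == "ray":
        return "ray tune"
    else:
        raise ValueError(f"Invalid optimization backend: {backend}.")

print(pick_backend("hyperopt"))  # hyperopt standalone
try:
    pick_backend("optuna")
except ValueError as err:
    print(err)  # Invalid optimization backend: optuna.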
@@ -1746,11 +1749,11 @@ def evaluate(
         y_pred_proba = (
             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
         )
-        if num_classes > 2:
-            lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-            lb.fit(labels)
-            y_true_onhot = lb.transform(y_true)
-            y_pred_onehot = lb.transform(y_pred)
+        # if num_classes > 2:
+        #     lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+        #     lb.fit(labels)
+        #     y_true_onhot = lb.transform(y_true)
+        #     y_pred_onehot = lb.transform(y_pred)
 
         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
         score["ACCURACY"] = accuracy_score(y_true, y_pred)
{lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lecrapaud
-Version: 0.21.1
+Version: 0.21.2
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 License-File: LICENSE
@@ -218,7 +218,11 @@ context = {
     "val_size": 0.2,
     "test_size": 0.2,
     "pca_temporal": [
-        {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # Old format (still supported)
+        # {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # New simplified format - automatically creates lag columns
+        {"name": "LAST_20_RET", "column": "RET", "lags": 20},
+        {"name": "LAST_10_VOL", "column": "VOLUME", "lags": 10},
     ],
     "pca_cross_sectional": [
         {
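The simplified pca_temporal entry above is described as auto-creating lag columns from a base column. A hedged sketch of what {"name": "LAST_20_RET", "column": "RET", "lags": 20} plausibly expands to, reusing the RET_-i naming of the old format (an assumption; the library's exact implementation may differ, especially per-group handling for panel data):

import pandas as pd

df = pd.DataFrame({"RET": range(30)})
column, lags = "RET", 20
for i in range(1, lags + 1):
    df[f"{column}_-{i}"] = df[column].shift(i)  # RET_-1 ... RET_-20

print([c for c in df.columns if c.startswith("RET_-")][:3])  # ['RET_-1', 'RET_-2', 'RET_-3']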
@@ -255,11 +259,20 @@ experiment = app.create_experiment(data=your_dataframe, **context)
 
 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
 
-3. **PCA Time Series**:
+3. **PCA Time Series**:
+   - For time series data, both `pca_cross_sectional` and `pca_temporal` automatically use an expanding window approach with periodic refresh (default: every 90 days) to prevent data leakage.
+   - The system fits PCA only on historical data (lookback window of 365 days by default) and avoids look-ahead bias.
+   - For panel data (e.g., multiple stocks), lag features are created per group when using the simplified `pca_temporal` format.
+   - Missing PCA values are handled with forward-fill followed by zero-fill to ensure compatibility with downstream models.
 
-4. **
+4. **PCA Temporal Simplified Format**:
+   - Instead of manually listing lag columns: `{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ..., "RET_-20"]}`
+   - Use the simplified format: `{"name": "LAST_20_RET", "column": "RET", "lags": 20}`
+   - The system automatically creates the lag columns, handling panel data correctly with `group_column`.
 
-5. **
+5. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+
+6. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
 
 
 
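Note 6 in the README hunk above says models_idx accepts mixed integer and string identifiers. A minimal, hedged illustration of such a context entry (the exact values are assumptions; only the key name and the mixed int/string form come from the README):

context = {
    # Per note 6: integer indices and string model names can be mixed.
    "models_idx": [0, "xgb", "lgb", "catboost"],
}
print(context["models_idx"])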
{lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 lecrapaud/__init__.py,sha256=7Wp_VF08UZP8o-GkpB4_yRjP4twQmpcTc3202OkPmHs,176
 lecrapaud/api.py,sha256=7OL_wbg9hCmlZ0WI6eCDkublntES3f320OZlpuKu8f4,22376
-lecrapaud/config.py,sha256=
+lecrapaud/config.py,sha256=7kwV9kpglFX79YC3fKcANawWJMYYi7SGaVShNsmO4EQ,1175
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
 lecrapaud/db/alembic/env.py,sha256=RvTTBa3bDVBxmDtapAfzUoeWBgmVQU3s9U6HmQCAP84,2421
@@ -31,7 +31,7 @@ lecrapaud/db/session.py,sha256=u9NCwUoV5VbtScRb6HOSQr4oTEjIwj0waP5mGlc1qJg,3735
 lecrapaud/directories.py,sha256=0LrANuDgbuneSLker60c6q2hmGnQ3mKHIztTGzTx6Gw,826
 lecrapaud/experiment.py,sha256=LiecZS3P4igO_3nJ4IB-2b25CttQS2RePDnhBNucvdE,2478
 lecrapaud/feature_engineering.py,sha256=SvGrJXv24rVgH0QE5mRwJITcCLfUqgbV2Ep68bBVnJs,58794
-lecrapaud/feature_selection.py,sha256=
+lecrapaud/feature_selection.py,sha256=vzL-eklVZl-tHIwqTy4Yg9kYpwOTCoM72IrFoJyDmg8,47203
 lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
 lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
 lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
@@ -41,10 +41,10 @@ lecrapaud/misc/tabpfn_tests.ipynb,sha256=VkgsCUJ30d8jaL2VaWtQAgb8ngHPNtPgnXLs7QQ
 lecrapaud/misc/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
 lecrapaud/misc/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
 lecrapaud/misc/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
-lecrapaud/model_selection.py,sha256=
+lecrapaud/model_selection.py,sha256=VL-JQGY-dRsFgfuRob_-lt9bFyex_PyAUJsPHqr453c,88187
 lecrapaud/search_space.py,sha256=caCehJklD3-sgmlisJj_GmuB7LJiVvTF71gEjPGDvV4,36336
 lecrapaud/utils.py,sha256=0k76HFETO0_NgCYUv8b3RTBLgry6MsDBaHJfpAplxCY,8855
-lecrapaud-0.21.
-lecrapaud-0.21.
-lecrapaud-0.21.
-lecrapaud-0.21.
+lecrapaud-0.21.2.dist-info/METADATA,sha256=5e2V3i21uDdh9fnrdc5MZUQ7EZMJgR8mRKnvGzqwmZw,15337
+lecrapaud-0.21.2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+lecrapaud-0.21.2.dist-info/licenses/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.21.2.dist-info/RECORD,,
{lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/WHEEL
File without changes

{lecrapaud-0.21.1.dist-info → lecrapaud-0.21.2.dist-info}/licenses/LICENSE
File without changes