mcli-framework 7.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/mlops/pipeline_orchestrator.py
@@ -0,0 +1,614 @@
"""End-to-end ML pipeline orchestrator"""

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))

from typing import Dict, Any, Optional, List, Callable, Union
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import logging
from datetime import datetime
import json
import pickle

from ml.preprocessing.data_processor import DataProcessor, ProcessingConfig
from ml.features.stock_features import StockRecommendationFeatures
from ml.features.political_features import PoliticalInfluenceFeatures
from ml.features.ensemble_features import EnsembleFeatureBuilder
from ml.features.recommendation_engine import StockRecommendationEngine, RecommendationConfig as FeatureRecommendationConfig
from ml.models.ensemble_models import DeepEnsembleModel, EnsembleConfig, ModelConfig, EnsembleTrainer
from ml.models.recommendation_models import StockRecommendationModel, RecommendationConfig, RecommendationTrainer
from .experiment_tracker import ExperimentTracker, MLflowConfig

logger = logging.getLogger(__name__)


class PipelineStage(Enum):
    """Pipeline execution stages"""
    DATA_INGESTION = "data_ingestion"
    DATA_PREPROCESSING = "data_preprocessing"
    FEATURE_ENGINEERING = "feature_engineering"
    MODEL_TRAINING = "model_training"
    MODEL_EVALUATION = "model_evaluation"
    MODEL_DEPLOYMENT = "model_deployment"


@dataclass
class PipelineStep:
    """Individual pipeline step configuration"""
    name: str
    stage: PipelineStage
    function: Callable
    inputs: Dict[str, Any] = field(default_factory=dict)
    outputs: List[str] = field(default_factory=list)
    config: Optional[Dict[str, Any]] = None
    enabled: bool = True
    retry_count: int = 3
    timeout: Optional[int] = None  # seconds


@dataclass
class PipelineConfig:
    """Complete pipeline configuration"""
    name: str = "politician-trading-ml-pipeline"
    version: str = "1.0.0"
    data_dir: Path = Path("data")
    model_dir: Path = Path("models")
    output_dir: Path = Path("outputs")
    cache_dir: Path = Path("cache")
    enable_mlflow: bool = True
    mlflow_config: Optional[MLflowConfig] = None
    enable_caching: bool = True
    parallel_execution: bool = False
    checkpoint_frequency: int = 5  # Save checkpoint every N steps
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        # Create directories
        for dir_path in [self.data_dir, self.model_dir, self.output_dir, self.cache_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)

        if self.enable_mlflow and not self.mlflow_config:
            self.mlflow_config = MLflowConfig()


class MLPipeline:
    """End-to-end ML pipeline orchestrator"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.steps: List[PipelineStep] = []
        self.artifacts: Dict[str, Any] = {}
        self.metrics: Dict[str, float] = {}
        self.experiment_tracker = None
        self.current_step = None

        if config.enable_mlflow:
            self.experiment_tracker = ExperimentTracker(config.mlflow_config)

        # Initialize components
        self.data_processor = None
        self.feature_extractors = {}
        self.model = None
        self.trainer = None

        self._setup_default_pipeline()

    def _setup_default_pipeline(self):
        """Setup default pipeline steps"""
        # Data ingestion
        self.add_step(PipelineStep(
            name="load_raw_data",
            stage=PipelineStage.DATA_INGESTION,
            function=self._load_raw_data,
            outputs=["raw_trading_data", "raw_stock_data"]
        ))

        # Data preprocessing
        self.add_step(PipelineStep(
            name="preprocess_data",
            stage=PipelineStage.DATA_PREPROCESSING,
            function=self._preprocess_data,
            inputs={"trading_data": "raw_trading_data", "stock_data": "raw_stock_data"},
            outputs=["processed_trading_data", "processed_stock_data"]
        ))

        # Feature engineering
        self.add_step(PipelineStep(
            name="extract_features",
            stage=PipelineStage.FEATURE_ENGINEERING,
            function=self._extract_features,
            inputs={"trading_data": "processed_trading_data", "stock_data": "processed_stock_data"},
            outputs=["feature_matrix", "feature_names", "labels"]
        ))

        # Model training
        self.add_step(PipelineStep(
            name="train_model",
            stage=PipelineStage.MODEL_TRAINING,
            function=self._train_model,
            inputs={"X": "feature_matrix", "y": "labels"},
            outputs=["trained_model", "training_metrics"]
        ))

        # Model evaluation
        self.add_step(PipelineStep(
            name="evaluate_model",
            stage=PipelineStage.MODEL_EVALUATION,
            function=self._evaluate_model,
            inputs={"model": "trained_model", "X_test": "test_features", "y_test": "test_labels"},
            outputs=["evaluation_metrics", "predictions"]
        ))

        # Model deployment
        self.add_step(PipelineStep(
            name="deploy_model",
            stage=PipelineStage.MODEL_DEPLOYMENT,
            function=self._deploy_model,
            inputs={"model": "trained_model", "metrics": "evaluation_metrics"},
            outputs=["deployment_info"]
        ))

    def add_step(self, step: PipelineStep):
        """Add step to pipeline"""
        self.steps.append(step)
        logger.debug(f"Added pipeline step: {step.name}")

    def _load_raw_data(self) -> Dict[str, pd.DataFrame]:
        """Load raw data from sources"""
        logger.info("Loading raw data...")

        # Load politician trading data
        trading_data_path = self.config.data_dir / "politician_trades.csv"
        if trading_data_path.exists():
            trading_data = pd.read_csv(trading_data_path)
        else:
            # Generate mock data for testing
            trading_data = self._generate_mock_trading_data()

        # Load stock price data
        stock_data_path = self.config.data_dir / "stock_prices.csv"
        if stock_data_path.exists():
            stock_data = pd.read_csv(stock_data_path)
        else:
            # Generate mock data for testing
            stock_data = self._generate_mock_stock_data()

        logger.info(f"Loaded {len(trading_data)} trading records and {len(stock_data)} stock prices")

        return {
            "raw_trading_data": trading_data,
            "raw_stock_data": stock_data
        }

    def _preprocess_data(self, trading_data: pd.DataFrame,
                         stock_data: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Preprocess raw data"""
        logger.info("Preprocessing data...")

        # Initialize data processor
        processing_config = ProcessingConfig()
        self.data_processor = DataProcessor(processing_config)

        # Process trading data
        processed_trading = self.data_processor.process_politician_trades(trading_data)

        # Process stock data (ensure proper format)
        processed_stock = stock_data.copy()
        if 'date' in processed_stock.columns and processed_stock['date'].dtype == 'object':
            processed_stock['date'] = pd.to_datetime(processed_stock['date'])

        # Clean and validate
        processed_trading = self.data_processor.clean_data(processed_trading)
        processed_stock = self.data_processor.clean_data(processed_stock)

        logger.info(f"Preprocessed {len(processed_trading)} trading records")

        return {
            "processed_trading_data": processed_trading,
            "processed_stock_data": processed_stock
        }

    def _extract_features(self, trading_data: pd.DataFrame,
                          stock_data: pd.DataFrame) -> Dict[str, Any]:
        """Extract features from preprocessed data"""
        logger.info("Extracting features...")

        # Initialize feature extractors
        stock_extractor = StockRecommendationFeatures()
        political_extractor = PoliticalInfluenceFeatures()
        ensemble_builder = EnsembleFeatureBuilder()

        # Extract stock features
        stock_features = pd.DataFrame()
        if not stock_data.empty:
            try:
                stock_features = stock_extractor.extract_features(stock_data)
            except Exception as e:
                logger.warning(f"Could not extract stock features: {e}")

        # Extract political features
        political_features = political_extractor.extract_influence_features(trading_data)

        # Combine features
        if not stock_features.empty:
            feature_df = pd.concat([political_features, stock_features], axis=1)
        else:
            feature_df = political_features

        # Build ensemble features
        feature_df = ensemble_builder.build_ensemble_features(feature_df)

        # Create labels (simplified - would be based on actual returns)
        labels = np.random.randint(0, 2, len(feature_df))

        # Store feature names
        feature_names = feature_df.columns.tolist()

        logger.info(f"Extracted {len(feature_names)} features from {len(feature_df)} samples")

        return {
            "feature_matrix": feature_df.values,
            "feature_names": feature_names,
            "labels": labels
        }

    def _train_model(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
        """Train ensemble model"""
        logger.info("Training model...")

        # Split data
        train_size = int(0.8 * len(X))
        X_train, X_val = X[:train_size], X[train_size:]
        y_train, y_val = y[:train_size], y[train_size:]

        # Store test data for evaluation
        self.artifacts["test_features"] = X_val
        self.artifacts["test_labels"] = y_val

        # Configure ensemble
        model_configs = [
            ModelConfig(
                model_type="mlp",
                hidden_dims=[256, 128],
                dropout_rate=0.3,
                learning_rate=0.001,
                weight_decay=1e-4,
                batch_size=32,
                epochs=10
            ),
            ModelConfig(
                model_type="attention",
                hidden_dims=[128],
                dropout_rate=0.2,
                learning_rate=0.001,
                weight_decay=1e-4,
                batch_size=32,
                epochs=10
            )
        ]

        ensemble_config = EnsembleConfig(
            base_models=model_configs,
            ensemble_method="weighted_average"
        )

        recommendation_config = RecommendationConfig(
            ensemble_config=ensemble_config,
            risk_adjustment=True,
            confidence_threshold=0.6
        )

        # Create and train model
        input_dim = X.shape[1]
        self.model = StockRecommendationModel(input_dim, recommendation_config)

        # Generate mock risk and return labels for training
        returns_train = np.random.normal(0.05, 0.15, len(y_train))
        risk_labels_train = np.random.choice([0, 1, 2], len(y_train), p=[0.3, 0.5, 0.2])
        returns_val = np.random.normal(0.05, 0.15, len(y_val))
        risk_labels_val = np.random.choice([0, 1, 2], len(y_val), p=[0.3, 0.5, 0.2])

        # Train model
        trainer = RecommendationTrainer(self.model, recommendation_config)
        result = trainer.train(
            X_train, y_train, returns_train, risk_labels_train,
            X_val, y_val, returns_val, risk_labels_val,
            epochs=10, batch_size=32
        )

        # Extract metrics
        training_metrics = {
            "train_accuracy": result.train_metrics.accuracy,
            "train_precision": result.train_metrics.precision,
            "train_recall": result.train_metrics.recall,
            "train_f1": result.train_metrics.f1_score,
            "val_accuracy": result.val_metrics.accuracy,
            "val_precision": result.val_metrics.precision,
            "val_recall": result.val_metrics.recall,
            "val_f1": result.val_metrics.f1_score
        }

        logger.info(f"Model trained - Val accuracy: {training_metrics['val_accuracy']:.3f}")

        return {
            "trained_model": self.model,
            "training_metrics": training_metrics
        }

    def _evaluate_model(self, model: StockRecommendationModel,
                        X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
        """Evaluate trained model"""
        logger.info("Evaluating model...")

        # Generate predictions
        predictions = model.predict(X_test)
        probabilities = model.predict_proba(X_test)

        # Calculate metrics
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

        evaluation_metrics = {
            "test_accuracy": accuracy_score(y_test, predictions),
            "test_precision": precision_score(y_test, predictions, average='weighted', zero_division=0),
            "test_recall": recall_score(y_test, predictions, average='weighted', zero_division=0),
            "test_f1": f1_score(y_test, predictions, average='weighted', zero_division=0)
        }

        # Calculate AUC if binary classification
        if probabilities.shape[1] == 2:
            try:
                evaluation_metrics["test_auc"] = roc_auc_score(y_test, probabilities[:, 1])
            except:
                pass

        logger.info(f"Model evaluation - Test accuracy: {evaluation_metrics['test_accuracy']:.3f}")

        return {
            "evaluation_metrics": evaluation_metrics,
            "predictions": predictions
        }

    def _deploy_model(self, model: StockRecommendationModel,
                      metrics: Dict[str, float]) -> Dict[str, Any]:
        """Deploy model (save to disk)"""
        logger.info("Deploying model...")

        # Save model
        model_path = self.config.model_dir / f"model_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pt"
        torch.save({
            'model_state_dict': model.state_dict(),
            'metrics': metrics,
            'config': model.recommendation_config
        }, model_path)

        deployment_info = {
            "model_path": str(model_path),
            "deployed_at": datetime.now().isoformat(),
            "metrics": metrics
        }

        logger.info(f"Model deployed to {model_path}")
        return {"deployment_info": deployment_info}

    def run(self, start_step: Optional[str] = None,
            end_step: Optional[str] = None) -> Dict[str, Any]:
        """Execute pipeline"""
        logger.info(f"Starting pipeline: {self.config.name} v{self.config.version}")

        # Start MLflow run if enabled
        if self.experiment_tracker:
            run_name = f"{self.config.name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            self.experiment_tracker.start_run(run_name, self.config.tags)

            # Log pipeline config
            self.experiment_tracker.log_params({
                "pipeline_name": self.config.name,
                "pipeline_version": self.config.version,
                "enable_caching": self.config.enable_caching,
                "parallel_execution": self.config.parallel_execution
            })

        # Execute steps
        start_idx = 0
        end_idx = len(self.steps)

        if start_step:
            start_idx = next((i for i, s in enumerate(self.steps) if s.name == start_step), 0)
        if end_step:
            end_idx = next((i+1 for i, s in enumerate(self.steps) if s.name == end_step), len(self.steps))

        for i, step in enumerate(self.steps[start_idx:end_idx], start=start_idx):
            if not step.enabled:
                logger.info(f"Skipping disabled step: {step.name}")
                continue

            self.current_step = step
            logger.info(f"Executing step {i+1}/{len(self.steps)}: {step.name}")

            try:
                # Prepare inputs
                kwargs = {}
                for param, artifact_key in step.inputs.items():
                    if artifact_key in self.artifacts:
                        kwargs[param] = self.artifacts[artifact_key]

                # Execute step
                result = step.function(**kwargs)

                # Store outputs
                if result and step.outputs:
                    if len(step.outputs) == 1:
                        self.artifacts[step.outputs[0]] = result
                    else:
                        for output_key, value in result.items():
                            if output_key in step.outputs:
                                self.artifacts[output_key] = value

                # Log to MLflow
                if self.experiment_tracker and "metrics" in str(result):
                    if isinstance(result, dict) and any("metric" in k for k in result.keys()):
                        metrics_dict = result.get("training_metrics", result.get("evaluation_metrics", {}))
                        self.experiment_tracker.log_metrics(metrics_dict)

                # Checkpoint if needed
                if (i + 1) % self.config.checkpoint_frequency == 0:
                    self._save_checkpoint(i + 1)

            except Exception as e:
                logger.error(f"Step {step.name} failed: {e}")
                if self.experiment_tracker:
                    self.experiment_tracker.end_run(status="FAILED")
                raise

        # Log final model to MLflow
        if self.experiment_tracker and self.model:
            try:
                example_input = pd.DataFrame(
                    self.artifacts.get("feature_matrix", np.random.randn(5, 100))[:5]
                )
                self.experiment_tracker.log_model(
                    self.model,
                    "recommendation_model",
                    input_example=example_input
                )
            except Exception as e:
                logger.warning(f"Could not log model to MLflow: {e}")

        # End MLflow run
        if self.experiment_tracker:
            self.experiment_tracker.end_run(status="FINISHED")

        logger.info("Pipeline execution completed successfully")

        return {
            "artifacts": self.artifacts,
            "metrics": self.metrics,
            "model": self.model
        }

    def _save_checkpoint(self, step_number: int):
        """Save pipeline checkpoint"""
        checkpoint_path = self.config.cache_dir / f"checkpoint_step_{step_number}.pkl"

        checkpoint = {
            "step_number": step_number,
            "artifacts": {k: v for k, v in self.artifacts.items()
                          if not isinstance(v, (torch.nn.Module, type))},
            "metrics": self.metrics,
            "timestamp": datetime.now()
        }

        with open(checkpoint_path, 'wb') as f:
            pickle.dump(checkpoint, f)

        logger.debug(f"Saved checkpoint at step {step_number}")

    def load_checkpoint(self, checkpoint_path: Path):
        """Load pipeline checkpoint"""
        with open(checkpoint_path, 'rb') as f:
            checkpoint = pickle.load(f)

        self.artifacts.update(checkpoint["artifacts"])
        self.metrics.update(checkpoint["metrics"])

        logger.info(f"Loaded checkpoint from step {checkpoint['step_number']}")

    def _generate_mock_trading_data(self) -> pd.DataFrame:
        """Generate mock politician trading data for testing"""
        np.random.seed(42)
        n_records = 500

        politicians = ["Nancy Pelosi", "Mitch McConnell", "Chuck Schumer", "Kevin McCarthy"]
        tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

        data = []
        for _ in range(n_records):
            data.append({
                "politician_name_cleaned": np.random.choice(politicians),
                "transaction_date_cleaned": pd.Timestamp.now() - pd.Timedelta(days=np.random.randint(1, 365)),
                "transaction_amount_cleaned": np.random.uniform(1000, 500000),
                "transaction_type_cleaned": np.random.choice(["buy", "sell"]),
                "ticker_cleaned": np.random.choice(tickers)
            })

        return pd.DataFrame(data)

    def _generate_mock_stock_data(self) -> pd.DataFrame:
        """Generate mock stock price data for testing"""
        np.random.seed(42)
        tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]
        dates = pd.date_range(end=pd.Timestamp.now(), periods=100)

        data = []
        for ticker in tickers:
            base_price = np.random.uniform(100, 500)
            for date in dates:
                price = base_price * (1 + np.random.normal(0, 0.02))
                data.append({
                    "symbol": ticker,
                    "date": date,
                    "close": price,
                    "volume": np.random.randint(1000000, 10000000),
                    "open": price * 0.99,
                    "high": price * 1.01,
                    "low": price * 0.98
                })

        return pd.DataFrame(data)


class PipelineExecutor:
    """Execute and manage multiple pipeline runs"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.pipelines: Dict[str, MLPipeline] = {}

    def create_pipeline(self, name: str) -> MLPipeline:
        """Create new pipeline instance"""
        pipeline = MLPipeline(self.config)
        self.pipelines[name] = pipeline
        return pipeline

    def run_pipeline(self, name: str, **kwargs) -> Dict[str, Any]:
        """Run specific pipeline"""
        if name not in self.pipelines:
            self.pipelines[name] = MLPipeline(self.config)

        return self.pipelines[name].run(**kwargs)

    def run_experiment(self, n_runs: int = 5,
                       param_grid: Optional[Dict[str, List]] = None) -> pd.DataFrame:
        """Run multiple experiments with different parameters"""
        results = []

        for i in range(n_runs):
            logger.info(f"Running experiment {i+1}/{n_runs}")

            # Create new pipeline for each run
            pipeline_name = f"experiment_{i}"
            pipeline = self.create_pipeline(pipeline_name)

            # Modify parameters if grid provided
            if param_grid:
                # Simple parameter modification (would be more sophisticated in practice)
                pass

            # Run pipeline
            result = pipeline.run()

            # Collect metrics
            run_metrics = {
                "run_id": i,
                "pipeline_name": pipeline_name,
                **result.get("metrics", {})
            }
            results.append(run_metrics)

        return pd.DataFrame(results)