mcli_framework-7.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/mlops/pipeline_orchestrator.py
@@ -0,0 +1,614 @@
+ """End-to-end ML pipeline orchestrator"""
+
+ import sys
+ import os
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))
+
+ from typing import Dict, Any, Optional, List, Callable, Union
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from pathlib import Path
+ import pandas as pd
+ import numpy as np
+ import torch
+ import logging
+ from datetime import datetime
+ import json
+ import pickle
+
+ from ml.preprocessing.data_processor import DataProcessor, ProcessingConfig
+ from ml.features.stock_features import StockRecommendationFeatures
+ from ml.features.political_features import PoliticalInfluenceFeatures
+ from ml.features.ensemble_features import EnsembleFeatureBuilder
+ from ml.features.recommendation_engine import StockRecommendationEngine, RecommendationConfig as FeatureRecommendationConfig
+ from ml.models.ensemble_models import DeepEnsembleModel, EnsembleConfig, ModelConfig, EnsembleTrainer
+ from ml.models.recommendation_models import StockRecommendationModel, RecommendationConfig, RecommendationTrainer
+ from .experiment_tracker import ExperimentTracker, MLflowConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class PipelineStage(Enum):
+     """Pipeline execution stages"""
+     DATA_INGESTION = "data_ingestion"
+     DATA_PREPROCESSING = "data_preprocessing"
+     FEATURE_ENGINEERING = "feature_engineering"
+     MODEL_TRAINING = "model_training"
+     MODEL_EVALUATION = "model_evaluation"
+     MODEL_DEPLOYMENT = "model_deployment"
+
+
+ @dataclass
+ class PipelineStep:
+     """Individual pipeline step configuration"""
+     name: str
+     stage: PipelineStage
+     function: Callable
+     inputs: Dict[str, Any] = field(default_factory=dict)
+     outputs: List[str] = field(default_factory=list)
+     config: Optional[Dict[str, Any]] = None
+     enabled: bool = True
+     retry_count: int = 3
+     timeout: Optional[int] = None  # seconds
+
+
+ @dataclass
+ class PipelineConfig:
+     """Complete pipeline configuration"""
+     name: str = "politician-trading-ml-pipeline"
+     version: str = "1.0.0"
+     data_dir: Path = Path("data")
+     model_dir: Path = Path("models")
+     output_dir: Path = Path("outputs")
+     cache_dir: Path = Path("cache")
+     enable_mlflow: bool = True
+     mlflow_config: Optional[MLflowConfig] = None
+     enable_caching: bool = True
+     parallel_execution: bool = False
+     checkpoint_frequency: int = 5  # Save checkpoint every N steps
+     tags: Dict[str, str] = field(default_factory=dict)
+
+     def __post_init__(self):
+         # Create directories
+         for dir_path in [self.data_dir, self.model_dir, self.output_dir, self.cache_dir]:
+             dir_path.mkdir(parents=True, exist_ok=True)
+
+         if self.enable_mlflow and not self.mlflow_config:
+             self.mlflow_config = MLflowConfig()
+
+
+ class MLPipeline:
+     """End-to-end ML pipeline orchestrator"""
+
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.steps: List[PipelineStep] = []
+         self.artifacts: Dict[str, Any] = {}
+         self.metrics: Dict[str, float] = {}
+         self.experiment_tracker = None
+         self.current_step = None
+
+         if config.enable_mlflow:
+             self.experiment_tracker = ExperimentTracker(config.mlflow_config)
+
+         # Initialize components
+         self.data_processor = None
+         self.feature_extractors = {}
+         self.model = None
+         self.trainer = None
+
+         self._setup_default_pipeline()
+
+     def _setup_default_pipeline(self):
+         """Setup default pipeline steps"""
+         # Data ingestion
+         self.add_step(PipelineStep(
+             name="load_raw_data",
+             stage=PipelineStage.DATA_INGESTION,
+             function=self._load_raw_data,
+             outputs=["raw_trading_data", "raw_stock_data"]
+         ))
+
+         # Data preprocessing
+         self.add_step(PipelineStep(
+             name="preprocess_data",
+             stage=PipelineStage.DATA_PREPROCESSING,
+             function=self._preprocess_data,
+             inputs={"trading_data": "raw_trading_data", "stock_data": "raw_stock_data"},
+             outputs=["processed_trading_data", "processed_stock_data"]
+         ))
+
+         # Feature engineering
+         self.add_step(PipelineStep(
+             name="extract_features",
+             stage=PipelineStage.FEATURE_ENGINEERING,
+             function=self._extract_features,
+             inputs={"trading_data": "processed_trading_data", "stock_data": "processed_stock_data"},
+             outputs=["feature_matrix", "feature_names", "labels"]
+         ))
+
+         # Model training
+         self.add_step(PipelineStep(
+             name="train_model",
+             stage=PipelineStage.MODEL_TRAINING,
+             function=self._train_model,
+             inputs={"X": "feature_matrix", "y": "labels"},
+             outputs=["trained_model", "training_metrics"]
+         ))
+
+         # Model evaluation
+         self.add_step(PipelineStep(
+             name="evaluate_model",
+             stage=PipelineStage.MODEL_EVALUATION,
+             function=self._evaluate_model,
+             inputs={"model": "trained_model", "X_test": "test_features", "y_test": "test_labels"},
+             outputs=["evaluation_metrics", "predictions"]
+         ))
+
+         # Model deployment
+         self.add_step(PipelineStep(
+             name="deploy_model",
+             stage=PipelineStage.MODEL_DEPLOYMENT,
+             function=self._deploy_model,
+             inputs={"model": "trained_model", "metrics": "evaluation_metrics"},
+             outputs=["deployment_info"]
+         ))
+
+     def add_step(self, step: PipelineStep):
+         """Add step to pipeline"""
+         self.steps.append(step)
+         logger.debug(f"Added pipeline step: {step.name}")
+
+     def _load_raw_data(self) -> Dict[str, pd.DataFrame]:
+         """Load raw data from sources"""
+         logger.info("Loading raw data...")
+
+         # Load politician trading data
+         trading_data_path = self.config.data_dir / "politician_trades.csv"
+         if trading_data_path.exists():
+             trading_data = pd.read_csv(trading_data_path)
+         else:
+             # Generate mock data for testing
+             trading_data = self._generate_mock_trading_data()
+
+         # Load stock price data
+         stock_data_path = self.config.data_dir / "stock_prices.csv"
+         if stock_data_path.exists():
+             stock_data = pd.read_csv(stock_data_path)
+         else:
+             # Generate mock data for testing
+             stock_data = self._generate_mock_stock_data()
+
+         logger.info(f"Loaded {len(trading_data)} trading records and {len(stock_data)} stock prices")
+
+         return {
+             "raw_trading_data": trading_data,
+             "raw_stock_data": stock_data
+         }
+
+     def _preprocess_data(self, trading_data: pd.DataFrame,
+                          stock_data: pd.DataFrame) -> Dict[str, pd.DataFrame]:
+         """Preprocess raw data"""
+         logger.info("Preprocessing data...")
+
+         # Initialize data processor
+         processing_config = ProcessingConfig()
+         self.data_processor = DataProcessor(processing_config)
+
+         # Process trading data
+         processed_trading = self.data_processor.process_politician_trades(trading_data)
+
+         # Process stock data (ensure proper format)
+         processed_stock = stock_data.copy()
+         if 'date' in processed_stock.columns and processed_stock['date'].dtype == 'object':
+             processed_stock['date'] = pd.to_datetime(processed_stock['date'])
+
+         # Clean and validate
+         processed_trading = self.data_processor.clean_data(processed_trading)
+         processed_stock = self.data_processor.clean_data(processed_stock)
+
+         logger.info(f"Preprocessed {len(processed_trading)} trading records")
+
+         return {
+             "processed_trading_data": processed_trading,
+             "processed_stock_data": processed_stock
+         }
+
+     def _extract_features(self, trading_data: pd.DataFrame,
+                           stock_data: pd.DataFrame) -> Dict[str, Any]:
+         """Extract features from preprocessed data"""
+         logger.info("Extracting features...")
+
+         # Initialize feature extractors
+         stock_extractor = StockRecommendationFeatures()
+         political_extractor = PoliticalInfluenceFeatures()
+         ensemble_builder = EnsembleFeatureBuilder()
+
+         # Extract stock features
+         stock_features = pd.DataFrame()
+         if not stock_data.empty:
+             try:
+                 stock_features = stock_extractor.extract_features(stock_data)
+             except Exception as e:
+                 logger.warning(f"Could not extract stock features: {e}")
+
+         # Extract political features
+         political_features = political_extractor.extract_influence_features(trading_data)
+
+         # Combine features
+         if not stock_features.empty:
+             feature_df = pd.concat([political_features, stock_features], axis=1)
+         else:
+             feature_df = political_features
+
+         # Build ensemble features
+         feature_df = ensemble_builder.build_ensemble_features(feature_df)
+
+         # Create labels (simplified - would be based on actual returns)
+         labels = np.random.randint(0, 2, len(feature_df))
+
+         # Store feature names
+         feature_names = feature_df.columns.tolist()
+
+         logger.info(f"Extracted {len(feature_names)} features from {len(feature_df)} samples")
+
+         return {
+             "feature_matrix": feature_df.values,
+             "feature_names": feature_names,
+             "labels": labels
+         }
+
+     def _train_model(self, X: np.ndarray, y: np.ndarray) -> Dict[str, Any]:
+         """Train ensemble model"""
+         logger.info("Training model...")
+
+         # Split data
+         train_size = int(0.8 * len(X))
+         X_train, X_val = X[:train_size], X[train_size:]
+         y_train, y_val = y[:train_size], y[train_size:]
+
+         # Store test data for evaluation
+         self.artifacts["test_features"] = X_val
+         self.artifacts["test_labels"] = y_val
+
+         # Configure ensemble
+         model_configs = [
+             ModelConfig(
+                 model_type="mlp",
+                 hidden_dims=[256, 128],
+                 dropout_rate=0.3,
+                 learning_rate=0.001,
+                 weight_decay=1e-4,
+                 batch_size=32,
+                 epochs=10
+             ),
+             ModelConfig(
+                 model_type="attention",
+                 hidden_dims=[128],
+                 dropout_rate=0.2,
+                 learning_rate=0.001,
+                 weight_decay=1e-4,
+                 batch_size=32,
+                 epochs=10
+             )
+         ]
+
+         ensemble_config = EnsembleConfig(
+             base_models=model_configs,
+             ensemble_method="weighted_average"
+         )
+
+         recommendation_config = RecommendationConfig(
+             ensemble_config=ensemble_config,
+             risk_adjustment=True,
+             confidence_threshold=0.6
+         )
+
+         # Create and train model
+         input_dim = X.shape[1]
+         self.model = StockRecommendationModel(input_dim, recommendation_config)
+
+         # Generate mock risk and return labels for training
+         returns_train = np.random.normal(0.05, 0.15, len(y_train))
+         risk_labels_train = np.random.choice([0, 1, 2], len(y_train), p=[0.3, 0.5, 0.2])
+         returns_val = np.random.normal(0.05, 0.15, len(y_val))
+         risk_labels_val = np.random.choice([0, 1, 2], len(y_val), p=[0.3, 0.5, 0.2])
+
+         # Train model
+         trainer = RecommendationTrainer(self.model, recommendation_config)
+         result = trainer.train(
+             X_train, y_train, returns_train, risk_labels_train,
+             X_val, y_val, returns_val, risk_labels_val,
+             epochs=10, batch_size=32
+         )
+
+         # Extract metrics
+         training_metrics = {
+             "train_accuracy": result.train_metrics.accuracy,
+             "train_precision": result.train_metrics.precision,
+             "train_recall": result.train_metrics.recall,
+             "train_f1": result.train_metrics.f1_score,
+             "val_accuracy": result.val_metrics.accuracy,
+             "val_precision": result.val_metrics.precision,
+             "val_recall": result.val_metrics.recall,
+             "val_f1": result.val_metrics.f1_score
+         }
+
+         logger.info(f"Model trained - Val accuracy: {training_metrics['val_accuracy']:.3f}")
+
+         return {
+             "trained_model": self.model,
+             "training_metrics": training_metrics
+         }
+
+     def _evaluate_model(self, model: StockRecommendationModel,
+                         X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
+         """Evaluate trained model"""
+         logger.info("Evaluating model...")
+
+         # Generate predictions
+         predictions = model.predict(X_test)
+         probabilities = model.predict_proba(X_test)
+
+         # Calculate metrics
+         from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+
+         evaluation_metrics = {
+             "test_accuracy": accuracy_score(y_test, predictions),
+             "test_precision": precision_score(y_test, predictions, average='weighted', zero_division=0),
+             "test_recall": recall_score(y_test, predictions, average='weighted', zero_division=0),
+             "test_f1": f1_score(y_test, predictions, average='weighted', zero_division=0)
+         }
+
+         # Calculate AUC if binary classification
+         if probabilities.shape[1] == 2:
+             try:
+                 evaluation_metrics["test_auc"] = roc_auc_score(y_test, probabilities[:, 1])
+             except ValueError:
+                 # AUC is undefined when y_test contains a single class
+                 pass
+
+         logger.info(f"Model evaluation - Test accuracy: {evaluation_metrics['test_accuracy']:.3f}")
+
+         return {
+             "evaluation_metrics": evaluation_metrics,
+             "predictions": predictions
+         }
+
+     def _deploy_model(self, model: StockRecommendationModel,
+                       metrics: Dict[str, float]) -> Dict[str, Any]:
+         """Deploy model (save to disk)"""
+         logger.info("Deploying model...")
+
+         # Save model
+         model_path = self.config.model_dir / f"model_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pt"
+         torch.save({
+             'model_state_dict': model.state_dict(),
+             'metrics': metrics,
+             'config': model.recommendation_config
+         }, model_path)
+
+         deployment_info = {
+             "model_path": str(model_path),
+             "deployed_at": datetime.now().isoformat(),
+             "metrics": metrics
+         }
+
+         logger.info(f"Model deployed to {model_path}")
+         return {"deployment_info": deployment_info}
+
+     def run(self, start_step: Optional[str] = None,
+             end_step: Optional[str] = None) -> Dict[str, Any]:
+         """Execute pipeline"""
+         logger.info(f"Starting pipeline: {self.config.name} v{self.config.version}")
+
+         # Start MLflow run if enabled
+         if self.experiment_tracker:
+             run_name = f"{self.config.name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+             self.experiment_tracker.start_run(run_name, self.config.tags)
+
+             # Log pipeline config
+             self.experiment_tracker.log_params({
+                 "pipeline_name": self.config.name,
+                 "pipeline_version": self.config.version,
+                 "enable_caching": self.config.enable_caching,
+                 "parallel_execution": self.config.parallel_execution
+             })
+
+         # Execute steps
+         start_idx = 0
+         end_idx = len(self.steps)
+
+         if start_step:
+             start_idx = next((i for i, s in enumerate(self.steps) if s.name == start_step), 0)
+         if end_step:
+             end_idx = next((i+1 for i, s in enumerate(self.steps) if s.name == end_step), len(self.steps))
+
+         for i, step in enumerate(self.steps[start_idx:end_idx], start=start_idx):
+             if not step.enabled:
+                 logger.info(f"Skipping disabled step: {step.name}")
+                 continue
+
+             self.current_step = step
+             logger.info(f"Executing step {i+1}/{len(self.steps)}: {step.name}")
+
+             try:
+                 # Prepare inputs
+                 kwargs = {}
+                 for param, artifact_key in step.inputs.items():
+                     if artifact_key in self.artifacts:
+                         kwargs[param] = self.artifacts[artifact_key]
+
+                 # Execute step
+                 result = step.function(**kwargs)
+
+                 # Store outputs
+                 if result and step.outputs:
+                     if len(step.outputs) == 1:
+                         self.artifacts[step.outputs[0]] = result
+                     else:
+                         for output_key, value in result.items():
+                             if output_key in step.outputs:
+                                 self.artifacts[output_key] = value
+
+                 # Log to MLflow
+                 if self.experiment_tracker and "metrics" in str(result):
+                     if isinstance(result, dict) and any("metric" in k for k in result.keys()):
+                         metrics_dict = result.get("training_metrics", result.get("evaluation_metrics", {}))
+                         self.experiment_tracker.log_metrics(metrics_dict)
+
+                 # Checkpoint if needed
+                 if (i + 1) % self.config.checkpoint_frequency == 0:
+                     self._save_checkpoint(i + 1)
+
+             except Exception as e:
+                 logger.error(f"Step {step.name} failed: {e}")
+                 if self.experiment_tracker:
+                     self.experiment_tracker.end_run(status="FAILED")
+                 raise
+
+         # Log final model to MLflow
+         if self.experiment_tracker and self.model:
+             try:
+                 example_input = pd.DataFrame(
+                     self.artifacts.get("feature_matrix", np.random.randn(5, 100))[:5]
+                 )
+                 self.experiment_tracker.log_model(
+                     self.model,
+                     "recommendation_model",
+                     input_example=example_input
+                 )
+             except Exception as e:
+                 logger.warning(f"Could not log model to MLflow: {e}")
+
+         # End MLflow run
+         if self.experiment_tracker:
+             self.experiment_tracker.end_run(status="FINISHED")
+
+         logger.info("Pipeline execution completed successfully")
+
+         return {
+             "artifacts": self.artifacts,
+             "metrics": self.metrics,
+             "model": self.model
+         }
+
+     def _save_checkpoint(self, step_number: int):
+         """Save pipeline checkpoint"""
+         checkpoint_path = self.config.cache_dir / f"checkpoint_step_{step_number}.pkl"
+
+         checkpoint = {
+             "step_number": step_number,
+             "artifacts": {k: v for k, v in self.artifacts.items()
+                           if not isinstance(v, (torch.nn.Module, type))},
+             "metrics": self.metrics,
+             "timestamp": datetime.now()
+         }
+
+         with open(checkpoint_path, 'wb') as f:
+             pickle.dump(checkpoint, f)
+
+         logger.debug(f"Saved checkpoint at step {step_number}")
+
+     def load_checkpoint(self, checkpoint_path: Path):
+         """Load pipeline checkpoint"""
+         with open(checkpoint_path, 'rb') as f:
+             checkpoint = pickle.load(f)
+
+         self.artifacts.update(checkpoint["artifacts"])
+         self.metrics.update(checkpoint["metrics"])
+
+         logger.info(f"Loaded checkpoint from step {checkpoint['step_number']}")
+
+     def _generate_mock_trading_data(self) -> pd.DataFrame:
+         """Generate mock politician trading data for testing"""
+         np.random.seed(42)
+         n_records = 500
+
+         politicians = ["Nancy Pelosi", "Mitch McConnell", "Chuck Schumer", "Kevin McCarthy"]
+         tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]
+
+         data = []
+         for _ in range(n_records):
+             data.append({
+                 "politician_name_cleaned": np.random.choice(politicians),
+                 "transaction_date_cleaned": pd.Timestamp.now() - pd.Timedelta(days=np.random.randint(1, 365)),
+                 "transaction_amount_cleaned": np.random.uniform(1000, 500000),
+                 "transaction_type_cleaned": np.random.choice(["buy", "sell"]),
+                 "ticker_cleaned": np.random.choice(tickers)
+             })
+
+         return pd.DataFrame(data)
+
+     def _generate_mock_stock_data(self) -> pd.DataFrame:
+         """Generate mock stock price data for testing"""
+         np.random.seed(42)
+         tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]
+         dates = pd.date_range(end=pd.Timestamp.now(), periods=100)
+
+         data = []
+         for ticker in tickers:
+             base_price = np.random.uniform(100, 500)
+             for date in dates:
+                 price = base_price * (1 + np.random.normal(0, 0.02))
+                 data.append({
+                     "symbol": ticker,
+                     "date": date,
+                     "close": price,
+                     "volume": np.random.randint(1000000, 10000000),
+                     "open": price * 0.99,
+                     "high": price * 1.01,
+                     "low": price * 0.98
+                 })
+
+         return pd.DataFrame(data)
+
+
+ class PipelineExecutor:
+     """Execute and manage multiple pipeline runs"""
+
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.pipelines: Dict[str, MLPipeline] = {}
+
+     def create_pipeline(self, name: str) -> MLPipeline:
+         """Create new pipeline instance"""
+         pipeline = MLPipeline(self.config)
+         self.pipelines[name] = pipeline
+         return pipeline
+
+     def run_pipeline(self, name: str, **kwargs) -> Dict[str, Any]:
+         """Run specific pipeline"""
+         if name not in self.pipelines:
+             self.pipelines[name] = MLPipeline(self.config)
+
+         return self.pipelines[name].run(**kwargs)
+
+     def run_experiment(self, n_runs: int = 5,
+                        param_grid: Optional[Dict[str, List]] = None) -> pd.DataFrame:
+         """Run multiple experiments with different parameters"""
+         results = []
+
+         for i in range(n_runs):
+             logger.info(f"Running experiment {i+1}/{n_runs}")
+
+             # Create new pipeline for each run
+             pipeline_name = f"experiment_{i}"
+             pipeline = self.create_pipeline(pipeline_name)
+
+             # Modify parameters if grid provided
+             if param_grid:
+                 # Simple parameter modification (would be more sophisticated in practice)
+                 pass
+
+             # Run pipeline
+             result = pipeline.run()
+
+             # Collect metrics
+             run_metrics = {
+                 "run_id": i,
+                 "pipeline_name": pipeline_name,
+                 **result.get("metrics", {})
+             }
+             results.append(run_metrics)
+
+         return pd.DataFrame(results)
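
For context, here is a minimal sketch of how the classes in this file appear intended to be driven. It is an illustration, not code from the package: the import path follows the wheel layout (mcli/ml/mlops/pipeline_orchestrator.py), but the module itself imports from a top-level ml package via a sys.path hack, so it may not import cleanly outside the package's own directory structure. The step names come from _setup_default_pipeline(); the paths, flags, and run count are assumptions.

# Hypothetical usage sketch for the classes shown in the diff above.
from pathlib import Path

from mcli.ml.mlops.pipeline_orchestrator import (
    MLPipeline,
    PipelineConfig,
    PipelineExecutor,
)

# PipelineConfig.__post_init__ creates these directories; if
# data/politician_trades.csv or data/stock_prices.csv are absent,
# the ingestion step falls back to the mock-data generators.
config = PipelineConfig(data_dir=Path("data"), enable_mlflow=False)

# Full run: ingestion -> preprocessing -> features -> training -> evaluation -> deployment
pipeline = MLPipeline(config)
result = pipeline.run()
print(sorted(result["artifacts"]))  # artifact keys, e.g. feature_matrix, trained_model, ...

# Partial run, bounded by the step names registered in _setup_default_pipeline()
pipeline.run(start_step="extract_features", end_step="train_model")

# Repeated runs through the executor; returns a pandas DataFrame of per-run metrics
executor = PipelineExecutor(config)
summary = executor.run_experiment(n_runs=3)

The contract worth noting is the artifact registry: each PipelineStep.inputs maps a function parameter name to a key in MLPipeline.artifacts, and a step's returned dict is written back under the keys listed in its outputs. That is how the evaluate_model step picks up the test_features and test_labels that _train_model stashes into the registry as a side effect.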