mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,569 @@
"""Main preprocessor for politician trading data"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, asdict
import logging
from pathlib import Path
import joblib

from .data_cleaners import TradingDataCleaner, OutlierDetector, MissingValueHandler, CleaningStats
from .feature_extractors import (
    PoliticianFeatureExtractor,
    MarketFeatureExtractor,
    TemporalFeatureExtractor,
    SentimentFeatureExtractor,
    FeatureExtractionStats,
)

logger = logging.getLogger(__name__)


@dataclass
class PreprocessingConfig:
    """Configuration for preprocessing pipeline"""

    # Data cleaning
    enable_data_cleaning: bool = True
    enable_outlier_detection: bool = True
    enable_missing_value_handling: bool = True
    outlier_action: str = "flag"  # "flag", "remove", or "cap"

    # Feature extraction
    enable_politician_features: bool = True
    enable_market_features: bool = True
    enable_temporal_features: bool = True
    enable_sentiment_features: bool = True

    # Temporal settings
    lookback_periods: List[int] = None
    include_future_leakage: bool = False

    # Data splitting
    train_split_ratio: float = 0.7
    val_split_ratio: float = 0.15
    test_split_ratio: float = 0.15
    split_by_time: bool = True

    # Output settings
    save_preprocessing_artifacts: bool = True
    artifacts_dir: Optional[Path] = None

    def __post_init__(self):
        if self.lookback_periods is None:
            self.lookback_periods = [7, 30, 90, 365]

        if self.artifacts_dir is None:
            self.artifacts_dir = Path("./data/preprocessing_artifacts")

        # Validate split ratios
        total_ratio = self.train_split_ratio + self.val_split_ratio + self.test_split_ratio
        if abs(total_ratio - 1.0) > 0.001:
            raise ValueError(f"Split ratios must sum to 1.0, got {total_ratio}")

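A minimal usage sketch of the config, with illustrative values, showing the defaults that __post_init__ fills in and the split-ratio check it enforces:

# Illustrative values; ratios that do not sum to 1.0 raise ValueError
# in __post_init__.
config = PreprocessingConfig(
    outlier_action="cap",
    train_split_ratio=0.8,
    val_split_ratio=0.1,
    test_split_ratio=0.1,
)
assert config.lookback_periods == [7, 30, 90, 365]  # default filled in
assert config.artifacts_dir == Path("./data/preprocessing_artifacts")
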
@dataclass
class PreprocessingResults:
    """Results from preprocessing pipeline"""

    # Processed data
    train_data: pd.DataFrame
    val_data: pd.DataFrame
    test_data: pd.DataFrame

    # Feature information
    feature_names: List[str]
    categorical_features: List[str]
    numerical_features: List[str]
    target_columns: List[str]

    # Statistics
    cleaning_stats: CleaningStats
    original_shape: Tuple[int, int]
    final_shape: Tuple[int, int]
    feature_count: int

    # Artifacts paths
    scaler_path: Optional[Path] = None
    encoder_path: Optional[Path] = None
    feature_metadata_path: Optional[Path] = None


class PoliticianTradingPreprocessor:
    """Main preprocessor for politician trading data for ML models"""

    def __init__(self, config: Optional[PreprocessingConfig] = None):
        self.config = config or PreprocessingConfig()

        # Initialize components
        self.data_cleaner = TradingDataCleaner()
        self.outlier_detector = OutlierDetector()
        self.missing_value_handler = MissingValueHandler()

        self.politician_extractor = PoliticianFeatureExtractor()
        self.market_extractor = MarketFeatureExtractor()
        self.temporal_extractor = TemporalFeatureExtractor(
            config={"lookback_periods": self.config.lookback_periods}
        )
        self.sentiment_extractor = SentimentFeatureExtractor()

        # Preprocessing artifacts
        self.scaler = None
        self.categorical_encoder = None
        self.feature_metadata = {}

        # Create artifacts directory
        self.config.artifacts_dir.mkdir(parents=True, exist_ok=True)

    def preprocess(
        self, raw_data: Union[List[Dict[str, Any]], pd.DataFrame]
    ) -> PreprocessingResults:
        """Main preprocessing pipeline"""
        logger.info("Starting politician trading data preprocessing")

        # Convert to DataFrame if needed
        if isinstance(raw_data, list):
            df = pd.DataFrame(raw_data)
        else:
            df = raw_data.copy()

        original_shape = df.shape
        logger.info(f"Input data shape: {original_shape}")

        # Step 1: Data cleaning
        if self.config.enable_data_cleaning:
            df, cleaning_stats = self._clean_data(df)
            logger.info(f"After cleaning: {df.shape}")
        else:
            cleaning_stats = CleaningStats(
                total_records=len(df),
                cleaned_records=len(df),
                removed_records=0,
                cleaning_operations={},
                outliers_detected=0,
                missing_values_filled=0,
            )

        # Step 2: Feature extraction
        df = self._extract_features(df)
        logger.info(f"After feature extraction: {df.shape}")

        # Step 3: Handle outliers
        if self.config.enable_outlier_detection:
            df = self._handle_outliers(df)
            logger.info(f"After outlier handling: {df.shape}")

        # Step 4: Handle missing values
        if self.config.enable_missing_value_handling:
            df = self._handle_missing_values(df)
            logger.info(f"After missing value handling: {df.shape}")

        # Step 5: Feature engineering and encoding
        df = self._engineer_features(df)
        logger.info(f"After feature engineering: {df.shape}")

        # Step 6: Create target variables
        df = self._create_target_variables(df)
        logger.info(f"After target creation: {df.shape}")

        # Step 7: Split data
        train_data, val_data, test_data = self._split_data(df)

        # Step 8: Scale features
        train_data, val_data, test_data = self._scale_features(train_data, val_data, test_data)

        # Step 9: Save artifacts
        if self.config.save_preprocessing_artifacts:
            self._save_artifacts()

        # Prepare results
        feature_names = [col for col in df.columns if not col.startswith("target_")]
        categorical_features = self._identify_categorical_features(df)
        numerical_features = self._identify_numerical_features(df)
        target_columns = [col for col in df.columns if col.startswith("target_")]

        results = PreprocessingResults(
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            feature_names=feature_names,
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            target_columns=target_columns,
            cleaning_stats=cleaning_stats,
            original_shape=original_shape,
            final_shape=df.shape,
            feature_count=len(feature_names),
            scaler_path=self.config.artifacts_dir / "scaler.joblib",
            encoder_path=self.config.artifacts_dir / "encoder.joblib",
            feature_metadata_path=self.config.artifacts_dir / "feature_metadata.joblib",
        )

        logger.info(f"Preprocessing complete. Final shape: {df.shape}")
        logger.info(f"Features: {len(feature_names)}, Targets: {len(target_columns)}")

        return results

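For orientation, a hedged end-to-end sketch of driving preprocess; the record fields below are assumptions about the scraper output, not a documented schema:

# Hypothetical end-to-end run; field names are illustrative.
raw_records = [
    {
        "politician_name": "Jane Doe",
        "transaction_date": "2024-01-15",
        "transaction_type": "buy",
        "transaction_amount": "$15,001 - $50,000",
        "ticker": "ACME",
    },
    # ... more records
]

preprocessor = PoliticianTradingPreprocessor(PreprocessingConfig(outlier_action="flag"))
results = preprocessor.preprocess(raw_records)

print(results.final_shape, results.feature_count)
print(results.target_columns)  # e.g. ["target_profitable", ...]
X_train = results.train_data[results.feature_names]
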
    def _clean_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningStats]:
        """Clean the raw data"""
        logger.info("Cleaning data")

        # Convert to list of records for the cleaner
        records = df.to_dict("records")
        cleaned_records, cleaning_stats = self.data_cleaner.clean_trading_records(records)

        # Convert back to DataFrame
        cleaned_df = pd.DataFrame(cleaned_records)

        return cleaned_df, cleaning_stats

    def _extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract all features"""
        logger.info("Extracting features")

        if self.config.enable_politician_features:
            df = self.politician_extractor.extract_politician_features(df)
            logger.info("Politician features extracted")

        if self.config.enable_market_features:
            df = self.market_extractor.extract_market_features(df)
            logger.info("Market features extracted")

        if self.config.enable_temporal_features:
            df = self.temporal_extractor.extract_temporal_features(df)
            logger.info("Temporal features extracted")

        if self.config.enable_sentiment_features:
            df = self.sentiment_extractor.extract_sentiment_features(df)
            logger.info("Sentiment features extracted")

        return df

    def _handle_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Handle outliers in the data"""
        logger.info("Handling outliers")

        df_with_outliers, outlier_info = self.outlier_detector.detect_outliers(df)

        if self.config.outlier_action == "remove":
            df_clean = df_with_outliers[~df_with_outliers["is_outlier"]]
            logger.info(f"Removed {outlier_info['total_outliers']} outliers")
        elif self.config.outlier_action == "flag":
            df_clean = df_with_outliers
            logger.info(f"Flagged {outlier_info['total_outliers']} outliers")
        else:  # cap
            df_clean = self._cap_outliers(df_with_outliers)
            logger.info(f"Capped {outlier_info['total_outliers']} outliers")

        return df_clean

    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Handle missing values"""
        logger.info("Handling missing values")

        df_clean, missing_info = self.missing_value_handler.handle_missing_values(df)
        logger.info(f"Handled missing values: {missing_info['final_missing_counts']}")

        return df_clean

    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Engineer additional features"""
        logger.info("Engineering features")

        # Transaction amount buckets
        if "transaction_amount_cleaned" in df.columns:
            df["amount_bucket"] = pd.cut(
                df["transaction_amount_cleaned"],
                bins=[0, 1000, 15000, 50000, 500000, float("inf")],
                labels=["micro", "small", "medium", "large", "mega"],
            )

        # Politician activity level
        if "total_transactions" in df.columns:
            df["politician_activity_level"] = pd.cut(
                df["total_transactions"],
                bins=[0, 5, 20, 50, float("inf")],
                labels=["low", "medium", "high", "very_high"],
            )

        # Market timing features
        if "transaction_date_dt" in df.columns:
            # Days since start of data
            min_date = df["transaction_date_dt"].min()
            df["days_since_start"] = (df["transaction_date_dt"] - min_date).dt.days

            # Market cycle approximation (simplified 4-year cycle)
            df["market_cycle_phase"] = (df["days_since_start"] % 1460) / 1460

        # Interaction features
        if all(col in df.columns for col in ["buy_ratio", "total_transactions"]):
            df["buy_volume_interaction"] = df["buy_ratio"] * df["total_transactions"]

        if all(
            col in df.columns
            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
        ):
            df["amount_frequency_interaction"] = (
                df["transaction_amount_cleaned"] * df["politician_trading_frequency"]
            )

        return df

    def _create_target_variables(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create target variables for ML models"""
        logger.info("Creating target variables")

        # Sort by politician and date for future stock performance calculation
        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])

        # Target 1: Whether the trade was profitable (binary classification).
        # Measuring actual stock performance after a trade would require
        # external market data; for now, create synthetic targets from
        # transaction patterns, assuming larger transactions from frequent
        # traders are more likely profitable.
        if all(
            col in df.columns
            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
        ):
            # Probability based on amount and frequency
            amount_score = np.log1p(df["transaction_amount_cleaned"]) / 10
            frequency_score = np.log1p(df["politician_trading_frequency"]) / 5

            profit_probability = (amount_score + frequency_score) / 2
            profit_probability = np.clip(profit_probability, 0.1, 0.9)

            # Binary target with some randomness
            np.random.seed(42)  # For reproducibility
            df["target_profitable"] = np.random.binomial(1, profit_probability)

        # Target 2: Stock recommendation score (regression), based on
        # politician patterns and market factors
        if "transaction_type_cleaned" in df.columns:
            base_score = 0.5  # Neutral

            # Adjust based on transaction type
            type_adjustment = (
                df["transaction_type_cleaned"]
                .map({"buy": 0.2, "sell": -0.2, "exchange": 0.0})
                .fillna(0)
            )

            # Adjust based on politician track record (default to zero so the
            # sum below is defined even when the column is absent)
            track_record_adjustment = 0.0
            if "buy_ratio" in df.columns:
                track_record_adjustment = (df["buy_ratio"] - 0.5) * 0.3

            # Adjust based on timing (same zero default)
            timing_adjustment = 0.0
            if "is_end_of_quarter" in df.columns:
                timing_adjustment = df["is_end_of_quarter"].astype(int) * 0.1

            recommendation_score = (
                base_score + type_adjustment + track_record_adjustment + timing_adjustment
            )
            df["target_recommendation_score"] = np.clip(recommendation_score, 0, 1)

        # Target 3: Risk level (multi-class classification)
        if "transaction_volatility" in df.columns:
            risk_conditions = [
                (df["transaction_volatility"] <= 0.2),
                (df["transaction_volatility"] <= 0.5),
                (df["transaction_volatility"] <= 1.0),
                (df["transaction_volatility"] > 1.0),
            ]
            risk_choices = ["low", "medium", "high", "very_high"]
            df["target_risk_level"] = np.select(risk_conditions, risk_choices, default="medium")

        return df

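To make the recommendation-score arithmetic concrete: for a "buy" (type adjustment +0.2) by a politician with buy_ratio 0.8 (track-record adjustment (0.8 - 0.5) * 0.3 = 0.09) at quarter end (timing adjustment +0.1), the score is 0.5 + 0.2 + 0.09 + 0.1 = 0.89, which falls inside the [0, 1] clip range.
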
    def _split_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split data into train/val/test sets"""
        logger.info("Splitting data")

        if self.config.split_by_time and "transaction_date_dt" in df.columns:
            # Time-based split
            df_sorted = df.sort_values("transaction_date_dt")

            train_size = int(len(df_sorted) * self.config.train_split_ratio)
            val_size = int(len(df_sorted) * self.config.val_split_ratio)

            train_data = df_sorted.iloc[:train_size]
            val_data = df_sorted.iloc[train_size : train_size + val_size]
            test_data = df_sorted.iloc[train_size + val_size :]

        else:
            # Random split
            df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

            train_size = int(len(df_shuffled) * self.config.train_split_ratio)
            val_size = int(len(df_shuffled) * self.config.val_split_ratio)

            train_data = df_shuffled.iloc[:train_size]
            val_data = df_shuffled.iloc[train_size : train_size + val_size]
            test_data = df_shuffled.iloc[train_size + val_size :]

        logger.info(
            f"Split sizes - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}"
        )

        return train_data, val_data, test_data

    def _scale_features(
        self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Scale numerical features"""
        logger.info("Scaling features")

        from sklearn.preprocessing import StandardScaler, LabelEncoder

        numerical_features = self._identify_numerical_features(train_data)
        categorical_features = self._identify_categorical_features(train_data)

        # Fit scaler on training data only; apply it to val/test
        self.scaler = StandardScaler()
        if numerical_features:
            train_scaled = train_data.copy()
            val_scaled = val_data.copy()
            test_scaled = test_data.copy()

            train_scaled[numerical_features] = self.scaler.fit_transform(
                train_data[numerical_features]
            )
            val_scaled[numerical_features] = self.scaler.transform(val_data[numerical_features])
            test_scaled[numerical_features] = self.scaler.transform(test_data[numerical_features])
        else:
            train_scaled, val_scaled, test_scaled = train_data, val_data, test_data

        # Encode categorical features
        self.categorical_encoder = {}
        if categorical_features:
            for feature in categorical_features:
                encoder = LabelEncoder()
                # Fit on combined data to handle categories absent from train
                all_values = pd.concat(
                    [train_scaled[feature], val_scaled[feature], test_scaled[feature]]
                ).astype(str)

                encoder.fit(all_values)
                self.categorical_encoder[feature] = encoder

                train_scaled[feature] = encoder.transform(train_scaled[feature].astype(str))
                val_scaled[feature] = encoder.transform(val_scaled[feature].astype(str))
                test_scaled[feature] = encoder.transform(test_scaled[feature].astype(str))

        return train_scaled, val_scaled, test_scaled

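Design note: fitting each LabelEncoder on the concatenated train/val/test values keeps all three splits encodable without unseen-label errors, at the cost of letting the encoder see the category vocabulary of val/test. scikit-learn's LabelEncoder.transform still raises ValueError on labels it has never seen, so transform_new_data below can fail on genuinely new categories.
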
    def _identify_numerical_features(self, df: pd.DataFrame) -> List[str]:
        """Identify numerical features"""
        numerical_features = []
        for col in df.columns:
            if (
                df[col].dtype in ["int64", "float64"]
                and not col.startswith("target_")
                and not col.endswith("_cleaned")
                and col not in ["is_outlier"]
            ):
                numerical_features.append(col)
        return numerical_features

    def _identify_categorical_features(self, df: pd.DataFrame) -> List[str]:
        """Identify categorical features"""
        categorical_features = []
        for col in df.columns:
            # Parenthesize the dtype test: "and" binds tighter than "or", so
            # without parentheses, object-dtype target_ columns would slip in.
            if (
                df[col].dtype == "object" or df[col].dtype.name == "category"
            ) and not col.startswith("target_"):
                categorical_features.append(col)
        return categorical_features

    def _cap_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cap outliers to the 1st/99th percentile values"""
        df_capped = df.copy()
        numerical_cols = self._identify_numerical_features(df)

        for col in numerical_cols:
            if col in df_capped.columns:
                q1 = df_capped[col].quantile(0.01)
                q99 = df_capped[col].quantile(0.99)
                df_capped[col] = np.clip(df_capped[col], q1, q99)

        return df_capped

    def _save_artifacts(self):
        """Save preprocessing artifacts"""
        logger.info("Saving preprocessing artifacts")

        if self.scaler:
            joblib.dump(self.scaler, self.config.artifacts_dir / "scaler.joblib")

        if self.categorical_encoder:
            joblib.dump(self.categorical_encoder, self.config.artifacts_dir / "encoder.joblib")

        # Save feature metadata
        self.feature_metadata = {
            "config": asdict(self.config),
            "preprocessing_timestamp": datetime.now().isoformat(),
        }
        joblib.dump(self.feature_metadata, self.config.artifacts_dir / "feature_metadata.joblib")

    def load_artifacts(self, artifacts_dir: Path):
        """Load preprocessing artifacts"""
        logger.info(f"Loading preprocessing artifacts from {artifacts_dir}")

        scaler_path = artifacts_dir / "scaler.joblib"
        if scaler_path.exists():
            self.scaler = joblib.load(scaler_path)

        encoder_path = artifacts_dir / "encoder.joblib"
        if encoder_path.exists():
            self.categorical_encoder = joblib.load(encoder_path)

        metadata_path = artifacts_dir / "feature_metadata.joblib"
        if metadata_path.exists():
            self.feature_metadata = joblib.load(metadata_path)

    def transform_new_data(
        self, new_data: Union[List[Dict[str, Any]], pd.DataFrame]
    ) -> pd.DataFrame:
        """Transform new data using fitted preprocessors"""
        logger.info("Transforming new data with fitted preprocessors")

        if self.scaler is None and self.categorical_encoder is None:
            raise ValueError("No preprocessing artifacts loaded. Call load_artifacts() first.")

        # Convert to DataFrame if needed
        if isinstance(new_data, list):
            df = pd.DataFrame(new_data)
        else:
            df = new_data.copy()

        # Apply the same preprocessing steps (without fitting)
        if self.config.enable_data_cleaning:
            records = df.to_dict("records")
            cleaned_records, _ = self.data_cleaner.clean_trading_records(records)
            df = pd.DataFrame(cleaned_records)

        # Extract features
        df = self._extract_features(df)

        # Engineer features
        df = self._engineer_features(df)

        # Apply scaling and encoding
        numerical_features = self._identify_numerical_features(df)
        categorical_features = self._identify_categorical_features(df)

        if self.scaler and numerical_features:
            df[numerical_features] = self.scaler.transform(df[numerical_features])

        if self.categorical_encoder and categorical_features:
            for feature in categorical_features:
                if feature in self.categorical_encoder:
                    df[feature] = self.categorical_encoder[feature].transform(
                        df[feature].astype(str)
                    )

        return df
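
Finally, a sketch of inference-time reuse, assuming artifacts from a previous preprocess run exist in the default directory; the record fields are again illustrative:

# Hypothetical inference-time flow.
preprocessor = PoliticianTradingPreprocessor()
preprocessor.load_artifacts(Path("./data/preprocessing_artifacts"))

new_trades = [
    {
        "politician_name": "Jane Doe",
        "transaction_date": "2024-06-03",
        "transaction_type": "sell",
        "transaction_amount": "$1,001 - $15,000",
        "ticker": "ACME",
    }
]
features = preprocessor.transform_new_data(new_trades)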