mcli-framework 7.10.0__py3-none-any.whl → 7.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/lib/custom_commands.py +10 -0
- mcli/lib/optional_deps.py +240 -0
- mcli/ml/backtesting/run.py +5 -3
- mcli/ml/models/ensemble_models.py +1 -0
- mcli/ml/models/recommendation_models.py +1 -0
- mcli/ml/optimization/optimize.py +6 -4
- mcli/ml/serving/serve.py +2 -2
- mcli/ml/training/train.py +14 -7
- mcli/self/completion_cmd.py +2 -2
- mcli/workflow/doc_convert.py +82 -112
- mcli/workflow/git_commit/ai_service.py +13 -2
- mcli/workflow/notebook/converter.py +375 -0
- mcli/workflow/notebook/notebook_cmd.py +441 -0
- mcli/workflow/notebook/schema.py +402 -0
- mcli/workflow/notebook/validator.py +313 -0
- mcli/workflow/workflow.py +14 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/METADATA +37 -3
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/RECORD +22 -37
- mcli/ml/features/political_features.py +0 -677
- mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
- mcli/workflow/politician_trading/config.py +0 -134
- mcli/workflow/politician_trading/connectivity.py +0 -492
- mcli/workflow/politician_trading/data_sources.py +0 -654
- mcli/workflow/politician_trading/database.py +0 -412
- mcli/workflow/politician_trading/demo.py +0 -249
- mcli/workflow/politician_trading/models.py +0 -327
- mcli/workflow/politician_trading/monitoring.py +0 -413
- mcli/workflow/politician_trading/scrapers.py +0 -1074
- mcli/workflow/politician_trading/scrapers_california.py +0 -434
- mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
- mcli/workflow/politician_trading/scrapers_eu.py +0 -376
- mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
- mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
- mcli/workflow/politician_trading/scrapers_uk.py +0 -378
- mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
- mcli/workflow/politician_trading/seed_database.py +0 -520
- mcli/workflow/politician_trading/supabase_functions.py +0 -354
- mcli/workflow/politician_trading/workflow.py +0 -879
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/top_level.txt +0 -0
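The new mcli/lib/optional_deps.py (+240 lines) lands alongside the removal of the politician_trading stack, which points at heavy ML dependencies being imported lazily rather than unconditionally. The helper below is only a hypothetical sketch of that pattern; the actual names inside optional_deps.py are not visible in this diff, and the "ml" extra is an assumption.

```python
# Hypothetical sketch only: the real helpers in mcli/lib/optional_deps.py are not
# shown in this diff, so require_optional() and the "ml" extra name are assumptions.
import importlib
from typing import Any, Optional


def require_optional(module_name: str, extra: Optional[str] = None) -> Any:
    """Import an optional dependency, failing with an actionable message if absent."""
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        hint = f" (try: pip install 'mcli-framework[{extra}]')" if extra else ""
        raise ImportError(
            f"Optional dependency '{module_name}' is required for this command{hint}."
        ) from exc


# Example: defer the pandas import until an ML subcommand actually runs.
# pd = require_optional("pandas", extra="ml")
```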
mcli/ml/preprocessing/politician_trading_preprocessor.py
@@ -1,570 +0,0 @@
-"""Main preprocessor for politician trading data"""
-
-import logging
-from dataclasses import asdict, dataclass
-from datetime import datetime, timedelta
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import joblib
-import numpy as np
-import pandas as pd
-
-from .data_cleaners import CleaningStats, MissingValueHandler, OutlierDetector, TradingDataCleaner
-from .feature_extractors import (
-    FeatureExtractionStats,
-    MarketFeatureExtractor,
-    PoliticianFeatureExtractor,
-    SentimentFeatureExtractor,
-    TemporalFeatureExtractor,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PreprocessingConfig:
-    """Configuration for preprocessing pipeline"""
-
-    # Data cleaning
-    enable_data_cleaning: bool = True
-    enable_outlier_detection: bool = True
-    enable_missing_value_handling: bool = True
-    outlier_action: str = "flag"  # "flag", "remove", or "cap"
-
-    # Feature extraction
-    enable_politician_features: bool = True
-    enable_market_features: bool = True
-    enable_temporal_features: bool = True
-    enable_sentiment_features: bool = True
-
-    # Temporal settings
-    lookback_periods: List[int] = None
-    include_future_leakage: bool = False
-
-    # Data splitting
-    train_split_ratio: float = 0.7
-    val_split_ratio: float = 0.15
-    test_split_ratio: float = 0.15
-    split_by_time: bool = True
-
-    # Output settings
-    save_preprocessing_artifacts: bool = True
-    artifacts_dir: Optional[Path] = None
-
-    def __post_init__(self):
-        if self.lookback_periods is None:
-            self.lookback_periods = [7, 30, 90, 365]
-
-        if self.artifacts_dir is None:
-            self.artifacts_dir = Path("./data/preprocessing_artifacts")
-
-        # Validate split ratios
-        total_ratio = self.train_split_ratio + self.val_split_ratio + self.test_split_ratio
-        if abs(total_ratio - 1.0) > 0.001:
-            raise ValueError(f"Split ratios must sum to 1.0, got {total_ratio}")
-
-
-@dataclass
-class PreprocessingResults:
-    """Results from preprocessing pipeline"""
-
-    # Processed data
-    train_data: pd.DataFrame
-    val_data: pd.DataFrame
-    test_data: pd.DataFrame
-
-    # Feature information
-    feature_names: List[str]
-    categorical_features: List[str]
-    numerical_features: List[str]
-    target_columns: List[str]
-
-    # Statistics
-    cleaning_stats: CleaningStats
-    original_shape: Tuple[int, int]
-    final_shape: Tuple[int, int]
-    feature_count: int
-
-    # Artifacts paths
-    scaler_path: Optional[Path] = None
-    encoder_path: Optional[Path] = None
-    feature_metadata_path: Optional[Path] = None
-
-
-class PoliticianTradingPreprocessor:
-    """Main preprocessor for politician trading data for ML models"""
-
-    def __init__(self, config: Optional[PreprocessingConfig] = None):
-        self.config = config or PreprocessingConfig()
-
-        # Initialize components
-        self.data_cleaner = TradingDataCleaner()
-        self.outlier_detector = OutlierDetector()
-        self.missing_value_handler = MissingValueHandler()
-
-        self.politician_extractor = PoliticianFeatureExtractor()
-        self.market_extractor = MarketFeatureExtractor()
-        self.temporal_extractor = TemporalFeatureExtractor(
-            config={"lookback_periods": self.config.lookback_periods}
-        )
-        self.sentiment_extractor = SentimentFeatureExtractor()
-
-        # Preprocessing artifacts
-        self.scaler = None
-        self.categorical_encoder = None
-        self.feature_metadata = {}
-
-        # Create artifacts directory
-        self.config.artifacts_dir.mkdir(parents=True, exist_ok=True)
-
-    def preprocess(
-        self, raw_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> PreprocessingResults:
-        """Main preprocessing pipeline"""
-        logger.info("Starting politician trading data preprocessing")
-
-        # Convert to DataFrame if needed
-        if isinstance(raw_data, list):
-            df = pd.DataFrame(raw_data)
-        else:
-            df = raw_data.copy()
-
-        original_shape = df.shape
-        logger.info(f"Input data shape: {original_shape}")
-
-        # Step 1: Data Cleaning
-        if self.config.enable_data_cleaning:
-            df, cleaning_stats = self._clean_data(df)
-            logger.info(f"After cleaning: {df.shape}")
-        else:
-            cleaning_stats = CleaningStats(
-                total_records=len(df),
-                cleaned_records=len(df),
-                removed_records=0,
-                cleaning_operations={},
-                outliers_detected=0,
-                missing_values_filled=0,
-            )
-
-        # Step 2: Feature Extraction
-        df = self._extract_features(df)
-        logger.info(f"After feature extraction: {df.shape}")
-
-        # Step 3: Handle outliers
-        if self.config.enable_outlier_detection:
-            df = self._handle_outliers(df)
-            logger.info(f"After outlier handling: {df.shape}")
-
-        # Step 4: Handle missing values
-        if self.config.enable_missing_value_handling:
-            df = self._handle_missing_values(df)
-            logger.info(f"After missing value handling: {df.shape}")
-
-        # Step 5: Feature engineering and encoding
-        df = self._engineer_features(df)
-        logger.info(f"After feature engineering: {df.shape}")
-
-        # Step 6: Create target variables
-        df = self._create_target_variables(df)
-        logger.info(f"After target creation: {df.shape}")
-
-        # Step 7: Split data
-        train_data, val_data, test_data = self._split_data(df)
-
-        # Step 8: Scale features
-        train_data, val_data, test_data = self._scale_features(train_data, val_data, test_data)
-
-        # Step 9: Save artifacts
-        if self.config.save_preprocessing_artifacts:
-            self._save_artifacts()
-
-        # Prepare results
-        feature_names = [col for col in df.columns if not col.startswith("target_")]
-        categorical_features = self._identify_categorical_features(df)
-        numerical_features = self._identify_numerical_features(df)
-        target_columns = [col for col in df.columns if col.startswith("target_")]
-
-        results = PreprocessingResults(
-            train_data=train_data,
-            val_data=val_data,
-            test_data=test_data,
-            feature_names=feature_names,
-            categorical_features=categorical_features,
-            numerical_features=numerical_features,
-            target_columns=target_columns,
-            cleaning_stats=cleaning_stats,
-            original_shape=original_shape,
-            final_shape=df.shape,
-            feature_count=len(feature_names),
-            scaler_path=self.config.artifacts_dir / "scaler.joblib",
-            encoder_path=self.config.artifacts_dir / "encoder.joblib",
-            feature_metadata_path=self.config.artifacts_dir / "feature_metadata.joblib",
-        )
-
-        logger.info(f"Preprocessing complete. Final shape: {df.shape}")
-        logger.info(f"Features: {len(feature_names)}, Targets: {len(target_columns)}")
-
-        return results
-
-    def _clean_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningStats]:
-        """Clean the raw data"""
-        logger.info("Cleaning data")
-
-        # Convert to list of records for cleaner
-        records = df.to_dict("records")
-        cleaned_records, cleaning_stats = self.data_cleaner.clean_trading_records(records)
-
-        # Convert back to DataFrame
-        cleaned_df = pd.DataFrame(cleaned_records)
-
-        return cleaned_df, cleaning_stats
-
-    def _extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract all features"""
-        logger.info("Extracting features")
-
-        if self.config.enable_politician_features:
-            df = self.politician_extractor.extract_politician_features(df)
-            logger.info("Politician features extracted")
-
-        if self.config.enable_market_features:
-            df = self.market_extractor.extract_market_features(df)
-            logger.info("Market features extracted")
-
-        if self.config.enable_temporal_features:
-            df = self.temporal_extractor.extract_temporal_features(df)
-            logger.info("Temporal features extracted")
-
-        if self.config.enable_sentiment_features:
-            df = self.sentiment_extractor.extract_sentiment_features(df)
-            logger.info("Sentiment features extracted")
-
-        return df
-
-    def _handle_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle outliers in the data"""
-        logger.info("Handling outliers")
-
-        df_with_outliers, outlier_info = self.outlier_detector.detect_outliers(df)
-
-        if self.config.outlier_action == "remove":
-            df_clean = df_with_outliers[~df_with_outliers["is_outlier"]]
-            logger.info(f"Removed {outlier_info['total_outliers']} outliers")
-        elif self.config.outlier_action == "flag":
-            df_clean = df_with_outliers
-            logger.info(f"Flagged {outlier_info['total_outliers']} outliers")
-        else:  # cap
-            df_clean = self._cap_outliers(df_with_outliers)
-            logger.info(f"Capped {outlier_info['total_outliers']} outliers")
-
-        return df_clean
-
-    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle missing values"""
-        logger.info("Handling missing values")
-
-        df_clean, missing_info = self.missing_value_handler.handle_missing_values(df)
-        logger.info(f"Handled missing values: {missing_info['final_missing_counts']}")
-
-        return df_clean
-
-    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Engineer additional features"""
-        logger.info("Engineering features")
-
-        # Transaction amount buckets
-        if "transaction_amount_cleaned" in df.columns:
-            df["amount_bucket"] = pd.cut(
-                df["transaction_amount_cleaned"],
-                bins=[0, 1000, 15000, 50000, 500000, float("inf")],
-                labels=["micro", "small", "medium", "large", "mega"],
-            )
-
-        # Politician activity level
-        if "total_transactions" in df.columns:
-            df["politician_activity_level"] = pd.cut(
-                df["total_transactions"],
-                bins=[0, 5, 20, 50, float("inf")],
-                labels=["low", "medium", "high", "very_high"],
-            )
-
-        # Market timing features
-        if "transaction_date_dt" in df.columns:
-            # Days since start of data
-            min_date = df["transaction_date_dt"].min()
-            df["days_since_start"] = (df["transaction_date_dt"] - min_date).dt.days
-
-            # Market cycle approximation (simplified)
-            df["market_cycle_phase"] = (df["days_since_start"] % 1460) / 1460  # 4-year cycle
-
-        # Interaction features
-        if all(col in df.columns for col in ["buy_ratio", "total_transactions"]):
-            df["buy_volume_interaction"] = df["buy_ratio"] * df["total_transactions"]
-
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            df["amount_frequency_interaction"] = (
-                df["transaction_amount_cleaned"] * df["politician_trading_frequency"]
-            )
-
-        return df
-
-    def _create_target_variables(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Create target variables for ML models"""
-        logger.info("Creating target variables")
-
-        # Sort by politician and date for future stock performance calculation
-        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])
-
-        # Target 1: Stock performance after politician trade (simplified)
-        # This would typically require external market data
-        # For now, create synthetic targets based on transaction patterns
-
-        # Target: Whether the trade was profitable (binary classification)
-        # Assumption: Larger transactions from frequent traders are more likely profitable
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            # Probability based on amount and frequency
-            amount_score = np.log1p(df["transaction_amount_cleaned"]) / 10
-            frequency_score = np.log1p(df["politician_trading_frequency"]) / 5
-
-            profit_probability = (amount_score + frequency_score) / 2
-            profit_probability = np.clip(profit_probability, 0.1, 0.9)
-
-            # Binary target with some randomness
-            np.random.seed(42)  # For reproducibility
-            df["target_profitable"] = np.random.binomial(1, profit_probability)
-
-        # Target 2: Stock recommendation score (regression)
-        # Based on politician patterns and market factors
-        if "transaction_type_cleaned" in df.columns:
-            base_score = 0.5  # Neutral
-
-            # Adjust based on transaction type
-            type_adjustment = (
-                df["transaction_type_cleaned"]
-                .map({"buy": 0.2, "sell": -0.2, "exchange": 0.0})
-                .fillna(0)
-            )
-
-            # Adjust based on politician track record
-            if "buy_ratio" in df.columns:
-                track_record_adjustment = (df["buy_ratio"] - 0.5) * 0.3
-
-            # Adjust based on timing
-            if "is_end_of_quarter" in df.columns:
-                timing_adjustment = df["is_end_of_quarter"].astype(int) * 0.1
-
-            recommendation_score = (
-                base_score + type_adjustment + track_record_adjustment + timing_adjustment
-            )
-            df["target_recommendation_score"] = np.clip(recommendation_score, 0, 1)
-
-        # Target 3: Risk level (multi-class classification)
-        if "transaction_volatility" in df.columns:
-            risk_conditions = [
-                (df["transaction_volatility"] <= 0.2),
-                (df["transaction_volatility"] <= 0.5),
-                (df["transaction_volatility"] <= 1.0),
-                (df["transaction_volatility"] > 1.0),
-            ]
-            risk_choices = ["low", "medium", "high", "very_high"]
-            df["target_risk_level"] = np.select(risk_conditions, risk_choices, default="medium")
-
-        return df
-
-    def _split_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Split data into train/val/test sets"""
-        logger.info("Splitting data")
-
-        if self.config.split_by_time and "transaction_date_dt" in df.columns:
-            # Time-based split
-            df_sorted = df.sort_values("transaction_date_dt")
-
-            train_size = int(len(df_sorted) * self.config.train_split_ratio)
-            val_size = int(len(df_sorted) * self.config.val_split_ratio)
-
-            train_data = df_sorted.iloc[:train_size]
-            val_data = df_sorted.iloc[train_size : train_size + val_size]
-            test_data = df_sorted.iloc[train_size + val_size :]
-
-        else:
-            # Random split
-            df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
-
-            train_size = int(len(df_shuffled) * self.config.train_split_ratio)
-            val_size = int(len(df_shuffled) * self.config.val_split_ratio)
-
-            train_data = df_shuffled.iloc[:train_size]
-            val_data = df_shuffled.iloc[train_size : train_size + val_size]
-            test_data = df_shuffled.iloc[train_size + val_size :]
-
-        logger.info(
-            f"Split sizes - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}"
-        )
-
-        return train_data, val_data, test_data
-
-    def _scale_features(
-        self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame
-    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Scale numerical features"""
-        logger.info("Scaling features")
-
-        from sklearn.preprocessing import LabelEncoder, StandardScaler
-
-        numerical_features = self._identify_numerical_features(train_data)
-        categorical_features = self._identify_categorical_features(train_data)
-
-        # Fit scaler on training data
-        self.scaler = StandardScaler()
-        if numerical_features:
-            train_scaled = train_data.copy()
-            val_scaled = val_data.copy()
-            test_scaled = test_data.copy()
-
-            train_scaled[numerical_features] = self.scaler.fit_transform(
-                train_data[numerical_features]
-            )
-            val_scaled[numerical_features] = self.scaler.transform(val_data[numerical_features])
-            test_scaled[numerical_features] = self.scaler.transform(test_data[numerical_features])
-        else:
-            train_scaled, val_scaled, test_scaled = train_data, val_data, test_data
-
-        # Encode categorical features
-        self.categorical_encoder = {}
-        if categorical_features:
-            for feature in categorical_features:
-                encoder = LabelEncoder()
-                # Fit on combined data to handle unseen categories
-                all_values = pd.concat(
-                    [train_scaled[feature], val_scaled[feature], test_scaled[feature]]
-                ).astype(str)
-
-                encoder.fit(all_values)
-                self.categorical_encoder[feature] = encoder
-
-                train_scaled[feature] = encoder.transform(train_scaled[feature].astype(str))
-                val_scaled[feature] = encoder.transform(val_scaled[feature].astype(str))
-                test_scaled[feature] = encoder.transform(test_scaled[feature].astype(str))
-
-        return train_scaled, val_scaled, test_scaled
-
-    def _identify_numerical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify numerical features"""
-        numerical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype in ["int64", "float64"]
-                and not col.startswith("target_")
-                and not col.endswith("_cleaned")
-                and col not in ["is_outlier"]
-            ):
-                numerical_features.append(col)
-        return numerical_features
-
-    def _identify_categorical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify categorical features"""
-        categorical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype == "object"
-                or df[col].dtype.name == "category"
-                and not col.startswith("target_")
-            ):
-                categorical_features.append(col)
-        return categorical_features
-
-    def _cap_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Cap outliers to percentile values"""
-        df_capped = df.copy()
-        numerical_cols = self._identify_numerical_features(df)
-
-        for col in numerical_cols:
-            if col in df_capped.columns:
-                q1 = df_capped[col].quantile(0.01)
-                q99 = df_capped[col].quantile(0.99)
-                df_capped[col] = np.clip(df_capped[col], q1, q99)
-
-        return df_capped
-
-    def _save_artifacts(self):
-        """Save preprocessing artifacts"""
-        logger.info("Saving preprocessing artifacts")
-
-        if self.scaler:
-            joblib.dump(self.scaler, self.config.artifacts_dir / "scaler.joblib")
-
-        if self.categorical_encoder:
-            joblib.dump(self.categorical_encoder, self.config.artifacts_dir / "encoder.joblib")
-
-        # Save feature metadata
-        self.feature_metadata = {
-            "config": asdict(self.config),
-            "preprocessing_timestamp": datetime.now().isoformat(),
-        }
-        joblib.dump(self.feature_metadata, self.config.artifacts_dir / "feature_metadata.joblib")
-
-    def load_artifacts(self, artifacts_dir: Path):
-        """Load preprocessing artifacts"""
-        logger.info(f"Loading preprocessing artifacts from {artifacts_dir}")
-
-        scaler_path = artifacts_dir / "scaler.joblib"
-        if scaler_path.exists():
-            self.scaler = joblib.load(scaler_path)
-
-        encoder_path = artifacts_dir / "encoder.joblib"
-        if encoder_path.exists():
-            self.categorical_encoder = joblib.load(encoder_path)
-
-        metadata_path = artifacts_dir / "feature_metadata.joblib"
-        if metadata_path.exists():
-            self.feature_metadata = joblib.load(metadata_path)
-
-    def transform_new_data(
-        self, new_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Transform new data using fitted preprocessors"""
-        logger.info("Transforming new data with fitted preprocessors")
-
-        if self.scaler is None and self.categorical_encoder is None:
-            raise ValueError("No preprocessing artifacts loaded. Call load_artifacts() first.")
-
-        # Convert to DataFrame if needed
-        if isinstance(new_data, list):
-            df = pd.DataFrame(new_data)
-        else:
-            df = new_data.copy()
-
-        # Apply same preprocessing steps (without fitting)
-        if self.config.enable_data_cleaning:
-            records = df.to_dict("records")
-            cleaned_records, _ = self.data_cleaner.clean_trading_records(records)
-            df = pd.DataFrame(cleaned_records)
-
-        # Extract features
-        df = self._extract_features(df)
-
-        # Engineer features
-        df = self._engineer_features(df)
-
-        # Apply scaling and encoding
-        numerical_features = self._identify_numerical_features(df)
-        categorical_features = self._identify_categorical_features(df)
-
-        if self.scaler and numerical_features:
-            df[numerical_features] = self.scaler.transform(df[numerical_features])
-
-        if self.categorical_encoder and categorical_features:
-            for feature in categorical_features:
-                if feature in self.categorical_encoder:
-                    df[feature] = self.categorical_encoder[feature].transform(
-                        df[feature].astype(str)
-                    )
-
-        return df
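For reference, the public entry point of the removed module was PoliticianTradingPreprocessor.preprocess(), which ran the nine steps above (clean, extract, outliers, missing values, engineer, targets, split, scale, save). A minimal driver against the 7.10.0 wheel might have looked like the sketch below; the raw column names are assumptions, and the import no longer resolves in 7.10.2.

```python
# Sketch only: valid against the 7.10.0 wheel; this module is deleted in 7.10.2.
# The raw column names below are assumptions -- the cleaner derives the *_cleaned
# columns (transaction_amount_cleaned, transaction_type_cleaned, ...) used above.
import pandas as pd

from mcli.ml.preprocessing.politician_trading_preprocessor import (
    PoliticianTradingPreprocessor,
    PreprocessingConfig,
)

config = PreprocessingConfig(outlier_action="cap", split_by_time=True)
preprocessor = PoliticianTradingPreprocessor(config)

raw = pd.DataFrame(
    [
        {"politician_name": "Jane Doe", "transaction_amount": 15000, "transaction_type": "buy"},
    ]
)

results = preprocessor.preprocess(raw)  # returns a PreprocessingResults dataclass
print(results.feature_count, results.target_columns)
print(results.train_data.shape, results.val_data.shape, results.test_data.shape)

# Later, new disclosures can be transformed with the fitted artifacts:
# preprocessor.load_artifacts(config.artifacts_dir)
# features = preprocessor.transform_new_data(raw)
```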
mcli/workflow/politician_trading/config.py
@@ -1,134 +0,0 @@
-"""
-Configuration for politician trading data workflow
-"""
-
-import os
-from dataclasses import dataclass
-from typing import Optional
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-@dataclass
-class SupabaseConfig:
-    """Supabase database configuration"""
-
-    url: str
-    key: str
-    service_role_key: Optional[str] = None
-
-    @classmethod
-    def from_env(cls) -> "SupabaseConfig":
-        """Load configuration from environment or use provided values"""
-        # Your provided Supabase details
-        url = os.getenv("SUPABASE_URL", "https://uljsqvwkomdrlnofmlad.supabase.co")
-        key = os.getenv(
-            "SUPABASE_ANON_KEY",
-            "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InVsanNxdndrb21kcmxub2ZtbGFkIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTY4MDIyNDQsImV4cCI6MjA3MjM3ODI0NH0.QCpfcEpxGX_5Wn8ljf_J2KWjJLGdF8zRsV_7OatxmHI",
-        )
-        service_role_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
-
-        return cls(url=url, key=key, service_role_key=service_role_key)
-
-
-@dataclass
-class ScrapingConfig:
-    """Web scraping configuration with comprehensive data sources"""
-
-    # Rate limiting
-    request_delay: float = 1.0  # seconds between requests
-    max_retries: int = 3
-    timeout: int = 30
-
-    # User agent for requests
-    user_agent: str = "Mozilla/5.0 (compatible; MCLI-PoliticianTracker/1.0)"
-
-    # Enable/disable source categories
-    enable_us_federal: bool = True
-    enable_us_states: bool = True
-    enable_eu_parliament: bool = True
-    enable_eu_national: bool = True
-    enable_third_party: bool = True
-
-    # Legacy properties for backward compatibility
-    us_congress_sources: list = None
-    eu_sources: list = None
-
-    def __post_init__(self):
-        # Maintain backward compatibility
-        if self.us_congress_sources is None:
-            self.us_congress_sources = [
-                "https://disclosures-clerk.house.gov/FinancialDisclosure",
-                "https://efd.senate.gov",
-                "https://api.quiverquant.com/beta/live/congresstrading",
-            ]
-
-        if self.eu_sources is None:
-            self.eu_sources = [
-                "https://www.europarl.europa.eu/meps/en/declarations",
-            ]
-
-    def get_active_sources(self):
-        """Get all active data sources based on configuration"""
-        from .data_sources import ALL_DATA_SOURCES
-
-        active_sources = []
-
-        if self.enable_us_federal:
-            active_sources.extend(ALL_DATA_SOURCES["us_federal"])
-
-        if self.enable_us_states:
-            active_sources.extend(ALL_DATA_SOURCES["us_states"])
-
-        if self.enable_eu_parliament:
-            active_sources.extend(ALL_DATA_SOURCES["eu_parliament"])
-
-        if self.enable_eu_national:
-            active_sources.extend(ALL_DATA_SOURCES["eu_national"])
-
-        if self.enable_third_party:
-            active_sources.extend(ALL_DATA_SOURCES["third_party"])
-
-        # Filter to only active status sources
-        return [source for source in active_sources if source.status == "active"]
-
-
-@dataclass
-class WorkflowConfig:
-    """Overall workflow configuration"""
-
-    supabase: SupabaseConfig
-    scraping: ScrapingConfig
-
-    # Cron schedule (for reference, actual scheduling done in Supabase)
-    cron_schedule: str = "0 */6 * * *"  # Every 6 hours
-
-    # Data retention
-    retention_days: int = 365  # Keep data for 1 year
-
-    @classmethod
-    def default(cls) -> "WorkflowConfig":
-        """Create default configuration"""
-        return cls(supabase=SupabaseConfig.from_env(), scraping=ScrapingConfig())
-
-    def to_serializable_dict(self) -> dict:
-        """Convert to a JSON-serializable dictionary"""
-        return {
-            "supabase": {
-                "url": self.supabase.url,
-                "has_service_key": bool(self.supabase.service_role_key),
-                # Don't include actual keys for security
-            },
-            "scraping": {
-                "request_delay": self.scraping.request_delay,
-                "max_retries": self.scraping.max_retries,
-                "timeout": self.scraping.timeout,
-                "user_agent": self.scraping.user_agent,
-                "us_congress_sources": self.scraping.us_congress_sources,
-                "eu_sources": self.scraping.eu_sources,
-            },
-            "cron_schedule": self.cron_schedule,
-            "retention_days": self.retention_days,
-        }
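The removed config module wired SupabaseConfig, ScrapingConfig, and WorkflowConfig together. A short sketch of how it was consumed under 7.10.0 follows; it does not import under 7.10.2, where the politician_trading package is gone from the wheel.

```python
# Sketch based on the removed config.py above; it imports under 7.10.0 only,
# since mcli.workflow.politician_trading is dropped from the 7.10.2 wheel.
import json

from mcli.workflow.politician_trading.config import WorkflowConfig

config = WorkflowConfig.default()               # SupabaseConfig.from_env() + ScrapingConfig()
sources = config.scraping.get_active_sources()  # only sources whose status == "active"

# to_serializable_dict() deliberately omits the Supabase keys.
print(json.dumps(config.to_serializable_dict(), indent=2))
print(f"{len(sources)} active data sources, schedule {config.cron_schedule}")
```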