mcli-framework 7.10.0__py3-none-any.whl → 7.10.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Warning: this version of mcli-framework has been flagged as potentially problematic.

Files changed (42)
  1. mcli/lib/custom_commands.py +10 -0
  2. mcli/lib/optional_deps.py +240 -0
  3. mcli/ml/backtesting/run.py +5 -3
  4. mcli/ml/models/ensemble_models.py +1 -0
  5. mcli/ml/models/recommendation_models.py +1 -0
  6. mcli/ml/optimization/optimize.py +6 -4
  7. mcli/ml/serving/serve.py +2 -2
  8. mcli/ml/training/train.py +14 -7
  9. mcli/self/completion_cmd.py +2 -2
  10. mcli/workflow/doc_convert.py +82 -112
  11. mcli/workflow/git_commit/ai_service.py +13 -2
  12. mcli/workflow/notebook/converter.py +375 -0
  13. mcli/workflow/notebook/notebook_cmd.py +441 -0
  14. mcli/workflow/notebook/schema.py +402 -0
  15. mcli/workflow/notebook/validator.py +313 -0
  16. mcli/workflow/workflow.py +14 -0
  17. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/METADATA +37 -3
  18. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/RECORD +22 -37
  19. mcli/ml/features/political_features.py +0 -677
  20. mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
  21. mcli/workflow/politician_trading/config.py +0 -134
  22. mcli/workflow/politician_trading/connectivity.py +0 -492
  23. mcli/workflow/politician_trading/data_sources.py +0 -654
  24. mcli/workflow/politician_trading/database.py +0 -412
  25. mcli/workflow/politician_trading/demo.py +0 -249
  26. mcli/workflow/politician_trading/models.py +0 -327
  27. mcli/workflow/politician_trading/monitoring.py +0 -413
  28. mcli/workflow/politician_trading/scrapers.py +0 -1074
  29. mcli/workflow/politician_trading/scrapers_california.py +0 -434
  30. mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
  31. mcli/workflow/politician_trading/scrapers_eu.py +0 -376
  32. mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
  33. mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
  34. mcli/workflow/politician_trading/scrapers_uk.py +0 -378
  35. mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
  36. mcli/workflow/politician_trading/seed_database.py +0 -520
  37. mcli/workflow/politician_trading/supabase_functions.py +0 -354
  38. mcli/workflow/politician_trading/workflow.py +0 -879
  39. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/WHEEL +0 -0
  40. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/entry_points.txt +0 -0
  41. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/licenses/LICENSE +0 -0
  42. {mcli_framework-7.10.0.dist-info → mcli_framework-7.10.2.dist-info}/top_level.txt +0 -0
mcli/ml/preprocessing/politician_trading_preprocessor.py
@@ -1,570 +0,0 @@
-"""Main preprocessor for politician trading data"""
-
-import logging
-from dataclasses import asdict, dataclass
-from datetime import datetime, timedelta
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import joblib
-import numpy as np
-import pandas as pd
-
-from .data_cleaners import CleaningStats, MissingValueHandler, OutlierDetector, TradingDataCleaner
-from .feature_extractors import (
-    FeatureExtractionStats,
-    MarketFeatureExtractor,
-    PoliticianFeatureExtractor,
-    SentimentFeatureExtractor,
-    TemporalFeatureExtractor,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PreprocessingConfig:
-    """Configuration for preprocessing pipeline"""
-
-    # Data cleaning
-    enable_data_cleaning: bool = True
-    enable_outlier_detection: bool = True
-    enable_missing_value_handling: bool = True
-    outlier_action: str = "flag"  # "flag", "remove", or "cap"
-
-    # Feature extraction
-    enable_politician_features: bool = True
-    enable_market_features: bool = True
-    enable_temporal_features: bool = True
-    enable_sentiment_features: bool = True
-
-    # Temporal settings
-    lookback_periods: List[int] = None
-    include_future_leakage: bool = False
-
-    # Data splitting
-    train_split_ratio: float = 0.7
-    val_split_ratio: float = 0.15
-    test_split_ratio: float = 0.15
-    split_by_time: bool = True
-
-    # Output settings
-    save_preprocessing_artifacts: bool = True
-    artifacts_dir: Optional[Path] = None
-
-    def __post_init__(self):
-        if self.lookback_periods is None:
-            self.lookback_periods = [7, 30, 90, 365]
-
-        if self.artifacts_dir is None:
-            self.artifacts_dir = Path("./data/preprocessing_artifacts")
-
-        # Validate split ratios
-        total_ratio = self.train_split_ratio + self.val_split_ratio + self.test_split_ratio
-        if abs(total_ratio - 1.0) > 0.001:
-            raise ValueError(f"Split ratios must sum to 1.0, got {total_ratio}")
-
-
-@dataclass
-class PreprocessingResults:
-    """Results from preprocessing pipeline"""
-
-    # Processed data
-    train_data: pd.DataFrame
-    val_data: pd.DataFrame
-    test_data: pd.DataFrame
-
-    # Feature information
-    feature_names: List[str]
-    categorical_features: List[str]
-    numerical_features: List[str]
-    target_columns: List[str]
-
-    # Statistics
-    cleaning_stats: CleaningStats
-    original_shape: Tuple[int, int]
-    final_shape: Tuple[int, int]
-    feature_count: int
-
-    # Artifacts paths
-    scaler_path: Optional[Path] = None
-    encoder_path: Optional[Path] = None
-    feature_metadata_path: Optional[Path] = None
-
-
-class PoliticianTradingPreprocessor:
-    """Main preprocessor for politician trading data for ML models"""
-
-    def __init__(self, config: Optional[PreprocessingConfig] = None):
-        self.config = config or PreprocessingConfig()
-
-        # Initialize components
-        self.data_cleaner = TradingDataCleaner()
-        self.outlier_detector = OutlierDetector()
-        self.missing_value_handler = MissingValueHandler()
-
-        self.politician_extractor = PoliticianFeatureExtractor()
-        self.market_extractor = MarketFeatureExtractor()
-        self.temporal_extractor = TemporalFeatureExtractor(
-            config={"lookback_periods": self.config.lookback_periods}
-        )
-        self.sentiment_extractor = SentimentFeatureExtractor()
-
-        # Preprocessing artifacts
-        self.scaler = None
-        self.categorical_encoder = None
-        self.feature_metadata = {}
-
-        # Create artifacts directory
-        self.config.artifacts_dir.mkdir(parents=True, exist_ok=True)
-
-    def preprocess(
-        self, raw_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> PreprocessingResults:
-        """Main preprocessing pipeline"""
-        logger.info("Starting politician trading data preprocessing")
-
-        # Convert to DataFrame if needed
-        if isinstance(raw_data, list):
-            df = pd.DataFrame(raw_data)
-        else:
-            df = raw_data.copy()
-
-        original_shape = df.shape
-        logger.info(f"Input data shape: {original_shape}")
-
-        # Step 1: Data Cleaning
-        if self.config.enable_data_cleaning:
-            df, cleaning_stats = self._clean_data(df)
-            logger.info(f"After cleaning: {df.shape}")
-        else:
-            cleaning_stats = CleaningStats(
-                total_records=len(df),
-                cleaned_records=len(df),
-                removed_records=0,
-                cleaning_operations={},
-                outliers_detected=0,
-                missing_values_filled=0,
-            )
-
-        # Step 2: Feature Extraction
-        df = self._extract_features(df)
-        logger.info(f"After feature extraction: {df.shape}")
-
-        # Step 3: Handle outliers
-        if self.config.enable_outlier_detection:
-            df = self._handle_outliers(df)
-            logger.info(f"After outlier handling: {df.shape}")
-
-        # Step 4: Handle missing values
-        if self.config.enable_missing_value_handling:
-            df = self._handle_missing_values(df)
-            logger.info(f"After missing value handling: {df.shape}")
-
-        # Step 5: Feature engineering and encoding
-        df = self._engineer_features(df)
-        logger.info(f"After feature engineering: {df.shape}")
-
-        # Step 6: Create target variables
-        df = self._create_target_variables(df)
-        logger.info(f"After target creation: {df.shape}")
-
-        # Step 7: Split data
-        train_data, val_data, test_data = self._split_data(df)
-
-        # Step 8: Scale features
-        train_data, val_data, test_data = self._scale_features(train_data, val_data, test_data)
-
-        # Step 9: Save artifacts
-        if self.config.save_preprocessing_artifacts:
-            self._save_artifacts()
-
-        # Prepare results
-        feature_names = [col for col in df.columns if not col.startswith("target_")]
-        categorical_features = self._identify_categorical_features(df)
-        numerical_features = self._identify_numerical_features(df)
-        target_columns = [col for col in df.columns if col.startswith("target_")]
-
-        results = PreprocessingResults(
-            train_data=train_data,
-            val_data=val_data,
-            test_data=test_data,
-            feature_names=feature_names,
-            categorical_features=categorical_features,
-            numerical_features=numerical_features,
-            target_columns=target_columns,
-            cleaning_stats=cleaning_stats,
-            original_shape=original_shape,
-            final_shape=df.shape,
-            feature_count=len(feature_names),
-            scaler_path=self.config.artifacts_dir / "scaler.joblib",
-            encoder_path=self.config.artifacts_dir / "encoder.joblib",
-            feature_metadata_path=self.config.artifacts_dir / "feature_metadata.joblib",
-        )
-
-        logger.info(f"Preprocessing complete. Final shape: {df.shape}")
-        logger.info(f"Features: {len(feature_names)}, Targets: {len(target_columns)}")
-
-        return results
-
-    def _clean_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningStats]:
-        """Clean the raw data"""
-        logger.info("Cleaning data")
-
-        # Convert to list of records for cleaner
-        records = df.to_dict("records")
-        cleaned_records, cleaning_stats = self.data_cleaner.clean_trading_records(records)
-
-        # Convert back to DataFrame
-        cleaned_df = pd.DataFrame(cleaned_records)
-
-        return cleaned_df, cleaning_stats
-
-    def _extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract all features"""
-        logger.info("Extracting features")
-
-        if self.config.enable_politician_features:
-            df = self.politician_extractor.extract_politician_features(df)
-            logger.info("Politician features extracted")
-
-        if self.config.enable_market_features:
-            df = self.market_extractor.extract_market_features(df)
-            logger.info("Market features extracted")
-
-        if self.config.enable_temporal_features:
-            df = self.temporal_extractor.extract_temporal_features(df)
-            logger.info("Temporal features extracted")
-
-        if self.config.enable_sentiment_features:
-            df = self.sentiment_extractor.extract_sentiment_features(df)
-            logger.info("Sentiment features extracted")
-
-        return df
-
-    def _handle_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle outliers in the data"""
-        logger.info("Handling outliers")
-
-        df_with_outliers, outlier_info = self.outlier_detector.detect_outliers(df)
-
-        if self.config.outlier_action == "remove":
-            df_clean = df_with_outliers[~df_with_outliers["is_outlier"]]
-            logger.info(f"Removed {outlier_info['total_outliers']} outliers")
-        elif self.config.outlier_action == "flag":
-            df_clean = df_with_outliers
-            logger.info(f"Flagged {outlier_info['total_outliers']} outliers")
-        else:  # cap
-            df_clean = self._cap_outliers(df_with_outliers)
-            logger.info(f"Capped {outlier_info['total_outliers']} outliers")
-
-        return df_clean
-
-    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle missing values"""
-        logger.info("Handling missing values")
-
-        df_clean, missing_info = self.missing_value_handler.handle_missing_values(df)
-        logger.info(f"Handled missing values: {missing_info['final_missing_counts']}")
-
-        return df_clean
-
-    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Engineer additional features"""
-        logger.info("Engineering features")
-
-        # Transaction amount buckets
-        if "transaction_amount_cleaned" in df.columns:
-            df["amount_bucket"] = pd.cut(
-                df["transaction_amount_cleaned"],
-                bins=[0, 1000, 15000, 50000, 500000, float("inf")],
-                labels=["micro", "small", "medium", "large", "mega"],
-            )
-
-        # Politician activity level
-        if "total_transactions" in df.columns:
-            df["politician_activity_level"] = pd.cut(
-                df["total_transactions"],
-                bins=[0, 5, 20, 50, float("inf")],
-                labels=["low", "medium", "high", "very_high"],
-            )
-
-        # Market timing features
-        if "transaction_date_dt" in df.columns:
-            # Days since start of data
-            min_date = df["transaction_date_dt"].min()
-            df["days_since_start"] = (df["transaction_date_dt"] - min_date).dt.days
-
-            # Market cycle approximation (simplified)
-            df["market_cycle_phase"] = (df["days_since_start"] % 1460) / 1460  # 4-year cycle
-
-        # Interaction features
-        if all(col in df.columns for col in ["buy_ratio", "total_transactions"]):
-            df["buy_volume_interaction"] = df["buy_ratio"] * df["total_transactions"]
-
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            df["amount_frequency_interaction"] = (
-                df["transaction_amount_cleaned"] * df["politician_trading_frequency"]
-            )
-
-        return df
-
-    def _create_target_variables(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Create target variables for ML models"""
-        logger.info("Creating target variables")
-
-        # Sort by politician and date for future stock performance calculation
-        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])
-
-        # Target 1: Stock performance after politician trade (simplified)
-        # This would typically require external market data
-        # For now, create synthetic targets based on transaction patterns
-
-        # Target: Whether the trade was profitable (binary classification)
-        # Assumption: Larger transactions from frequent traders are more likely profitable
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            # Probability based on amount and frequency
-            amount_score = np.log1p(df["transaction_amount_cleaned"]) / 10
-            frequency_score = np.log1p(df["politician_trading_frequency"]) / 5
-
-            profit_probability = (amount_score + frequency_score) / 2
-            profit_probability = np.clip(profit_probability, 0.1, 0.9)
-
-            # Binary target with some randomness
-            np.random.seed(42)  # For reproducibility
-            df["target_profitable"] = np.random.binomial(1, profit_probability)
-
-        # Target 2: Stock recommendation score (regression)
-        # Based on politician patterns and market factors
-        if "transaction_type_cleaned" in df.columns:
-            base_score = 0.5  # Neutral
-
-            # Adjust based on transaction type
-            type_adjustment = (
-                df["transaction_type_cleaned"]
-                .map({"buy": 0.2, "sell": -0.2, "exchange": 0.0})
-                .fillna(0)
-            )
-
-            # Adjust based on politician track record
-            if "buy_ratio" in df.columns:
-                track_record_adjustment = (df["buy_ratio"] - 0.5) * 0.3
-
-            # Adjust based on timing
-            if "is_end_of_quarter" in df.columns:
-                timing_adjustment = df["is_end_of_quarter"].astype(int) * 0.1
-
-            recommendation_score = (
-                base_score + type_adjustment + track_record_adjustment + timing_adjustment
-            )
-            df["target_recommendation_score"] = np.clip(recommendation_score, 0, 1)
-
-        # Target 3: Risk level (multi-class classification)
-        if "transaction_volatility" in df.columns:
-            risk_conditions = [
-                (df["transaction_volatility"] <= 0.2),
-                (df["transaction_volatility"] <= 0.5),
-                (df["transaction_volatility"] <= 1.0),
-                (df["transaction_volatility"] > 1.0),
-            ]
-            risk_choices = ["low", "medium", "high", "very_high"]
-            df["target_risk_level"] = np.select(risk_conditions, risk_choices, default="medium")
-
-        return df
-
-    def _split_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Split data into train/val/test sets"""
-        logger.info("Splitting data")
-
-        if self.config.split_by_time and "transaction_date_dt" in df.columns:
-            # Time-based split
-            df_sorted = df.sort_values("transaction_date_dt")
-
-            train_size = int(len(df_sorted) * self.config.train_split_ratio)
-            val_size = int(len(df_sorted) * self.config.val_split_ratio)
-
-            train_data = df_sorted.iloc[:train_size]
-            val_data = df_sorted.iloc[train_size : train_size + val_size]
-            test_data = df_sorted.iloc[train_size + val_size :]
-
-        else:
-            # Random split
-            df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
-
-            train_size = int(len(df_shuffled) * self.config.train_split_ratio)
-            val_size = int(len(df_shuffled) * self.config.val_split_ratio)
-
-            train_data = df_shuffled.iloc[:train_size]
-            val_data = df_shuffled.iloc[train_size : train_size + val_size]
-            test_data = df_shuffled.iloc[train_size + val_size :]
-
-        logger.info(
-            f"Split sizes - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}"
-        )
-
-        return train_data, val_data, test_data
-
-    def _scale_features(
-        self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame
-    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Scale numerical features"""
-        logger.info("Scaling features")
-
-        from sklearn.preprocessing import LabelEncoder, StandardScaler
-
-        numerical_features = self._identify_numerical_features(train_data)
-        categorical_features = self._identify_categorical_features(train_data)
-
-        # Fit scaler on training data
-        self.scaler = StandardScaler()
-        if numerical_features:
-            train_scaled = train_data.copy()
-            val_scaled = val_data.copy()
-            test_scaled = test_data.copy()
-
-            train_scaled[numerical_features] = self.scaler.fit_transform(
-                train_data[numerical_features]
-            )
-            val_scaled[numerical_features] = self.scaler.transform(val_data[numerical_features])
-            test_scaled[numerical_features] = self.scaler.transform(test_data[numerical_features])
-        else:
-            train_scaled, val_scaled, test_scaled = train_data, val_data, test_data
-
-        # Encode categorical features
-        self.categorical_encoder = {}
-        if categorical_features:
-            for feature in categorical_features:
-                encoder = LabelEncoder()
-                # Fit on combined data to handle unseen categories
-                all_values = pd.concat(
-                    [train_scaled[feature], val_scaled[feature], test_scaled[feature]]
-                ).astype(str)
-
-                encoder.fit(all_values)
-                self.categorical_encoder[feature] = encoder
-
-                train_scaled[feature] = encoder.transform(train_scaled[feature].astype(str))
-                val_scaled[feature] = encoder.transform(val_scaled[feature].astype(str))
-                test_scaled[feature] = encoder.transform(test_scaled[feature].astype(str))
-
-        return train_scaled, val_scaled, test_scaled
-
-    def _identify_numerical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify numerical features"""
-        numerical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype in ["int64", "float64"]
-                and not col.startswith("target_")
-                and not col.endswith("_cleaned")
-                and col not in ["is_outlier"]
-            ):
-                numerical_features.append(col)
-        return numerical_features
-
-    def _identify_categorical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify categorical features"""
-        categorical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype == "object"
-                or df[col].dtype.name == "category"
-                and not col.startswith("target_")
-            ):
-                categorical_features.append(col)
-        return categorical_features
-
-    def _cap_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Cap outliers to percentile values"""
-        df_capped = df.copy()
-        numerical_cols = self._identify_numerical_features(df)
-
-        for col in numerical_cols:
-            if col in df_capped.columns:
-                q1 = df_capped[col].quantile(0.01)
-                q99 = df_capped[col].quantile(0.99)
-                df_capped[col] = np.clip(df_capped[col], q1, q99)
-
-        return df_capped
-
-    def _save_artifacts(self):
-        """Save preprocessing artifacts"""
-        logger.info("Saving preprocessing artifacts")
-
-        if self.scaler:
-            joblib.dump(self.scaler, self.config.artifacts_dir / "scaler.joblib")
-
-        if self.categorical_encoder:
-            joblib.dump(self.categorical_encoder, self.config.artifacts_dir / "encoder.joblib")
-
-        # Save feature metadata
-        self.feature_metadata = {
-            "config": asdict(self.config),
-            "preprocessing_timestamp": datetime.now().isoformat(),
-        }
-        joblib.dump(self.feature_metadata, self.config.artifacts_dir / "feature_metadata.joblib")
-
-    def load_artifacts(self, artifacts_dir: Path):
-        """Load preprocessing artifacts"""
-        logger.info(f"Loading preprocessing artifacts from {artifacts_dir}")
-
-        scaler_path = artifacts_dir / "scaler.joblib"
-        if scaler_path.exists():
-            self.scaler = joblib.load(scaler_path)
-
-        encoder_path = artifacts_dir / "encoder.joblib"
-        if encoder_path.exists():
-            self.categorical_encoder = joblib.load(encoder_path)
-
-        metadata_path = artifacts_dir / "feature_metadata.joblib"
-        if metadata_path.exists():
-            self.feature_metadata = joblib.load(metadata_path)
-
-    def transform_new_data(
-        self, new_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Transform new data using fitted preprocessors"""
-        logger.info("Transforming new data with fitted preprocessors")
-
-        if self.scaler is None and self.categorical_encoder is None:
-            raise ValueError("No preprocessing artifacts loaded. Call load_artifacts() first.")
-
-        # Convert to DataFrame if needed
-        if isinstance(new_data, list):
-            df = pd.DataFrame(new_data)
-        else:
-            df = new_data.copy()
-
-        # Apply same preprocessing steps (without fitting)
-        if self.config.enable_data_cleaning:
-            records = df.to_dict("records")
-            cleaned_records, _ = self.data_cleaner.clean_trading_records(records)
-            df = pd.DataFrame(cleaned_records)
-
-        # Extract features
-        df = self._extract_features(df)
-
-        # Engineer features
-        df = self._engineer_features(df)
-
-        # Apply scaling and encoding
-        numerical_features = self._identify_numerical_features(df)
-        categorical_features = self._identify_categorical_features(df)
-
-        if self.scaler and numerical_features:
-            df[numerical_features] = self.scaler.transform(df[numerical_features])
-
-        if self.categorical_encoder and categorical_features:
-            for feature in categorical_features:
-                if feature in self.categorical_encoder:
-                    df[feature] = self.categorical_encoder[feature].transform(
-                        df[feature].astype(str)
-                    )
-
-        return df
mcli/workflow/politician_trading/config.py
@@ -1,134 +0,0 @@
-"""
-Configuration for politician trading data workflow
-"""
-
-import os
-from dataclasses import dataclass
-from typing import Optional
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-@dataclass
-class SupabaseConfig:
-    """Supabase database configuration"""
-
-    url: str
-    key: str
-    service_role_key: Optional[str] = None
-
-    @classmethod
-    def from_env(cls) -> "SupabaseConfig":
-        """Load configuration from environment or use provided values"""
-        # Your provided Supabase details
-        url = os.getenv("SUPABASE_URL", "https://uljsqvwkomdrlnofmlad.supabase.co")
-        key = os.getenv(
-            "SUPABASE_ANON_KEY",
-            "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InVsanNxdndrb21kcmxub2ZtbGFkIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NTY4MDIyNDQsImV4cCI6MjA3MjM3ODI0NH0.QCpfcEpxGX_5Wn8ljf_J2KWjJLGdF8zRsV_7OatxmHI",
-        )
-        service_role_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
-
-        return cls(url=url, key=key, service_role_key=service_role_key)
-
-
-@dataclass
-class ScrapingConfig:
-    """Web scraping configuration with comprehensive data sources"""
-
-    # Rate limiting
-    request_delay: float = 1.0  # seconds between requests
-    max_retries: int = 3
-    timeout: int = 30
-
-    # User agent for requests
-    user_agent: str = "Mozilla/5.0 (compatible; MCLI-PoliticianTracker/1.0)"
-
-    # Enable/disable source categories
-    enable_us_federal: bool = True
-    enable_us_states: bool = True
-    enable_eu_parliament: bool = True
-    enable_eu_national: bool = True
-    enable_third_party: bool = True
-
-    # Legacy properties for backward compatibility
-    us_congress_sources: list = None
-    eu_sources: list = None
-
-    def __post_init__(self):
-        # Maintain backward compatibility
-        if self.us_congress_sources is None:
-            self.us_congress_sources = [
-                "https://disclosures-clerk.house.gov/FinancialDisclosure",
-                "https://efd.senate.gov",
-                "https://api.quiverquant.com/beta/live/congresstrading",
-            ]
-
-        if self.eu_sources is None:
-            self.eu_sources = [
-                "https://www.europarl.europa.eu/meps/en/declarations",
-            ]
-
-    def get_active_sources(self):
-        """Get all active data sources based on configuration"""
-        from .data_sources import ALL_DATA_SOURCES
-
-        active_sources = []
-
-        if self.enable_us_federal:
-            active_sources.extend(ALL_DATA_SOURCES["us_federal"])
-
-        if self.enable_us_states:
-            active_sources.extend(ALL_DATA_SOURCES["us_states"])
-
-        if self.enable_eu_parliament:
-            active_sources.extend(ALL_DATA_SOURCES["eu_parliament"])
-
-        if self.enable_eu_national:
-            active_sources.extend(ALL_DATA_SOURCES["eu_national"])
-
-        if self.enable_third_party:
-            active_sources.extend(ALL_DATA_SOURCES["third_party"])
-
-        # Filter to only active status sources
-        return [source for source in active_sources if source.status == "active"]
-
-
-@dataclass
-class WorkflowConfig:
-    """Overall workflow configuration"""
-
-    supabase: SupabaseConfig
-    scraping: ScrapingConfig
-
-    # Cron schedule (for reference, actual scheduling done in Supabase)
-    cron_schedule: str = "0 */6 * * *"  # Every 6 hours
-
-    # Data retention
-    retention_days: int = 365  # Keep data for 1 year
-
-    @classmethod
-    def default(cls) -> "WorkflowConfig":
-        """Create default configuration"""
-        return cls(supabase=SupabaseConfig.from_env(), scraping=ScrapingConfig())
-
-    def to_serializable_dict(self) -> dict:
-        """Convert to a JSON-serializable dictionary"""
-        return {
-            "supabase": {
-                "url": self.supabase.url,
-                "has_service_key": bool(self.supabase.service_role_key),
-                # Don't include actual keys for security
-            },
-            "scraping": {
-                "request_delay": self.scraping.request_delay,
-                "max_retries": self.scraping.max_retries,
-                "timeout": self.scraping.timeout,
-                "user_agent": self.scraping.user_agent,
-                "us_congress_sources": self.scraping.us_congress_sources,
-                "eu_sources": self.scraping.eu_sources,
-            },
-            "cron_schedule": self.cron_schedule,
-            "retention_days": self.retention_days,
-        }