mcli-framework 7.10.1__py3-none-any.whl → 7.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of mcli-framework might be problematic.

Files changed (43)
  1. mcli/app/commands_cmd.py +150 -58
  2. mcli/app/main.py +21 -27
  3. mcli/lib/custom_commands.py +62 -12
  4. mcli/lib/optional_deps.py +240 -0
  5. mcli/lib/paths.py +129 -5
  6. mcli/self/migrate_cmd.py +261 -0
  7. mcli/self/self_cmd.py +8 -0
  8. mcli/workflow/git_commit/ai_service.py +13 -2
  9. mcli/workflow/notebook/__init__.py +16 -0
  10. mcli/workflow/notebook/converter.py +375 -0
  11. mcli/workflow/notebook/notebook_cmd.py +441 -0
  12. mcli/workflow/notebook/schema.py +402 -0
  13. mcli/workflow/notebook/validator.py +313 -0
  14. mcli/workflow/secrets/__init__.py +4 -0
  15. mcli/workflow/secrets/secrets_cmd.py +192 -0
  16. mcli/workflow/workflow.py +35 -5
  17. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/METADATA +86 -55
  18. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/RECORD +22 -34
  19. mcli/ml/features/political_features.py +0 -677
  20. mcli/ml/preprocessing/politician_trading_preprocessor.py +0 -570
  21. mcli/workflow/politician_trading/__init__.py +0 -4
  22. mcli/workflow/politician_trading/config.py +0 -134
  23. mcli/workflow/politician_trading/connectivity.py +0 -492
  24. mcli/workflow/politician_trading/data_sources.py +0 -654
  25. mcli/workflow/politician_trading/database.py +0 -412
  26. mcli/workflow/politician_trading/demo.py +0 -249
  27. mcli/workflow/politician_trading/models.py +0 -327
  28. mcli/workflow/politician_trading/monitoring.py +0 -413
  29. mcli/workflow/politician_trading/scrapers.py +0 -1074
  30. mcli/workflow/politician_trading/scrapers_california.py +0 -434
  31. mcli/workflow/politician_trading/scrapers_corporate_registry.py +0 -797
  32. mcli/workflow/politician_trading/scrapers_eu.py +0 -376
  33. mcli/workflow/politician_trading/scrapers_free_sources.py +0 -509
  34. mcli/workflow/politician_trading/scrapers_third_party.py +0 -373
  35. mcli/workflow/politician_trading/scrapers_uk.py +0 -378
  36. mcli/workflow/politician_trading/scrapers_us_states.py +0 -471
  37. mcli/workflow/politician_trading/seed_database.py +0 -520
  38. mcli/workflow/politician_trading/supabase_functions.py +0 -354
  39. mcli/workflow/politician_trading/workflow.py +0 -879
  40. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/WHEEL +0 -0
  41. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/entry_points.txt +0 -0
  42. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/licenses/LICENSE +0 -0
  43. {mcli_framework-7.10.1.dist-info → mcli_framework-7.11.0.dist-info}/top_level.txt +0 -0
mcli/ml/preprocessing/politician_trading_preprocessor.py
@@ -1,570 +0,0 @@
-"""Main preprocessor for politician trading data"""
-
-import logging
-from dataclasses import asdict, dataclass
-from datetime import datetime, timedelta
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import joblib
-import numpy as np
-import pandas as pd
-
-from .data_cleaners import CleaningStats, MissingValueHandler, OutlierDetector, TradingDataCleaner
-from .feature_extractors import (
-    FeatureExtractionStats,
-    MarketFeatureExtractor,
-    PoliticianFeatureExtractor,
-    SentimentFeatureExtractor,
-    TemporalFeatureExtractor,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PreprocessingConfig:
-    """Configuration for preprocessing pipeline"""
-
-    # Data cleaning
-    enable_data_cleaning: bool = True
-    enable_outlier_detection: bool = True
-    enable_missing_value_handling: bool = True
-    outlier_action: str = "flag"  # "flag", "remove", or "cap"
-
-    # Feature extraction
-    enable_politician_features: bool = True
-    enable_market_features: bool = True
-    enable_temporal_features: bool = True
-    enable_sentiment_features: bool = True
-
-    # Temporal settings
-    lookback_periods: List[int] = None
-    include_future_leakage: bool = False
-
-    # Data splitting
-    train_split_ratio: float = 0.7
-    val_split_ratio: float = 0.15
-    test_split_ratio: float = 0.15
-    split_by_time: bool = True
-
-    # Output settings
-    save_preprocessing_artifacts: bool = True
-    artifacts_dir: Optional[Path] = None
-
-    def __post_init__(self):
-        if self.lookback_periods is None:
-            self.lookback_periods = [7, 30, 90, 365]
-
-        if self.artifacts_dir is None:
-            self.artifacts_dir = Path("./data/preprocessing_artifacts")
-
-        # Validate split ratios
-        total_ratio = self.train_split_ratio + self.val_split_ratio + self.test_split_ratio
-        if abs(total_ratio - 1.0) > 0.001:
-            raise ValueError(f"Split ratios must sum to 1.0, got {total_ratio}")
-
-
-@dataclass
-class PreprocessingResults:
-    """Results from preprocessing pipeline"""
-
-    # Processed data
-    train_data: pd.DataFrame
-    val_data: pd.DataFrame
-    test_data: pd.DataFrame
-
-    # Feature information
-    feature_names: List[str]
-    categorical_features: List[str]
-    numerical_features: List[str]
-    target_columns: List[str]
-
-    # Statistics
-    cleaning_stats: CleaningStats
-    original_shape: Tuple[int, int]
-    final_shape: Tuple[int, int]
-    feature_count: int
-
-    # Artifacts paths
-    scaler_path: Optional[Path] = None
-    encoder_path: Optional[Path] = None
-    feature_metadata_path: Optional[Path] = None
-
-
-class PoliticianTradingPreprocessor:
-    """Main preprocessor for politician trading data for ML models"""
-
-    def __init__(self, config: Optional[PreprocessingConfig] = None):
-        self.config = config or PreprocessingConfig()
-
-        # Initialize components
-        self.data_cleaner = TradingDataCleaner()
-        self.outlier_detector = OutlierDetector()
-        self.missing_value_handler = MissingValueHandler()
-
-        self.politician_extractor = PoliticianFeatureExtractor()
-        self.market_extractor = MarketFeatureExtractor()
-        self.temporal_extractor = TemporalFeatureExtractor(
-            config={"lookback_periods": self.config.lookback_periods}
-        )
-        self.sentiment_extractor = SentimentFeatureExtractor()
-
-        # Preprocessing artifacts
-        self.scaler = None
-        self.categorical_encoder = None
-        self.feature_metadata = {}
-
-        # Create artifacts directory
-        self.config.artifacts_dir.mkdir(parents=True, exist_ok=True)
-
-    def preprocess(
-        self, raw_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> PreprocessingResults:
-        """Main preprocessing pipeline"""
-        logger.info("Starting politician trading data preprocessing")
-
-        # Convert to DataFrame if needed
-        if isinstance(raw_data, list):
-            df = pd.DataFrame(raw_data)
-        else:
-            df = raw_data.copy()
-
-        original_shape = df.shape
-        logger.info(f"Input data shape: {original_shape}")
-
-        # Step 1: Data Cleaning
-        if self.config.enable_data_cleaning:
-            df, cleaning_stats = self._clean_data(df)
-            logger.info(f"After cleaning: {df.shape}")
-        else:
-            cleaning_stats = CleaningStats(
-                total_records=len(df),
-                cleaned_records=len(df),
-                removed_records=0,
-                cleaning_operations={},
-                outliers_detected=0,
-                missing_values_filled=0,
-            )
-
-        # Step 2: Feature Extraction
-        df = self._extract_features(df)
-        logger.info(f"After feature extraction: {df.shape}")
-
-        # Step 3: Handle outliers
-        if self.config.enable_outlier_detection:
-            df = self._handle_outliers(df)
-            logger.info(f"After outlier handling: {df.shape}")
-
-        # Step 4: Handle missing values
-        if self.config.enable_missing_value_handling:
-            df = self._handle_missing_values(df)
-            logger.info(f"After missing value handling: {df.shape}")
-
-        # Step 5: Feature engineering and encoding
-        df = self._engineer_features(df)
-        logger.info(f"After feature engineering: {df.shape}")
-
-        # Step 6: Create target variables
-        df = self._create_target_variables(df)
-        logger.info(f"After target creation: {df.shape}")
-
-        # Step 7: Split data
-        train_data, val_data, test_data = self._split_data(df)
-
-        # Step 8: Scale features
-        train_data, val_data, test_data = self._scale_features(train_data, val_data, test_data)
-
-        # Step 9: Save artifacts
-        if self.config.save_preprocessing_artifacts:
-            self._save_artifacts()
-
-        # Prepare results
-        feature_names = [col for col in df.columns if not col.startswith("target_")]
-        categorical_features = self._identify_categorical_features(df)
-        numerical_features = self._identify_numerical_features(df)
-        target_columns = [col for col in df.columns if col.startswith("target_")]
-
-        results = PreprocessingResults(
-            train_data=train_data,
-            val_data=val_data,
-            test_data=test_data,
-            feature_names=feature_names,
-            categorical_features=categorical_features,
-            numerical_features=numerical_features,
-            target_columns=target_columns,
-            cleaning_stats=cleaning_stats,
-            original_shape=original_shape,
-            final_shape=df.shape,
-            feature_count=len(feature_names),
-            scaler_path=self.config.artifacts_dir / "scaler.joblib",
-            encoder_path=self.config.artifacts_dir / "encoder.joblib",
-            feature_metadata_path=self.config.artifacts_dir / "feature_metadata.joblib",
-        )
-
-        logger.info(f"Preprocessing complete. Final shape: {df.shape}")
-        logger.info(f"Features: {len(feature_names)}, Targets: {len(target_columns)}")
-
-        return results
-
-    def _clean_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningStats]:
-        """Clean the raw data"""
-        logger.info("Cleaning data")
-
-        # Convert to list of records for cleaner
-        records = df.to_dict("records")
-        cleaned_records, cleaning_stats = self.data_cleaner.clean_trading_records(records)
-
-        # Convert back to DataFrame
-        cleaned_df = pd.DataFrame(cleaned_records)
-
-        return cleaned_df, cleaning_stats
-
-    def _extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Extract all features"""
-        logger.info("Extracting features")
-
-        if self.config.enable_politician_features:
-            df = self.politician_extractor.extract_politician_features(df)
-            logger.info("Politician features extracted")
-
-        if self.config.enable_market_features:
-            df = self.market_extractor.extract_market_features(df)
-            logger.info("Market features extracted")
-
-        if self.config.enable_temporal_features:
-            df = self.temporal_extractor.extract_temporal_features(df)
-            logger.info("Temporal features extracted")
-
-        if self.config.enable_sentiment_features:
-            df = self.sentiment_extractor.extract_sentiment_features(df)
-            logger.info("Sentiment features extracted")
-
-        return df
-
-    def _handle_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle outliers in the data"""
-        logger.info("Handling outliers")
-
-        df_with_outliers, outlier_info = self.outlier_detector.detect_outliers(df)
-
-        if self.config.outlier_action == "remove":
-            df_clean = df_with_outliers[~df_with_outliers["is_outlier"]]
-            logger.info(f"Removed {outlier_info['total_outliers']} outliers")
-        elif self.config.outlier_action == "flag":
-            df_clean = df_with_outliers
-            logger.info(f"Flagged {outlier_info['total_outliers']} outliers")
-        else:  # cap
-            df_clean = self._cap_outliers(df_with_outliers)
-            logger.info(f"Capped {outlier_info['total_outliers']} outliers")
-
-        return df_clean
-
-    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Handle missing values"""
-        logger.info("Handling missing values")
-
-        df_clean, missing_info = self.missing_value_handler.handle_missing_values(df)
-        logger.info(f"Handled missing values: {missing_info['final_missing_counts']}")
-
-        return df_clean
-
-    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Engineer additional features"""
-        logger.info("Engineering features")
-
-        # Transaction amount buckets
-        if "transaction_amount_cleaned" in df.columns:
-            df["amount_bucket"] = pd.cut(
-                df["transaction_amount_cleaned"],
-                bins=[0, 1000, 15000, 50000, 500000, float("inf")],
-                labels=["micro", "small", "medium", "large", "mega"],
-            )
-
-        # Politician activity level
-        if "total_transactions" in df.columns:
-            df["politician_activity_level"] = pd.cut(
-                df["total_transactions"],
-                bins=[0, 5, 20, 50, float("inf")],
-                labels=["low", "medium", "high", "very_high"],
-            )
-
-        # Market timing features
-        if "transaction_date_dt" in df.columns:
-            # Days since start of data
-            min_date = df["transaction_date_dt"].min()
-            df["days_since_start"] = (df["transaction_date_dt"] - min_date).dt.days
-
-            # Market cycle approximation (simplified)
-            df["market_cycle_phase"] = (df["days_since_start"] % 1460) / 1460  # 4-year cycle
-
-        # Interaction features
-        if all(col in df.columns for col in ["buy_ratio", "total_transactions"]):
-            df["buy_volume_interaction"] = df["buy_ratio"] * df["total_transactions"]
-
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            df["amount_frequency_interaction"] = (
-                df["transaction_amount_cleaned"] * df["politician_trading_frequency"]
-            )
-
-        return df
-
-    def _create_target_variables(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Create target variables for ML models"""
-        logger.info("Creating target variables")
-
-        # Sort by politician and date for future stock performance calculation
-        df = df.sort_values(["politician_name_cleaned", "transaction_date_dt"])
-
-        # Target 1: Stock performance after politician trade (simplified)
-        # This would typically require external market data
-        # For now, create synthetic targets based on transaction patterns
-
-        # Target: Whether the trade was profitable (binary classification)
-        # Assumption: Larger transactions from frequent traders are more likely profitable
-        if all(
-            col in df.columns
-            for col in ["transaction_amount_cleaned", "politician_trading_frequency"]
-        ):
-            # Probability based on amount and frequency
-            amount_score = np.log1p(df["transaction_amount_cleaned"]) / 10
-            frequency_score = np.log1p(df["politician_trading_frequency"]) / 5
-
-            profit_probability = (amount_score + frequency_score) / 2
-            profit_probability = np.clip(profit_probability, 0.1, 0.9)
-
-            # Binary target with some randomness
-            np.random.seed(42)  # For reproducibility
-            df["target_profitable"] = np.random.binomial(1, profit_probability)
-
-        # Target 2: Stock recommendation score (regression)
-        # Based on politician patterns and market factors
-        if "transaction_type_cleaned" in df.columns:
-            base_score = 0.5  # Neutral
-
-            # Adjust based on transaction type
-            type_adjustment = (
-                df["transaction_type_cleaned"]
-                .map({"buy": 0.2, "sell": -0.2, "exchange": 0.0})
-                .fillna(0)
-            )
-
-            # Adjust based on politician track record
-            if "buy_ratio" in df.columns:
-                track_record_adjustment = (df["buy_ratio"] - 0.5) * 0.3
-
-            # Adjust based on timing
-            if "is_end_of_quarter" in df.columns:
-                timing_adjustment = df["is_end_of_quarter"].astype(int) * 0.1
-
-            recommendation_score = (
-                base_score + type_adjustment + track_record_adjustment + timing_adjustment
-            )
-            df["target_recommendation_score"] = np.clip(recommendation_score, 0, 1)
-
-        # Target 3: Risk level (multi-class classification)
-        if "transaction_volatility" in df.columns:
-            risk_conditions = [
-                (df["transaction_volatility"] <= 0.2),
-                (df["transaction_volatility"] <= 0.5),
-                (df["transaction_volatility"] <= 1.0),
-                (df["transaction_volatility"] > 1.0),
-            ]
-            risk_choices = ["low", "medium", "high", "very_high"]
-            df["target_risk_level"] = np.select(risk_conditions, risk_choices, default="medium")
-
-        return df
-
-    def _split_data(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Split data into train/val/test sets"""
-        logger.info("Splitting data")
-
-        if self.config.split_by_time and "transaction_date_dt" in df.columns:
-            # Time-based split
-            df_sorted = df.sort_values("transaction_date_dt")
-
-            train_size = int(len(df_sorted) * self.config.train_split_ratio)
-            val_size = int(len(df_sorted) * self.config.val_split_ratio)
-
-            train_data = df_sorted.iloc[:train_size]
-            val_data = df_sorted.iloc[train_size : train_size + val_size]
-            test_data = df_sorted.iloc[train_size + val_size :]
-
-        else:
-            # Random split
-            df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
-
-            train_size = int(len(df_shuffled) * self.config.train_split_ratio)
-            val_size = int(len(df_shuffled) * self.config.val_split_ratio)
-
-            train_data = df_shuffled.iloc[:train_size]
-            val_data = df_shuffled.iloc[train_size : train_size + val_size]
-            test_data = df_shuffled.iloc[train_size + val_size :]
-
-        logger.info(
-            f"Split sizes - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}"
-        )
-
-        return train_data, val_data, test_data
-
-    def _scale_features(
-        self, train_data: pd.DataFrame, val_data: pd.DataFrame, test_data: pd.DataFrame
-    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        """Scale numerical features"""
-        logger.info("Scaling features")
-
-        from sklearn.preprocessing import LabelEncoder, StandardScaler
-
-        numerical_features = self._identify_numerical_features(train_data)
-        categorical_features = self._identify_categorical_features(train_data)
-
-        # Fit scaler on training data
-        self.scaler = StandardScaler()
-        if numerical_features:
-            train_scaled = train_data.copy()
-            val_scaled = val_data.copy()
-            test_scaled = test_data.copy()
-
-            train_scaled[numerical_features] = self.scaler.fit_transform(
-                train_data[numerical_features]
-            )
-            val_scaled[numerical_features] = self.scaler.transform(val_data[numerical_features])
-            test_scaled[numerical_features] = self.scaler.transform(test_data[numerical_features])
-        else:
-            train_scaled, val_scaled, test_scaled = train_data, val_data, test_data
-
-        # Encode categorical features
-        self.categorical_encoder = {}
-        if categorical_features:
-            for feature in categorical_features:
-                encoder = LabelEncoder()
-                # Fit on combined data to handle unseen categories
-                all_values = pd.concat(
-                    [train_scaled[feature], val_scaled[feature], test_scaled[feature]]
-                ).astype(str)
-
-                encoder.fit(all_values)
-                self.categorical_encoder[feature] = encoder
-
-                train_scaled[feature] = encoder.transform(train_scaled[feature].astype(str))
-                val_scaled[feature] = encoder.transform(val_scaled[feature].astype(str))
-                test_scaled[feature] = encoder.transform(test_scaled[feature].astype(str))
-
-        return train_scaled, val_scaled, test_scaled
-
-    def _identify_numerical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify numerical features"""
-        numerical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype in ["int64", "float64"]
-                and not col.startswith("target_")
-                and not col.endswith("_cleaned")
-                and col not in ["is_outlier"]
-            ):
-                numerical_features.append(col)
-        return numerical_features
-
-    def _identify_categorical_features(self, df: pd.DataFrame) -> List[str]:
-        """Identify categorical features"""
-        categorical_features = []
-        for col in df.columns:
-            if (
-                df[col].dtype == "object"
-                or df[col].dtype.name == "category"
-                and not col.startswith("target_")
-            ):
-                categorical_features.append(col)
-        return categorical_features
-
-    def _cap_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Cap outliers to percentile values"""
-        df_capped = df.copy()
-        numerical_cols = self._identify_numerical_features(df)
-
-        for col in numerical_cols:
-            if col in df_capped.columns:
-                q1 = df_capped[col].quantile(0.01)
-                q99 = df_capped[col].quantile(0.99)
-                df_capped[col] = np.clip(df_capped[col], q1, q99)
-
-        return df_capped
-
-    def _save_artifacts(self):
-        """Save preprocessing artifacts"""
-        logger.info("Saving preprocessing artifacts")
-
-        if self.scaler:
-            joblib.dump(self.scaler, self.config.artifacts_dir / "scaler.joblib")
-
-        if self.categorical_encoder:
-            joblib.dump(self.categorical_encoder, self.config.artifacts_dir / "encoder.joblib")
-
-        # Save feature metadata
-        self.feature_metadata = {
-            "config": asdict(self.config),
-            "preprocessing_timestamp": datetime.now().isoformat(),
-        }
-        joblib.dump(self.feature_metadata, self.config.artifacts_dir / "feature_metadata.joblib")
-
-    def load_artifacts(self, artifacts_dir: Path):
-        """Load preprocessing artifacts"""
-        logger.info(f"Loading preprocessing artifacts from {artifacts_dir}")
-
-        scaler_path = artifacts_dir / "scaler.joblib"
-        if scaler_path.exists():
-            self.scaler = joblib.load(scaler_path)
-
-        encoder_path = artifacts_dir / "encoder.joblib"
-        if encoder_path.exists():
-            self.categorical_encoder = joblib.load(encoder_path)
-
-        metadata_path = artifacts_dir / "feature_metadata.joblib"
-        if metadata_path.exists():
-            self.feature_metadata = joblib.load(metadata_path)
-
-    def transform_new_data(
-        self, new_data: Union[List[Dict[str, Any]], pd.DataFrame]
-    ) -> pd.DataFrame:
-        """Transform new data using fitted preprocessors"""
-        logger.info("Transforming new data with fitted preprocessors")
-
-        if self.scaler is None and self.categorical_encoder is None:
-            raise ValueError("No preprocessing artifacts loaded. Call load_artifacts() first.")
-
-        # Convert to DataFrame if needed
-        if isinstance(new_data, list):
-            df = pd.DataFrame(new_data)
-        else:
-            df = new_data.copy()
-
-        # Apply same preprocessing steps (without fitting)
-        if self.config.enable_data_cleaning:
-            records = df.to_dict("records")
-            cleaned_records, _ = self.data_cleaner.clean_trading_records(records)
-            df = pd.DataFrame(cleaned_records)
-
-        # Extract features
-        df = self._extract_features(df)
-
-        # Engineer features
-        df = self._engineer_features(df)
-
-        # Apply scaling and encoding
-        numerical_features = self._identify_numerical_features(df)
-        categorical_features = self._identify_categorical_features(df)
-
-        if self.scaler and numerical_features:
-            df[numerical_features] = self.scaler.transform(df[numerical_features])
-
-        if self.categorical_encoder and categorical_features:
-            for feature in categorical_features:
-                if feature in self.categorical_encoder:
-                    df[feature] = self.categorical_encoder[feature].transform(
-                        df[feature].astype(str)
-                    )
-
-        return df
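For orientation, below is a minimal usage sketch of the preprocessor this release deletes, written against the API visible in the removed file above (PreprocessingConfig, preprocess, load_artifacts, transform_new_data). It applies only to mcli-framework 7.10.1 and earlier, and the raw record fields are hypothetical: the input schema expected by TradingDataCleaner comes from the (also removed) politician_trading scrapers, which this diff does not show.

# Usage sketch for the preprocessor removed in 7.11.0 (valid for <= 7.10.1 only).
from pathlib import Path

from mcli.ml.preprocessing.politician_trading_preprocessor import (
    PoliticianTradingPreprocessor,
    PreprocessingConfig,
)

config = PreprocessingConfig(
    outlier_action="cap",               # "flag" (default), "remove", or "cap"
    lookback_periods=[30, 90],          # overrides the [7, 30, 90, 365] default
    artifacts_dir=Path("./artifacts"),  # scaler.joblib / encoder.joblib / feature_metadata.joblib
)
preprocessor = PoliticianTradingPreprocessor(config)

# Hypothetical raw records; the real field names are defined by the removed
# scrapers/cleaners and are not part of this diff.
records = [
    {"politician_name": "Jane Doe", "transaction_date": "2024-01-15",
     "transaction_type": "buy", "transaction_amount": 15000},
]

results = preprocessor.preprocess(records)  # returns a PreprocessingResults dataclass
print(results.feature_count, results.target_columns)

# In a fresh process, reload the fitted scaler/encoders before scoring new rows:
preprocessor.load_artifacts(Path("./artifacts"))
new_df = preprocessor.transform_new_data(records)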
mcli/workflow/politician_trading/__init__.py
@@ -1,4 +0,0 @@
-"""
-Politician Trading Data Workflow
-Tracks publicly available trading information for US and EU politicians
-"""