mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic; see the advisory details for more information.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,451 @@
1
+ """Data cleaning utilities for ML preprocessing"""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+ from dataclasses import dataclass
8
+ import re
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@dataclass
class CleaningStats:
    """Summary counters describing one data-cleaning run."""

    # Number of records handed to the cleaner.
    total_records: int
    # Records that survived cleaning and were returned.
    cleaned_records: int
    # Records dropped because they failed validation or raised during cleaning.
    removed_records: int
    # Count of each cleaning operation applied, e.g. {"ticker_cleaned": 12}.
    cleaning_operations: Dict[str, int]
    # Count of outlier rows; initialized to 0 by TradingDataCleaner.
    outliers_detected: int
    # Count of imputed missing values; initialized to 0 by TradingDataCleaner.
    missing_values_filled: int
24
+
25
+
26
class TradingDataCleaner:
    """Cleans and standardizes politician trading data for ML.

    Each ``_clean_*`` helper scans a list of candidate source fields, takes
    the first one present, normalizes its value, and writes the result under
    a new ``*_cleaned`` key; the raw fields are left untouched.
    """

    # Explicit formats tried, in order, when parsing transaction dates.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%m/%d/%Y",
        "%m-%d-%Y",
        "%Y/%m/%d",
        "%B %d, %Y",
        "%b %d, %Y",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a cleaner.

        Args:
            config: Optional configuration mapping (not consumed by the
                built-in cleaning steps; kept for forward compatibility).
        """
        self.config = config or {}
        self.cleaning_stats = self._fresh_stats()

    @staticmethod
    def _fresh_stats() -> "CleaningStats":
        """Return a zeroed CleaningStats instance."""
        return CleaningStats(
            total_records=0,
            cleaned_records=0,
            removed_records=0,
            cleaning_operations={},
            outliers_detected=0,
            missing_values_filled=0,
        )

    def clean_trading_records(
        self, records: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], "CleaningStats"]:
        """Clean a batch of trading records.

        Stats are reset at the start of every call, so reusing one cleaner
        instance does not accumulate counts across batches (previously only
        ``total_records`` was reset, leaving cleaned/removed counters
        inconsistent on the second call).

        Args:
            records: Raw trading records as dictionaries.

        Returns:
            Tuple of (cleaned records, statistics for this batch).
        """
        self.cleaning_stats = self._fresh_stats()
        self.cleaning_stats.total_records = len(records)
        cleaned_records = []

        for record in records:
            cleaned_record = self._clean_single_record(record)
            if cleaned_record is not None:
                cleaned_records.append(cleaned_record)
                self.cleaning_stats.cleaned_records += 1
            else:
                self.cleaning_stats.removed_records += 1

        return cleaned_records, self.cleaning_stats

    def _clean_single_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Clean a single trading record.

        Runs every field cleaner over a copy of *record* and validates the
        result. Returns None when required fields are missing after cleaning
        or any cleaner raises (the record is then counted as removed).
        """
        try:
            cleaned = record.copy()

            cleaned = self._clean_politician_name(cleaned)
            cleaned = self._clean_transaction_amount(cleaned)
            cleaned = self._clean_transaction_date(cleaned)
            cleaned = self._clean_asset_info(cleaned)
            cleaned = self._clean_transaction_type(cleaned)

            # Drop records that lack the minimum usable set of fields.
            if not self._validate_required_fields(cleaned):
                return None

            return cleaned

        except Exception as e:
            logger.warning(f"Failed to clean record: {e}")
            return None

    def _clean_politician_name(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize politician names.

        Strips honorifics and generational suffixes, title-cases the name,
        and restores the capital letter after "Mc"/"O'" that ``str.title()``
        lowercases. (The previous replacement templates reproduced the
        matched lowercase letter verbatim, making them no-ops.)
        """
        name_fields = ["politician_name", "name", "representative_name", "senator_name"]

        for field in name_fields:
            if field in record and record[field]:
                name = str(record[field]).strip()

                # Remove titles and suffixes.
                name = re.sub(
                    r"\b(Hon\.|Dr\.|Mr\.|Mrs\.|Ms\.|Sen\.|Rep\.)\s+", "", name, flags=re.IGNORECASE
                )
                name = re.sub(r"\s+(Jr\.?|Sr\.?|III|IV|II)$", "", name, flags=re.IGNORECASE)

                # Title case, then re-uppercase the letter title() lowercased.
                name = name.title()
                name = re.sub(r"\bMc([a-z])", lambda m: "Mc" + m.group(1).upper(), name)
                name = re.sub(r"\bO'([a-z])", lambda m: "O'" + m.group(1).upper(), name)

                record["politician_name_cleaned"] = name
                self._increment_cleaning_operation("politician_name_cleaned")
                break

        return record

    def _clean_transaction_amount(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize transaction amounts.

        Handles plain amounts ("$2,500") and disclosure ranges
        ("$1,001 - $15,000" / "$1,001 to $15,000"), which are reduced to
        their midpoint. Range detection happens before whitespace removal:
        the previous version stripped spaces first, so the " - "/" to "
        checks could never match and ranges were silently rejected.
        """
        amount_fields = ["transaction_amount", "amount", "value", "transaction_value"]

        for field in amount_fields:
            if field in record and record[field] is not None:
                raw = str(record[field]).strip()

                # Drop currency symbols and thousands separators, but keep
                # spaces so range delimiters remain detectable.
                normalized = re.sub(r"[$,]", "", raw)

                # Ranges: take the midpoint of "<min> - <max>" / "<min> to <max>".
                range_parts = re.split(r"\s*(?:-|to)\s*", normalized)
                if len(range_parts) == 2 and all(re.search(r"\d", part) for part in range_parts):
                    try:
                        min_val = float(re.sub(r"[^\d.]", "", range_parts[0]))
                        max_val = float(re.sub(r"[^\d.]", "", range_parts[1]))
                    except ValueError:
                        continue
                    record["transaction_amount_cleaned"] = (min_val + max_val) / 2
                    self._increment_cleaning_operation("amount_range_midpoint")
                    self._increment_cleaning_operation("transaction_amount_cleaned")
                    break

                # Single value: must parse and be non-negative.
                try:
                    amount = float(re.sub(r"\s", "", normalized))
                except ValueError:
                    continue
                if amount >= 0:  # Only non-negative amounts are accepted.
                    record["transaction_amount_cleaned"] = amount
                    self._increment_cleaning_operation("transaction_amount_cleaned")
                    break

        return record

    def _clean_transaction_date(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize transaction dates to ISO "YYYY-MM-DD".

        The first candidate field whose value parses wins.
        """
        date_fields = ["transaction_date", "date", "trade_date", "disclosure_date"]

        for field in date_fields:
            if field in record and record[field]:
                parsed = self._parse_date(str(record[field]).strip())
                if parsed is not None:
                    record["transaction_date_cleaned"] = parsed
                    self._increment_cleaning_operation("transaction_date_cleaned")
                    break

        return record

    @classmethod
    def _parse_date(cls, date_str: str) -> Optional[str]:
        """Parse *date_str* into "YYYY-MM-DD"; return None if unparseable.

        Tries the explicit formats first, then falls back to pandas'
        permissive parser using the module-level ``pd`` import. (The
        previous version re-imported pandas inside the loop and swallowed
        every exception with a bare ``except:``.)
        """
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue

        try:
            # pandas raises ParserError (a ValueError subclass) or TypeError
            # on bad input; OverflowError on out-of-range timestamps.
            return pd.to_datetime(date_str).strftime("%Y-%m-%d")
        except (ValueError, TypeError, OverflowError):
            return None

    def _clean_asset_info(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize asset ticker symbols and asset names."""
        # Ticker/symbol: uppercase, strip exchange prefixes and any
        # parenthesized note, then validate the bare-symbol shape.
        for field in ["stock_symbol", "ticker", "symbol"]:
            if field in record and record[field]:
                ticker = str(record[field]).strip().upper()

                ticker = re.sub(r"\s*(NYSE:|NASDAQ:|AMEX:)\s*", "", ticker)
                ticker = re.sub(r"\s*\(.*\)\s*", "", ticker)

                # Letters/digits only, up to 10 chars (typical tickers are 1-5).
                if re.match(r"^[A-Z0-9]{1,10}$", ticker):
                    record["ticker_cleaned"] = ticker
                    self._increment_cleaning_operation("ticker_cleaned")
                    break

        # Asset name: drop common corporate suffixes, then title-case.
        for field in ["asset_name", "security_name", "company_name"]:
            if field in record and record[field]:
                name = str(record[field]).strip()

                name = re.sub(
                    r"\s+(Inc\.?|Corp\.?|Co\.?|Ltd\.?|LLC|LP)$", "", name, flags=re.IGNORECASE
                )
                name = name.title()

                record["asset_name_cleaned"] = name
                self._increment_cleaning_operation("asset_name_cleaned")
                break

        return record

    def _clean_transaction_type(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Map free-form transaction descriptions onto buy/sell/exchange/other."""
        type_fields = ["transaction_type", "type", "action", "trade_type"]

        for field in type_fields:
            if field in record and record[field]:
                transaction_type = str(record[field]).strip().lower()

                # Keyword buckets; anything unrecognized becomes "other".
                if any(word in transaction_type for word in ["buy", "purchase", "acquired"]):
                    standardized_type = "buy"
                elif any(word in transaction_type for word in ["sell", "sale", "sold", "disposed"]):
                    standardized_type = "sell"
                elif any(word in transaction_type for word in ["exchange", "swap"]):
                    standardized_type = "exchange"
                else:
                    standardized_type = "other"

                record["transaction_type_cleaned"] = standardized_type
                self._increment_cleaning_operation("transaction_type_cleaned")
                break

        return record

    def _validate_required_fields(self, record: Dict[str, Any]) -> bool:
        """Return True when the cleaned record is usable.

        Requires a cleaned name, date, and transaction type, plus at least
        one of: cleaned amount, cleaned ticker, or cleaned asset name.
        """
        required_fields = [
            "politician_name_cleaned",
            "transaction_date_cleaned",
            "transaction_type_cleaned",
        ]

        amount_or_asset = any(
            field in record
            for field in ["transaction_amount_cleaned", "ticker_cleaned", "asset_name_cleaned"]
        )

        has_required = all(field in record for field in required_fields)

        return has_required and amount_or_asset

    def _increment_cleaning_operation(self, operation: str) -> None:
        """Track how many times each cleaning operation has been applied."""
        ops = self.cleaning_stats.cleaning_operations
        ops[operation] = ops.get(operation, 0) + 1
278
+
279
+
280
class OutlierDetector:
    """Detects and flags outliers in trading data.

    Three checks are applied: hard min/max limits on the transaction amount,
    sanity checks on dates (future or pre-1990), and a Z-score test on every
    numeric ``*_cleaned`` column. Offending row indices are collected and a
    boolean ``is_outlier`` column is added to the frame.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a detector.

        Args:
            config: Optional configuration mapping (not consumed by the
                built-in checks; kept for forward compatibility).
        """
        self.config = config or {}
        # Per-field hard limits; "z_score" is the |Z| threshold for the
        # statistical test (default 3.0 for fields not listed here).
        self.outlier_thresholds = {
            "transaction_amount": {
                "min": 1,  # Minimum $1
                "max": 50_000_000,  # Maximum $50M
                "z_score": 3.0,
            },
            "days_to_disclosure": {
                "min": 0,
                "max": 365,  # More than 1 year is suspicious
                "z_score": 3.0,
            },
        }

    def detect_outliers(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Detect outliers in the dataset.

        Note: mutates *df* in place (adds ``is_outlier`` and converts the
        cleaned date column to datetime) in addition to returning it, which
        preserves the original behavior.

        Returns:
            Tuple of (df with ``is_outlier`` column, info dict with
            ``total_outliers``, ``outliers_by_field`` and ``outlier_indices``).
        """
        outlier_info = {"total_outliers": 0, "outliers_by_field": {}, "outlier_indices": set()}

        # Amount-based outliers (hard limits).
        if "transaction_amount_cleaned" in df.columns:
            amount_outliers = self._detect_amount_outliers(df)
            outlier_info["outliers_by_field"]["amount"] = len(amount_outliers)
            outlier_info["outlier_indices"].update(amount_outliers)

        # Date-based outliers (future / implausibly old).
        if "transaction_date_cleaned" in df.columns:
            date_outliers = self._detect_date_outliers(df)
            outlier_info["outliers_by_field"]["date"] = len(date_outliers)
            outlier_info["outlier_indices"].update(date_outliers)

        # Statistical (Z-score) outliers on every numeric engineered column.
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            if col.endswith("_cleaned"):
                col_outliers = self._detect_statistical_outliers(df, col)
                outlier_info["outliers_by_field"][col] = len(col_outliers)
                outlier_info["outlier_indices"].update(col_outliers)

        outlier_info["total_outliers"] = len(outlier_info["outlier_indices"])

        # Mark outliers in the dataframe.
        df["is_outlier"] = df.index.isin(outlier_info["outlier_indices"])

        return df, outlier_info

    def _detect_amount_outliers(self, df: pd.DataFrame) -> List[int]:
        """Return indices whose cleaned amount violates the hard min/max limits."""
        amount_col = "transaction_amount_cleaned"
        if amount_col not in df.columns:
            return []

        thresholds = self.outlier_thresholds["transaction_amount"]

        outliers = []
        outliers.extend(df[df[amount_col] < thresholds["min"]].index.tolist())
        outliers.extend(df[df[amount_col] > thresholds["max"]].index.tolist())

        return list(set(outliers))

    def _detect_date_outliers(self, df: pd.DataFrame) -> List[int]:
        """Return indices with future dates or dates before 1990.

        Converts the cleaned date column to datetime in place so the
        comparisons below work on timestamps.
        """
        date_col = "transaction_date_cleaned"
        if date_col not in df.columns:
            return []

        df[date_col] = pd.to_datetime(df[date_col])

        outliers = []
        # Future dates cannot be real transactions.
        outliers.extend(df[df[date_col] > datetime.now()].index.tolist())
        # Dates before 1990 are treated as data errors.
        outliers.extend(df[df[date_col] < datetime(1990, 1, 1)].index.tolist())

        return list(set(outliers))

    def _detect_statistical_outliers(self, df: pd.DataFrame, column: str) -> List[int]:
        """Return indices whose |Z-score| in *column* exceeds the threshold.

        Uses ``pd.api.types.is_numeric_dtype`` for the dtype gate: the
        previous check compared the dtype against the abstract ``np.number``
        (never equal to a concrete dtype) plus only the "float64"/"int64"
        strings, silently skipping other numeric dtypes such as
        int32/float32.
        """
        if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
            return []

        mean_val = df[column].mean()
        std_val = df[column].std()

        # No variation (or a single row, where std is NaN): nothing to flag.
        if std_val == 0 or pd.isna(std_val):
            return []

        z_scores = np.abs((df[column] - mean_val) / std_val)
        threshold = self.outlier_thresholds.get(column, {}).get("z_score", 3.0)

        return df[z_scores > threshold].index.tolist()
386
+
387
+
388
class MissingValueHandler:
    """Handles missing values in trading data.

    Each known column has a fill strategy: statistical fills
    (median/mean/mode), directional fills (forward/backward), a constant
    ("unknown"), or dropping rows where the value is required ("drop").
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a handler.

        Args:
            config: Optional configuration mapping (not consumed by the
                built-in strategies; kept for forward compatibility).
        """
        self.config = config or {}
        # Column -> strategy. Columns absent from the frame are skipped.
        self.fill_strategies = {
            "transaction_amount_cleaned": "median",
            "transaction_date_cleaned": "forward_fill",
            "politician_name_cleaned": "drop",
            "transaction_type_cleaned": "mode",
            "ticker_cleaned": "drop",
            "asset_name_cleaned": "unknown",
        }

    def handle_missing_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Handle missing values according to the configured strategies.

        Operates on a copy of *df*. Results are written back with
        ``df[col] = ...`` and ``Series.ffill()/bfill()`` rather than
        ``fillna(..., inplace=True)`` / ``fillna(method=...)`` — the
        ``method`` parameter and chained in-place fills are deprecated
        (removed in pandas 3.0). Numeric strategies gate on
        ``pd.api.types.is_numeric_dtype`` instead of comparing the dtype to
        the abstract ``np.number`` (never equal) and two dtype strings.

        Returns:
            Tuple of (processed frame, info dict with original/final shapes,
            per-column missing and filled counts, and rows dropped). For
            "drop" columns, ``filled_counts`` records how many missing
            values disappeared because their rows were removed.
        """
        missing_info = {
            "original_shape": df.shape,
            "missing_counts": df.isnull().sum().to_dict(),
            "filled_counts": {},
            "dropped_rows": 0,
        }

        df_processed = df.copy()

        for column, strategy in self.fill_strategies.items():
            if column not in df_processed.columns:
                continue

            original_missing = int(df_processed[column].isnull().sum())
            col = df_processed[column]
            is_numeric = pd.api.types.is_numeric_dtype(col)

            if strategy == "median" and is_numeric:
                df_processed[column] = col.fillna(col.median())
            elif strategy == "mean" and is_numeric:
                df_processed[column] = col.fillna(col.mean())
            elif strategy == "mode":
                mode_val = col.mode()
                if not mode_val.empty:
                    df_processed[column] = col.fillna(mode_val[0])
            elif strategy == "forward_fill":
                df_processed[column] = col.ffill()
            elif strategy == "backward_fill":
                df_processed[column] = col.bfill()
            elif strategy == "unknown":
                df_processed[column] = col.fillna("unknown")
            elif strategy == "drop":
                # Rows missing a required column are removed entirely.
                rows_before = len(df_processed)
                df_processed = df_processed.dropna(subset=[column])
                missing_info["dropped_rows"] += rows_before - len(df_processed)

            new_missing = int(df_processed[column].isnull().sum())
            missing_info["filled_counts"][column] = original_missing - new_missing

        missing_info["final_shape"] = df_processed.shape
        missing_info["final_missing_counts"] = df_processed.isnull().sum().to_dict()

        return df_processed, missing_info