mcli_framework-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/preprocessing/data_cleaners.py
@@ -0,0 +1,451 @@
"""Data cleaning utilities for ML preprocessing"""

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass
import re
import logging

logger = logging.getLogger(__name__)


@dataclass
class CleaningStats:
    """Statistics about data cleaning operations"""

    total_records: int
    cleaned_records: int
    removed_records: int
    cleaning_operations: Dict[str, int]
    outliers_detected: int
    missing_values_filled: int


class TradingDataCleaner:
    """Cleans and standardizes politician trading data for ML"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.cleaning_stats = CleaningStats(
            total_records=0,
            cleaned_records=0,
            removed_records=0,
            cleaning_operations={},
            outliers_detected=0,
            missing_values_filled=0,
        )

    def clean_trading_records(
        self, records: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], CleaningStats]:
        """Clean a batch of trading records"""
        self.cleaning_stats.total_records = len(records)
        cleaned_records = []

        for record in records:
            cleaned_record = self._clean_single_record(record)
            if cleaned_record is not None:
                cleaned_records.append(cleaned_record)
                self.cleaning_stats.cleaned_records += 1
            else:
                self.cleaning_stats.removed_records += 1

        return cleaned_records, self.cleaning_stats

    def _clean_single_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Clean a single trading record"""
        try:
            cleaned = record.copy()

            # Clean politician name
            cleaned = self._clean_politician_name(cleaned)

            # Clean transaction amount
            cleaned = self._clean_transaction_amount(cleaned)

            # Clean transaction date
            cleaned = self._clean_transaction_date(cleaned)

            # Clean asset name/ticker
            cleaned = self._clean_asset_info(cleaned)

            # Clean transaction type
            cleaned = self._clean_transaction_type(cleaned)

            # Validate required fields exist
            if not self._validate_required_fields(cleaned):
                return None

            return cleaned

        except Exception as e:
            logger.warning(f"Failed to clean record: {e}")
            return None

    def _clean_politician_name(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize politician names"""
        name_fields = ["politician_name", "name", "representative_name", "senator_name"]

        for field in name_fields:
            if field in record and record[field]:
                name = str(record[field]).strip()

                # Remove titles and suffixes
                name = re.sub(
                    r"\b(Hon\.|Dr\.|Mr\.|Mrs\.|Ms\.|Sen\.|Rep\.)\s+", "", name, flags=re.IGNORECASE
                )
                name = re.sub(r"\s+(Jr\.?|Sr\.?|III|IV|II)$", "", name, flags=re.IGNORECASE)

                # Title case
                name = name.title()

                # Handle special cases
                name = re.sub(r"\bMc([a-z])", r"Mc\1", name)
                name = re.sub(r"\bO\'([a-z])", r"O'\1", name)

                record["politician_name_cleaned"] = name
                self._increment_cleaning_operation("politician_name_cleaned")
                break

        return record

    def _clean_transaction_amount(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize transaction amounts"""
        amount_fields = ["transaction_amount", "amount", "value", "transaction_value"]

        for field in amount_fields:
            if field in record and record[field] is not None:
                amount_str = str(record[field]).strip()

                # Remove currency symbols and commas
                amount_str = re.sub(r"[$,\s]", "", amount_str)

                # Handle ranges (take midpoint)
                if " - " in amount_str or " to " in amount_str:
                    range_parts = re.split(r"\s*(?:-|to)\s*", amount_str)
                    if len(range_parts) == 2:
                        try:
                            min_val = float(re.sub(r"[^\d.]", "", range_parts[0]))
                            max_val = float(re.sub(r"[^\d.]", "", range_parts[1]))
                            amount_str = str((min_val + max_val) / 2)
                            self._increment_cleaning_operation("amount_range_midpoint")
                        except ValueError:
                            continue

                # Convert to float
                try:
                    amount = float(amount_str)
                    if amount >= 0:  # Only positive amounts
                        record["transaction_amount_cleaned"] = amount
                        self._increment_cleaning_operation("transaction_amount_cleaned")
                        break
                except ValueError:
                    continue

        return record

    def _clean_transaction_date(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize transaction dates"""
        date_fields = ["transaction_date", "date", "trade_date", "disclosure_date"]

        for field in date_fields:
            if field in record and record[field]:
                date_str = str(record[field]).strip()

                # Try multiple date formats
                date_formats = [
                    "%Y-%m-%d",
                    "%m/%d/%Y",
                    "%m-%d-%Y",
                    "%Y/%m/%d",
                    "%B %d, %Y",
                    "%b %d, %Y",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%dT%H:%M:%S.%f",
                ]

                for fmt in date_formats:
                    try:
                        date_obj = datetime.strptime(date_str, fmt)
                        record["transaction_date_cleaned"] = date_obj.strftime("%Y-%m-%d")
                        self._increment_cleaning_operation("transaction_date_cleaned")
                        break
                    except ValueError:
                        continue
                else:
                    # Try pandas parsing as fallback
                    try:
                        import pandas as pd

                        date_obj = pd.to_datetime(date_str)
                        record["transaction_date_cleaned"] = date_obj.strftime("%Y-%m-%d")
                        self._increment_cleaning_operation("transaction_date_cleaned")
                    except:
                        continue

                if "transaction_date_cleaned" in record:
                    break

        return record

    def _clean_asset_info(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize asset information"""
        asset_fields = ["asset_name", "stock_symbol", "ticker", "security_name"]

        # Clean ticker/symbol
        for field in ["stock_symbol", "ticker", "symbol"]:
            if field in record and record[field]:
                ticker = str(record[field]).strip().upper()

                # Remove common prefixes/suffixes
                ticker = re.sub(r"\s*(NYSE:|NASDAQ:|AMEX:)\s*", "", ticker)
                ticker = re.sub(r"\s*\(.*\)\s*", "", ticker)

                # Validate ticker format (letters and numbers only, 1-5 chars typically)
                if re.match(r"^[A-Z0-9]{1,10}$", ticker):
                    record["ticker_cleaned"] = ticker
                    self._increment_cleaning_operation("ticker_cleaned")
                    break

        # Clean asset name
        for field in ["asset_name", "security_name", "company_name"]:
            if field in record and record[field]:
                name = str(record[field]).strip()

                # Remove common suffixes
                name = re.sub(
                    r"\s+(Inc\.?|Corp\.?|Co\.?|Ltd\.?|LLC|LP)$", "", name, flags=re.IGNORECASE
                )

                # Title case
                name = name.title()

                record["asset_name_cleaned"] = name
                self._increment_cleaning_operation("asset_name_cleaned")
                break

        return record

    def _clean_transaction_type(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Clean and standardize transaction types"""
        type_fields = ["transaction_type", "type", "action", "trade_type"]

        for field in type_fields:
            if field in record and record[field]:
                transaction_type = str(record[field]).strip().lower()

                # Standardize transaction types
                if any(word in transaction_type for word in ["buy", "purchase", "acquired"]):
                    standardized_type = "buy"
                elif any(word in transaction_type for word in ["sell", "sale", "sold", "disposed"]):
                    standardized_type = "sell"
                elif any(word in transaction_type for word in ["exchange", "swap"]):
                    standardized_type = "exchange"
                else:
                    standardized_type = "other"

                record["transaction_type_cleaned"] = standardized_type
                self._increment_cleaning_operation("transaction_type_cleaned")
                break

        return record

    def _validate_required_fields(self, record: Dict[str, Any]) -> bool:
        """Validate that required fields exist after cleaning"""
        required_fields = [
            "politician_name_cleaned",
            "transaction_date_cleaned",
            "transaction_type_cleaned",
        ]

        # At least one amount or asset field should exist
        amount_or_asset = any(
            field in record
            for field in ["transaction_amount_cleaned", "ticker_cleaned", "asset_name_cleaned"]
        )

        has_required = all(field in record for field in required_fields)

        return has_required and amount_or_asset

    def _increment_cleaning_operation(self, operation: str):
        """Track cleaning operations"""
        if operation not in self.cleaning_stats.cleaning_operations:
            self.cleaning_stats.cleaning_operations[operation] = 0
        self.cleaning_stats.cleaning_operations[operation] += 1


class OutlierDetector:
    """Detects and handles outliers in trading data"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.outlier_thresholds = {
            "transaction_amount": {
                "min": 1,  # Minimum $1
                "max": 50_000_000,  # Maximum $50M
                "z_score": 3.0,
            },
            "days_to_disclosure": {
                "min": 0,
                "max": 365,  # More than 1 year is suspicious
                "z_score": 3.0,
            },
        }

    def detect_outliers(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Detect outliers in the dataset"""
        outlier_info = {"total_outliers": 0, "outliers_by_field": {}, "outlier_indices": set()}

        # Amount-based outliers
        if "transaction_amount_cleaned" in df.columns:
            amount_outliers = self._detect_amount_outliers(df)
            outlier_info["outliers_by_field"]["amount"] = len(amount_outliers)
            outlier_info["outlier_indices"].update(amount_outliers)

        # Date-based outliers
        if "transaction_date_cleaned" in df.columns:
            date_outliers = self._detect_date_outliers(df)
            outlier_info["outliers_by_field"]["date"] = len(date_outliers)
            outlier_info["outlier_indices"].update(date_outliers)

        # Statistical outliers
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            if col.endswith("_cleaned"):
                col_outliers = self._detect_statistical_outliers(df, col)
                outlier_info["outliers_by_field"][col] = len(col_outliers)
                outlier_info["outlier_indices"].update(col_outliers)

        outlier_info["total_outliers"] = len(outlier_info["outlier_indices"])

        # Mark outliers in dataframe
        df["is_outlier"] = df.index.isin(outlier_info["outlier_indices"])

        return df, outlier_info

    def _detect_amount_outliers(self, df: pd.DataFrame) -> List[int]:
        """Detect amount-based outliers"""
        outliers = []
        amount_col = "transaction_amount_cleaned"

        if amount_col not in df.columns:
            return outliers

        thresholds = self.outlier_thresholds["transaction_amount"]

        # Hard limits
        outliers.extend(df[df[amount_col] < thresholds["min"]].index.tolist())
        outliers.extend(df[df[amount_col] > thresholds["max"]].index.tolist())

        return list(set(outliers))

    def _detect_date_outliers(self, df: pd.DataFrame) -> List[int]:
        """Detect date-based outliers"""
        outliers = []
        date_col = "transaction_date_cleaned"

        if date_col not in df.columns:
            return outliers

        # Convert to datetime
        df[date_col] = pd.to_datetime(df[date_col])

        # Future dates
        future_dates = df[df[date_col] > datetime.now()].index.tolist()
        outliers.extend(future_dates)

        # Very old dates (before 1990)
        old_dates = df[df[date_col] < datetime(1990, 1, 1)].index.tolist()
        outliers.extend(old_dates)

        return list(set(outliers))

    def _detect_statistical_outliers(self, df: pd.DataFrame, column: str) -> List[int]:
        """Detect statistical outliers using Z-score"""
        outliers = []

        if column not in df.columns or df[column].dtype not in [np.number, "float64", "int64"]:
            return outliers

        # Calculate Z-scores
        mean_val = df[column].mean()
        std_val = df[column].std()

        if std_val == 0:  # No variation
            return outliers

        z_scores = np.abs((df[column] - mean_val) / std_val)
        threshold = self.outlier_thresholds.get(column, {}).get("z_score", 3.0)

        outliers = df[z_scores > threshold].index.tolist()

        return outliers


class MissingValueHandler:
    """Handles missing values in trading data"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.fill_strategies = {
            "transaction_amount_cleaned": "median",
            "transaction_date_cleaned": "forward_fill",
            "politician_name_cleaned": "drop",
            "transaction_type_cleaned": "mode",
            "ticker_cleaned": "drop",
            "asset_name_cleaned": "unknown",
        }

    def handle_missing_values(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Handle missing values according to strategies"""
        missing_info = {
            "original_shape": df.shape,
            "missing_counts": df.isnull().sum().to_dict(),
            "filled_counts": {},
            "dropped_rows": 0,
        }

        df_processed = df.copy()

        for column, strategy in self.fill_strategies.items():
            if column in df_processed.columns:
                original_missing = df_processed[column].isnull().sum()

                if strategy == "median" and df_processed[column].dtype in [
                    np.number,
                    "float64",
                    "int64",
                ]:
                    df_processed[column].fillna(df_processed[column].median(), inplace=True)
                elif strategy == "mean" and df_processed[column].dtype in [
                    np.number,
                    "float64",
                    "int64",
                ]:
                    df_processed[column].fillna(df_processed[column].mean(), inplace=True)
                elif strategy == "mode":
                    mode_val = df_processed[column].mode()
                    if not mode_val.empty:
                        df_processed[column].fillna(mode_val[0], inplace=True)
                elif strategy == "forward_fill":
                    df_processed[column].fillna(method="ffill", inplace=True)
                elif strategy == "backward_fill":
                    df_processed[column].fillna(method="bfill", inplace=True)
                elif strategy == "unknown":
                    df_processed[column].fillna("unknown", inplace=True)
                elif strategy == "drop":
                    # Drop rows with missing values in this column
                    rows_before = len(df_processed)
                    df_processed = df_processed.dropna(subset=[column])
                    missing_info["dropped_rows"] += rows_before - len(df_processed)

                new_missing = df_processed[column].isnull().sum()
                missing_info["filled_counts"][column] = original_missing - new_missing

        missing_info["final_shape"] = df_processed.shape
        missing_info["final_missing_counts"] = df_processed.isnull().sum().to_dict()

        return df_processed, missing_info
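
For orientation, the three classes in this file chain into a record-to-DataFrame pipeline: TradingDataCleaner normalizes raw scraped dicts, OutlierDetector flags suspect rows, and MissingValueHandler fills or drops what remains. The following is a minimal usage sketch based only on the signatures shown in this diff; the sample record and its field values are invented for illustration and are not taken from the package's own tests or docs.

# Minimal usage sketch (assumed workflow; the sample record is illustrative).
import pandas as pd

from mcli.ml.preprocessing.data_cleaners import (
    MissingValueHandler,
    OutlierDetector,
    TradingDataCleaner,
)

raw_records = [
    {
        "politician_name": "Hon. Jane Doe Jr.",
        "transaction_amount": "$15,001 - $50,000",
        "transaction_date": "03/15/2024",
        "ticker": "NYSE: ABC",
        "transaction_type": "Purchase",
    },
]

# Step 1: per-record cleaning; records missing required fields are dropped.
cleaner = TradingDataCleaner()
cleaned, stats = cleaner.clean_trading_records(raw_records)
print(f"kept {stats.cleaned_records} of {stats.total_records} records")

# Step 2: DataFrame-level passes over the *_cleaned columns.
df = pd.DataFrame(cleaned)
df, outlier_info = OutlierDetector().detect_outliers(df)
df, missing_info = MissingValueHandler().handle_missing_values(df)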