mcli_framework-7.0.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Note: this version of mcli-framework has been flagged as a potentially problematic release.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/data_ingestion/data_pipeline.py

@@ -0,0 +1,567 @@

"""Complete data ingestion pipeline with validation and transformation"""

import asyncio
from typing import Dict, Any, Optional, List, Union, Callable
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import logging
from pathlib import Path
import json
from abc import ABC, abstractmethod

from .api_connectors import (
    CongressionalDataAPI,
    YahooFinanceConnector,
    AlphaVantageConnector,
    PolygonIOConnector,
    QuiverQuantConnector,
    DataAggregator,
)
from .stream_processor import StreamProcessor, StreamConfig, DataAggregator as StreamAggregator

logger = logging.getLogger(__name__)


@dataclass
class PipelineConfig:
    """Data pipeline configuration"""
    data_dir: Path = Path("data")
    batch_size: int = 1000
    enable_streaming: bool = True
    enable_validation: bool = True
    enable_transformation: bool = True
    enable_caching: bool = True
    cache_ttl: int = 300  # seconds
    retry_count: int = 3
    retry_delay: int = 1


class DataValidator:
    """Validate incoming data"""

    def __init__(self):
        self.validation_rules = {
            'politician_trades': self._validate_politician_trade,
            'stock_quotes': self._validate_stock_quote,
            'market_data': self._validate_market_data
        }
        self.validation_stats = {
            'total': 0,
            'valid': 0,
            'invalid': 0,
            'errors': []
        }

    def validate(self, data: Dict[str, Any], data_type: str) -> bool:
        """Validate data based on type"""
        self.validation_stats['total'] += 1

        if data_type not in self.validation_rules:
            logger.warning(f"Unknown data type: {data_type}")
            return True

        try:
            is_valid = self.validation_rules[data_type](data)
            if is_valid:
                self.validation_stats['valid'] += 1
            else:
                self.validation_stats['invalid'] += 1
            return is_valid
        except Exception as e:
            self.validation_stats['invalid'] += 1
            self.validation_stats['errors'].append(str(e))
            logger.error(f"Validation error: {e}")
            return False

    def _validate_politician_trade(self, data: Dict[str, Any]) -> bool:
        """Validate politician trading data"""
        required_fields = ['politician', 'ticker', 'transaction_type', 'amount', 'transaction_date']

        # Check required fields
        for field in required_fields:
            if field not in data:
                logger.warning(f"Missing required field: {field}")
                return False

        # Validate transaction type
        if data['transaction_type'] not in ['buy', 'sell', 'exchange']:
            logger.warning(f"Invalid transaction type: {data['transaction_type']}")
            return False

        # Validate amount
        if not isinstance(data['amount'], (int, float)) or data['amount'] <= 0:
            logger.warning(f"Invalid amount: {data['amount']}")
            return False

        # Validate date (fromisoformat raises ValueError on a bad format)
        try:
            if isinstance(data['transaction_date'], str):
                datetime.fromisoformat(data['transaction_date'])
        except ValueError:
            logger.warning(f"Invalid date format: {data['transaction_date']}")
            return False

        return True

    def _validate_stock_quote(self, data: Dict[str, Any]) -> bool:
        """Validate stock quote data"""
        required_fields = ['symbol', 'price', 'timestamp']

        for field in required_fields:
            if field not in data:
                return False

        # Validate price
        if not isinstance(data['price'], (int, float)) or data['price'] <= 0:
            return False

        return True

    def _validate_market_data(self, data: Dict[str, Any]) -> bool:
        """Validate market data"""
        required_fields = ['symbol', 'close', 'volume', 'date']

        for field in required_fields:
            if field not in data:
                return False

        # Validate prices
        for price_field in ['close', 'open', 'high', 'low']:
            if price_field in data:
                if not isinstance(data[price_field], (int, float)) or data[price_field] <= 0:
                    return False

        # Validate volume
        if not isinstance(data['volume'], (int, float)) or data['volume'] < 0:
            return False

        return True

    def get_stats(self) -> Dict[str, Any]:
        """Get validation statistics"""
        return self.validation_stats.copy()


class DataTransformer:
    """Transform and normalize data"""

    def __init__(self):
        self.transformers = {
            'politician_trades': self._transform_politician_trade,
            'stock_quotes': self._transform_stock_quote,
            'market_data': self._transform_market_data
        }

    def transform(self, data: Union[Dict[str, Any], List[Dict[str, Any]]],
                  data_type: str) -> Union[Dict[str, Any], pd.DataFrame]:
        """Transform data based on type"""
        if data_type not in self.transformers:
            return data

        if isinstance(data, list):
            transformed = [self.transformers[data_type](item) for item in data]
            return pd.DataFrame(transformed)
        else:
            return self.transformers[data_type](data)

    def _transform_politician_trade(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform politician trading data"""
        transformed = data.copy()

        # Standardize politician name
        transformed['politician_normalized'] = self._normalize_name(data.get('politician', ''))

        # Convert dates to datetime
        if 'transaction_date' in data:
            transformed['transaction_date'] = pd.to_datetime(data['transaction_date'])

        if 'disclosure_date' in data:
            transformed['disclosure_date'] = pd.to_datetime(data['disclosure_date'])

        # Calculate disclosure delay (requires both dates to be present)
        if 'disclosure_date' in transformed and 'transaction_date' in transformed:
            delay = (transformed['disclosure_date'] - transformed['transaction_date']).days
            transformed['disclosure_delay_days'] = max(0, delay)

        # Normalize ticker
        transformed['ticker'] = data.get('ticker', '').upper()

        # Categorize transaction amount
        amount = data.get('amount', 0)
        transformed['amount_category'] = self._categorize_amount(amount)

        # Add derived features
        transformed['is_purchase'] = data.get('transaction_type') == 'buy'
        transformed['is_sale'] = data.get('transaction_type') == 'sell'

        return transformed

    def _transform_stock_quote(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform stock quote data"""
        transformed = data.copy()

        # Normalize symbol
        transformed['symbol'] = data.get('symbol', '').upper()

        # Convert timestamp
        if 'timestamp' in data:
            if isinstance(data['timestamp'], (int, float)):
                transformed['timestamp'] = datetime.fromtimestamp(data['timestamp'])
            else:
                transformed['timestamp'] = pd.to_datetime(data['timestamp'])

        # Calculate spread if bid/ask available
        if 'bid' in data and 'ask' in data:
            transformed['spread'] = data['ask'] - data['bid']
            transformed['spread_pct'] = (transformed['spread'] / data['ask']) * 100

        return transformed

    def _transform_market_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform market data"""
        transformed = data.copy()

        # Normalize symbol
        transformed['symbol'] = data.get('symbol', '').upper()

        # Convert date
        if 'date' in data:
            transformed['date'] = pd.to_datetime(data['date'])

        # Calculate OHLC metrics
        if all(k in data for k in ['open', 'high', 'low', 'close']):
            transformed['daily_range'] = data['high'] - data['low']
            transformed['daily_return'] = (data['close'] - data['open']) / data['open']
            transformed['intraday_volatility'] = transformed['daily_range'] / data['close']

        # Calculate volume metrics
        if 'volume' in data and 'close' in data:
            transformed['dollar_volume'] = data['volume'] * data['close']

        return transformed

    def _normalize_name(self, name: str) -> str:
        """Normalize politician name"""
        # Remove titles
        titles = ['Sen.', 'Senator', 'Rep.', 'Representative', 'Hon.', 'Dr.', 'Mr.', 'Mrs.', 'Ms.']
        normalized = name
        for title in titles:
            normalized = normalized.replace(title, '')

        # Clean and standardize
        normalized = ' '.join(normalized.split())  # Remove extra spaces
        normalized = normalized.strip()

        return normalized

    def _categorize_amount(self, amount: float) -> str:
        """Categorize transaction amount"""
        if amount < 1000:
            return 'micro'
        elif amount < 15000:
            return 'small'
        elif amount < 50000:
            return 'medium'
        elif amount < 250000:
            return 'large'
        elif amount < 1000000:
            return 'very_large'
        else:
            return 'mega'


class DataLoader:
    """Load data to storage"""

    def __init__(self, data_dir: Path):
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)

    async def save_batch(self, data: pd.DataFrame, data_type: str,
                         timestamp: Optional[datetime] = None):
        """Save batch of data"""
        if timestamp is None:
            timestamp = datetime.now()

        # Create subdirectory for data type
        type_dir = self.data_dir / data_type
        type_dir.mkdir(exist_ok=True)

        # Generate filename with timestamp
        filename = f"{data_type}_{timestamp.strftime('%Y%m%d_%H%M%S')}.parquet"
        filepath = type_dir / filename

        # Save as parquet
        data.to_parquet(filepath, compression='snappy')
        logger.info(f"Saved {len(data)} records to {filepath}")

        return filepath

    async def save_json(self, data: Union[Dict, List], data_type: str,
                        timestamp: Optional[datetime] = None):
        """Save data as JSON"""
        if timestamp is None:
            timestamp = datetime.now()

        # Create subdirectory
        type_dir = self.data_dir / data_type
        type_dir.mkdir(exist_ok=True)

        # Generate filename
        filename = f"{data_type}_{timestamp.strftime('%Y%m%d_%H%M%S')}.json"
        filepath = type_dir / filename

        # Save JSON
        with open(filepath, 'w') as f:
            json.dump(data, f, indent=2, default=str)

        logger.info(f"Saved JSON to {filepath}")
        return filepath

    def load_latest(self, data_type: str, n_files: int = 1) -> pd.DataFrame:
        """Load latest data files"""
        type_dir = self.data_dir / data_type

        if not type_dir.exists():
            return pd.DataFrame()

        # Find parquet files
        files = sorted(type_dir.glob("*.parquet"), key=lambda x: x.stat().st_mtime, reverse=True)

        if not files:
            return pd.DataFrame()

        # Load and concatenate
        dfs = []
        for file in files[:n_files]:
            df = pd.read_parquet(file)
            dfs.append(df)

        return pd.concat(dfs, ignore_index=True)


class IngestionPipeline:
    """Complete data ingestion pipeline"""

    def __init__(self, config: PipelineConfig):
        self.config = config
        self.validator = DataValidator()
        self.transformer = DataTransformer()
        self.loader = DataLoader(config.data_dir)

        # Initialize data sources
        self.sources = {}
        self.stream_processor = None

        # Pipeline metrics
        self.metrics = {
            'records_processed': 0,
            'records_validated': 0,
            'records_transformed': 0,
            'records_saved': 0,
            'errors': 0,
            'start_time': None,
            'last_update': None
        }

    def add_source(self, name: str, connector):
        """Add data source"""
        self.sources[name] = connector
        logger.info(f"Added data source: {name}")

    async def initialize_sources(self):
        """Initialize all data sources"""
        # Congressional data
        congress_api = CongressionalDataAPI()
        self.add_source('congress', congress_api)

        # Stock data sources
        yahoo = YahooFinanceConnector()
        self.add_source('yahoo', yahoo)

        # Add more sources as needed
        logger.info(f"Initialized {len(self.sources)} data sources")

    async def process_batch(self, data: List[Dict[str, Any]], data_type: str) -> pd.DataFrame:
        """Process batch of data through pipeline"""
        processed_data = []

        for record in data:
            # Validate
            if self.config.enable_validation:
                if not self.validator.validate(record, data_type):
                    self.metrics['errors'] += 1
                    continue
                self.metrics['records_validated'] += 1

            # Transform
            if self.config.enable_transformation:
                record = self.transformer.transform(record, data_type)
                self.metrics['records_transformed'] += 1

            processed_data.append(record)
            self.metrics['records_processed'] += 1

        # Convert to DataFrame
        if processed_data:
            df = pd.DataFrame(processed_data)

            # Save to storage
            await self.loader.save_batch(df, data_type)
            self.metrics['records_saved'] += len(df)

            return df

        return pd.DataFrame()

    async def fetch_politician_trades(self, days: int = 30) -> pd.DataFrame:
        """Fetch recent politician trades"""
        congress_api = self.sources.get('congress')
        if not congress_api:
            logger.error("Congressional data source not available")
            return pd.DataFrame()

        # Fetch trades
        trades = await congress_api.fetch_recent_trades(days=days)

        # Process through pipeline
        df = await self.process_batch(trades, 'politician_trades')

        logger.info(f"Fetched {len(df)} politician trades")
        return df

    async def fetch_stock_data(self, tickers: List[str], period: str = '1mo') -> Dict[str, pd.DataFrame]:
        """Fetch stock data for multiple tickers"""
        stock_data = {}

        for ticker in tickers:
            # Try Yahoo Finance first
            yahoo = self.sources.get('yahoo')
            if yahoo:
                try:
                    df = await yahoo.fetch_historical(ticker, period)
                    if not df.empty:
                        # Process through pipeline
                        records = df.to_dict('records')
                        for record in records:
                            record['symbol'] = ticker

                        processed = await self.process_batch(records, 'market_data')
                        stock_data[ticker] = processed
                except Exception as e:
                    logger.error(f"Failed to fetch {ticker}: {e}")

        return stock_data

    async def start_streaming(self):
        """Start real-time streaming"""
        if not self.config.enable_streaming:
            logger.info("Streaming disabled")
            return

        # Initialize stream processor
        stream_config = StreamConfig(
            buffer_size=self.config.batch_size,
            batch_size=100,
            flush_interval=5
        )

        self.stream_processor = StreamProcessor(stream_config)

        # Add processor for pipeline
        async def pipeline_processor(batch):
            await self.process_batch(batch, 'streaming_data')

        self.stream_processor.add_processor(pipeline_processor)

        # Start streaming
        await self.stream_processor.start()

    async def stop_streaming(self):
        """Stop streaming"""
        if self.stream_processor:
            await self.stream_processor.stop()

    async def run(self, mode: str = 'batch'):
        """Run ingestion pipeline"""
        self.metrics['start_time'] = datetime.now()

        try:
            # Initialize sources
            await self.initialize_sources()

            if mode == 'batch':
                # Batch processing
                await self.run_batch()
            elif mode == 'streaming':
                # Streaming mode
                await self.start_streaming()
            elif mode == 'hybrid':
                # Both batch and streaming
                batch_task = asyncio.create_task(self.run_batch())
                stream_task = asyncio.create_task(self.start_streaming())
                await asyncio.gather(batch_task, stream_task)

        except Exception as e:
            logger.error(f"Pipeline error: {e}")
            self.metrics['errors'] += 1
            raise
        finally:
            self.metrics['last_update'] = datetime.now()

    async def run_batch(self):
        """Run batch processing"""
        logger.info("Starting batch processing...")

        # Fetch politician trades
        trades_df = await self.fetch_politician_trades(days=30)

        # Extract unique tickers
        if not trades_df.empty and 'ticker' in trades_df.columns:
            tickers = trades_df['ticker'].unique().tolist()

            # Fetch stock data for those tickers
            stock_data = await self.fetch_stock_data(tickers[:20])  # Limit to 20 for demo

            logger.info(f"Processed {len(trades_df)} trades and {len(stock_data)} stocks")

    def get_metrics(self) -> Dict[str, Any]:
        """Get pipeline metrics"""
        metrics = self.metrics.copy()

        # Calculate throughput
        if metrics['start_time']:
            elapsed = (datetime.now() - metrics['start_time']).total_seconds()
            if elapsed > 0:
                metrics['throughput'] = metrics['records_processed'] / elapsed

        # Add validation stats
        metrics['validation_stats'] = self.validator.get_stats()

        return metrics


# Example usage
if __name__ == "__main__":
    async def main():
        # Configure pipeline
        config = PipelineConfig(
            data_dir=Path("data/ingestion"),
            enable_streaming=False,  # Batch mode for testing
            enable_validation=True,
            enable_transformation=True
        )

        # Create pipeline
        pipeline = IngestionPipeline(config)

        # Run batch processing
        await pipeline.run(mode='batch')

        # Get metrics
        metrics = pipeline.get_metrics()
        print(f"Pipeline metrics: {json.dumps(metrics, indent=2, default=str)}")

    asyncio.run(main())
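
For reference, the validator and transformer in this file can be exercised on their own, outside the full pipeline. The following is a minimal sketch, assuming the wheel is installed and the module is importable under the path shown in the file list above; the sample record and its field values are hypothetical.

from mcli.ml.data_ingestion.data_pipeline import DataValidator, DataTransformer

# Hypothetical trade record; field names follow the validator's required_fields.
trade = {
    "politician": "Sen. Jane Doe",
    "ticker": "aapl",
    "transaction_type": "buy",
    "amount": 25000,
    "transaction_date": "2024-01-15",
    "disclosure_date": "2024-02-01",
}

validator = DataValidator()
transformer = DataTransformer()

if validator.validate(trade, "politician_trades"):
    enriched = transformer.transform(trade, "politician_trades")
    # Expected enrichment: ticker upper-cased to "AAPL",
    # amount_category == "medium" (15k–50k band), disclosure_delay_days == 17.
    print(enriched["amount_category"], enriched["disclosure_delay_days"])

print(validator.get_stats())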