mcli_framework-7.1.0-py3-none-any.whl → mcli_framework-7.1.2-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/app/completion_cmd.py +59 -49
- mcli/app/completion_helpers.py +60 -138
- mcli/app/logs_cmd.py +46 -13
- mcli/app/main.py +17 -14
- mcli/app/model_cmd.py +19 -4
- mcli/chat/chat.py +3 -2
- mcli/lib/search/cached_vectorizer.py +1 -0
- mcli/lib/services/data_pipeline.py +12 -5
- mcli/lib/services/lsh_client.py +69 -58
- mcli/ml/api/app.py +28 -36
- mcli/ml/api/middleware.py +8 -16
- mcli/ml/api/routers/admin_router.py +3 -1
- mcli/ml/api/routers/auth_router.py +32 -56
- mcli/ml/api/routers/backtest_router.py +3 -1
- mcli/ml/api/routers/data_router.py +3 -1
- mcli/ml/api/routers/model_router.py +35 -74
- mcli/ml/api/routers/monitoring_router.py +3 -1
- mcli/ml/api/routers/portfolio_router.py +3 -1
- mcli/ml/api/routers/prediction_router.py +60 -65
- mcli/ml/api/routers/trade_router.py +6 -2
- mcli/ml/api/routers/websocket_router.py +12 -9
- mcli/ml/api/schemas.py +10 -2
- mcli/ml/auth/auth_manager.py +49 -114
- mcli/ml/auth/models.py +30 -15
- mcli/ml/auth/permissions.py +12 -19
- mcli/ml/backtesting/backtest_engine.py +134 -108
- mcli/ml/backtesting/performance_metrics.py +142 -108
- mcli/ml/cache.py +12 -18
- mcli/ml/cli/main.py +37 -23
- mcli/ml/config/settings.py +29 -12
- mcli/ml/dashboard/app.py +122 -130
- mcli/ml/dashboard/app_integrated.py +283 -152
- mcli/ml/dashboard/app_supabase.py +176 -108
- mcli/ml/dashboard/app_training.py +212 -206
- mcli/ml/dashboard/cli.py +14 -5
- mcli/ml/data_ingestion/api_connectors.py +51 -81
- mcli/ml/data_ingestion/data_pipeline.py +127 -125
- mcli/ml/data_ingestion/stream_processor.py +72 -80
- mcli/ml/database/migrations/env.py +3 -2
- mcli/ml/database/models.py +112 -79
- mcli/ml/database/session.py +6 -5
- mcli/ml/experimentation/ab_testing.py +149 -99
- mcli/ml/features/ensemble_features.py +9 -8
- mcli/ml/features/political_features.py +6 -5
- mcli/ml/features/recommendation_engine.py +15 -14
- mcli/ml/features/stock_features.py +7 -6
- mcli/ml/features/test_feature_engineering.py +8 -7
- mcli/ml/logging.py +10 -15
- mcli/ml/mlops/data_versioning.py +57 -64
- mcli/ml/mlops/experiment_tracker.py +49 -41
- mcli/ml/mlops/model_serving.py +59 -62
- mcli/ml/mlops/pipeline_orchestrator.py +203 -149
- mcli/ml/models/base_models.py +8 -7
- mcli/ml/models/ensemble_models.py +6 -5
- mcli/ml/models/recommendation_models.py +7 -6
- mcli/ml/models/test_models.py +18 -14
- mcli/ml/monitoring/drift_detection.py +95 -74
- mcli/ml/monitoring/metrics.py +10 -22
- mcli/ml/optimization/portfolio_optimizer.py +172 -132
- mcli/ml/predictions/prediction_engine.py +235 -0
- mcli/ml/preprocessing/data_cleaners.py +6 -5
- mcli/ml/preprocessing/feature_extractors.py +7 -6
- mcli/ml/preprocessing/ml_pipeline.py +3 -2
- mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
- mcli/ml/preprocessing/test_preprocessing.py +4 -4
- mcli/ml/scripts/populate_sample_data.py +36 -16
- mcli/ml/tasks.py +82 -83
- mcli/ml/tests/test_integration.py +86 -76
- mcli/ml/tests/test_training_dashboard.py +169 -142
- mcli/mygroup/test_cmd.py +2 -1
- mcli/self/self_cmd.py +38 -18
- mcli/self/test_cmd.py +2 -1
- mcli/workflow/dashboard/dashboard_cmd.py +13 -6
- mcli/workflow/lsh_integration.py +46 -58
- mcli/workflow/politician_trading/commands.py +576 -427
- mcli/workflow/politician_trading/config.py +7 -7
- mcli/workflow/politician_trading/connectivity.py +35 -33
- mcli/workflow/politician_trading/data_sources.py +72 -71
- mcli/workflow/politician_trading/database.py +18 -16
- mcli/workflow/politician_trading/demo.py +4 -3
- mcli/workflow/politician_trading/models.py +5 -5
- mcli/workflow/politician_trading/monitoring.py +13 -13
- mcli/workflow/politician_trading/scrapers.py +332 -224
- mcli/workflow/politician_trading/scrapers_california.py +116 -94
- mcli/workflow/politician_trading/scrapers_eu.py +70 -71
- mcli/workflow/politician_trading/scrapers_uk.py +118 -90
- mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
- mcli/workflow/politician_trading/workflow.py +98 -71
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +2 -2
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -93
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
mcli/ml/data_ingestion/data_pipeline.py

@@ -1,25 +1,27 @@
 """Complete data ingestion pipeline with validation and transformation"""

 import asyncio
-
+import json
+import logging
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
-import pandas as pd
-import numpy as np
-import logging
 from pathlib import Path
-import
-
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import pandas as pd

 from .api_connectors import (
-    CongressionalDataAPI,
-    YahooFinanceConnector,
     AlphaVantageConnector,
+    CongressionalDataAPI,
+    DataAggregator,
     PolygonIOConnector,
     QuiverQuantConnector,
-
+    YahooFinanceConnector,
 )
-from .stream_processor import
+from .stream_processor import DataAggregator as StreamAggregator
+from .stream_processor import StreamConfig, StreamProcessor

 logger = logging.getLogger(__name__)

@@ -27,6 +29,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class PipelineConfig:
     """Data pipeline configuration"""
+
     data_dir: Path = Path("data")
     batch_size: int = 1000
     enable_streaming: bool = True
@@ -43,20 +46,15 @@ class DataValidator:

     def __init__(self):
         self.validation_rules = {
-
-
-
-        }
-        self.validation_stats = {
-            'total': 0,
-            'valid': 0,
-            'invalid': 0,
-            'errors': []
+            "politician_trades": self._validate_politician_trade,
+            "stock_quotes": self._validate_stock_quote,
+            "market_data": self._validate_market_data,
         }
+        self.validation_stats = {"total": 0, "valid": 0, "invalid": 0, "errors": []}

     def validate(self, data: Dict[str, Any], data_type: str) -> bool:
         """Validate data based on type"""
-        self.validation_stats[
+        self.validation_stats["total"] += 1

         if data_type not in self.validation_rules:
             logger.warning(f"Unknown data type: {data_type}")
@@ -65,19 +63,19 @@ class DataValidator:
         try:
             is_valid = self.validation_rules[data_type](data)
             if is_valid:
-                self.validation_stats[
+                self.validation_stats["valid"] += 1
             else:
-                self.validation_stats[
+                self.validation_stats["invalid"] += 1
             return is_valid
         except Exception as e:
-            self.validation_stats[
-            self.validation_stats[
+            self.validation_stats["invalid"] += 1
+            self.validation_stats["errors"].append(str(e))
             logger.error(f"Validation error: {e}")
             return False

     def _validate_politician_trade(self, data: Dict[str, Any]) -> bool:
         """Validate politician trading data"""
-        required_fields = [
+        required_fields = ["politician", "ticker", "transaction_type", "amount", "transaction_date"]

         # Check required fields
         for field in required_fields:
@@ -86,19 +84,19 @@ class DataValidator:
                 return False

         # Validate transaction type
-        if data[
+        if data["transaction_type"] not in ["buy", "sell", "exchange"]:
             logger.warning(f"Invalid transaction type: {data['transaction_type']}")
             return False

         # Validate amount
-        if not isinstance(data[
+        if not isinstance(data["amount"], (int, float)) or data["amount"] <= 0:
             logger.warning(f"Invalid amount: {data['amount']}")
             return False

         # Validate date
         try:
-            if isinstance(data[
-                datetime.fromisoformat(data[
+            if isinstance(data["transaction_date"], str):
+                datetime.fromisoformat(data["transaction_date"])
         except:
             logger.warning(f"Invalid date format: {data['transaction_date']}")
             return False
@@ -107,34 +105,34 @@ class DataValidator:

     def _validate_stock_quote(self, data: Dict[str, Any]) -> bool:
         """Validate stock quote data"""
-        required_fields = [
+        required_fields = ["symbol", "price", "timestamp"]

         for field in required_fields:
             if field not in data:
                 return False

         # Validate price
-        if not isinstance(data[
+        if not isinstance(data["price"], (int, float)) or data["price"] <= 0:
             return False

         return True

     def _validate_market_data(self, data: Dict[str, Any]) -> bool:
         """Validate market data"""
-        required_fields = [
+        required_fields = ["symbol", "close", "volume", "date"]

         for field in required_fields:
             if field not in data:
                 return False

         # Validate prices
-        for price_field in [
+        for price_field in ["close", "open", "high", "low"]:
             if price_field in data:
                 if not isinstance(data[price_field], (int, float)) or data[price_field] <= 0:
                     return False

         # Validate volume
-        if not isinstance(data[
+        if not isinstance(data["volume"], (int, float)) or data["volume"] < 0:
             return False

         return True
@@ -149,13 +147,14 @@ class DataTransformer:

     def __init__(self):
         self.transformers = {
-
-
-
+            "politician_trades": self._transform_politician_trade,
+            "stock_quotes": self._transform_stock_quote,
+            "market_data": self._transform_market_data,
         }

-    def transform(
-
+    def transform(
+        self, data: Union[Dict[str, Any], List[Dict[str, Any]]], data_type: str
+    ) -> Union[Dict[str, Any], pd.DataFrame]:
         """Transform data based on type"""
         if data_type not in self.transformers:
             return data
@@ -171,30 +170,30 @@ class DataTransformer:
         transformed = data.copy()

         # Standardize politician name
-        transformed[
+        transformed["politician_normalized"] = self._normalize_name(data.get("politician", ""))

         # Convert dates to datetime
-        if
-            transformed[
+        if "transaction_date" in data:
+            transformed["transaction_date"] = pd.to_datetime(data["transaction_date"])

-        if
-            transformed[
+        if "disclosure_date" in data:
+            transformed["disclosure_date"] = pd.to_datetime(data["disclosure_date"])

         # Calculate disclosure delay
-        if
-            delay = (transformed[
-            transformed[
+        if "transaction_date" in transformed:
+            delay = (transformed["disclosure_date"] - transformed["transaction_date"]).days
+            transformed["disclosure_delay_days"] = max(0, delay)

         # Normalize ticker
-        transformed[
+        transformed["ticker"] = data.get("ticker", "").upper()

         # Categorize transaction amount
-        amount = data.get(
-        transformed[
+        amount = data.get("amount", 0)
+        transformed["amount_category"] = self._categorize_amount(amount)

         # Add derived features
-        transformed[
-        transformed[
+        transformed["is_purchase"] = data.get("transaction_type") == "buy"
+        transformed["is_sale"] = data.get("transaction_type") == "sell"

         return transformed

@@ -203,19 +202,19 @@ class DataTransformer:
         transformed = data.copy()

         # Normalize symbol
-        transformed[
+        transformed["symbol"] = data.get("symbol", "").upper()

         # Convert timestamp
-        if
-            if isinstance(data[
-                transformed[
+        if "timestamp" in data:
+            if isinstance(data["timestamp"], (int, float)):
+                transformed["timestamp"] = datetime.fromtimestamp(data["timestamp"])
             else:
-                transformed[
+                transformed["timestamp"] = pd.to_datetime(data["timestamp"])

         # Calculate spread if bid/ask available
-        if
-            transformed[
-            transformed[
+        if "bid" in data and "ask" in data:
+            transformed["spread"] = data["ask"] - data["bid"]
+            transformed["spread_pct"] = (transformed["spread"] / data["ask"]) * 100

         return transformed

@@ -224,34 +223,34 @@ class DataTransformer:
         transformed = data.copy()

         # Normalize symbol
-        transformed[
+        transformed["symbol"] = data.get("symbol", "").upper()

         # Convert date
-        if
-            transformed[
+        if "date" in data:
+            transformed["date"] = pd.to_datetime(data["date"])

         # Calculate OHLC metrics
-        if all(k in data for k in [
-            transformed[
-            transformed[
-            transformed[
+        if all(k in data for k in ["open", "high", "low", "close"]):
+            transformed["daily_range"] = data["high"] - data["low"]
+            transformed["daily_return"] = (data["close"] - data["open"]) / data["open"]
+            transformed["intraday_volatility"] = transformed["daily_range"] / data["close"]

         # Calculate volume metrics
-        if
-            transformed[
+        if "volume" in data and "close" in data:
+            transformed["dollar_volume"] = data["volume"] * data["close"]

         return transformed

     def _normalize_name(self, name: str) -> str:
         """Normalize politician name"""
         # Remove titles
-        titles = [
+        titles = ["Sen.", "Senator", "Rep.", "Representative", "Hon.", "Dr.", "Mr.", "Mrs.", "Ms."]
         normalized = name
         for title in titles:
-            normalized = normalized.replace(title,
+            normalized = normalized.replace(title, "")

         # Clean and standardize
-        normalized =
+        normalized = " ".join(normalized.split())  # Remove extra spaces
         normalized = normalized.strip()

         return normalized
@@ -259,17 +258,17 @@ class DataTransformer:
     def _categorize_amount(self, amount: float) -> str:
         """Categorize transaction amount"""
         if amount < 1000:
-            return
+            return "micro"
         elif amount < 15000:
-            return
+            return "small"
         elif amount < 50000:
-            return
+            return "medium"
         elif amount < 250000:
-            return
+            return "large"
         elif amount < 1000000:
-            return
+            return "very_large"
         else:
-            return
+            return "mega"


 class DataLoader:
@@ -279,8 +278,9 @@ class DataLoader:
         self.data_dir = data_dir
         self.data_dir.mkdir(parents=True, exist_ok=True)

-    async def save_batch(
-
+    async def save_batch(
+        self, data: pd.DataFrame, data_type: str, timestamp: Optional[datetime] = None
+    ):
         """Save batch of data"""
         if timestamp is None:
             timestamp = datetime.now()
@@ -294,13 +294,14 @@ class DataLoader:
         filepath = type_dir / filename

         # Save as parquet
-        data.to_parquet(filepath, compression=
+        data.to_parquet(filepath, compression="snappy")
         logger.info(f"Saved {len(data)} records to {filepath}")

         return filepath

-    async def save_json(
-
+    async def save_json(
+        self, data: Union[Dict, List], data_type: str, timestamp: Optional[datetime] = None
+    ):
         """Save data as JSON"""
         if timestamp is None:
             timestamp = datetime.now()
@@ -314,7 +315,7 @@ class DataLoader:
         filepath = type_dir / filename

         # Save JSON
-        with open(filepath,
+        with open(filepath, "w") as f:
             json.dump(data, f, indent=2, default=str)

         logger.info(f"Saved JSON to {filepath}")
@@ -357,13 +358,13 @@ class IngestionPipeline:

         # Pipeline metrics
         self.metrics = {
-
-
-
-
-
-
-
+            "records_processed": 0,
+            "records_validated": 0,
+            "records_transformed": 0,
+            "records_saved": 0,
+            "errors": 0,
+            "start_time": None,
+            "last_update": None,
         }

     def add_source(self, name: str, connector):
@@ -375,11 +376,11 @@ class IngestionPipeline:
         """Initialize all data sources"""
         # Congressional data
         congress_api = CongressionalDataAPI()
-        self.add_source(
+        self.add_source("congress", congress_api)

         # Stock data sources
         yahoo = YahooFinanceConnector()
-        self.add_source(
+        self.add_source("yahoo", yahoo)

         # Add more sources as needed
         logger.info(f"Initialized {len(self.sources)} data sources")
@@ -392,17 +393,17 @@ class IngestionPipeline:
             # Validate
             if self.config.enable_validation:
                 if not self.validator.validate(record, data_type):
-                    self.metrics[
+                    self.metrics["errors"] += 1
                     continue
-                self.metrics[
+                self.metrics["records_validated"] += 1

             # Transform
             if self.config.enable_transformation:
                 record = self.transformer.transform(record, data_type)
-                self.metrics[
+                self.metrics["records_transformed"] += 1

             processed_data.append(record)
-            self.metrics[
+            self.metrics["records_processed"] += 1

         # Convert to DataFrame
         if processed_data:
@@ -410,7 +411,7 @@ class IngestionPipeline:

             # Save to storage
             await self.loader.save_batch(df, data_type)
-            self.metrics[
+            self.metrics["records_saved"] += len(df)

             return df

@@ -418,7 +419,7 @@ class IngestionPipeline:

     async def fetch_politician_trades(self, days: int = 30) -> pd.DataFrame:
         """Fetch recent politician trades"""
-        congress_api = self.sources.get(
+        congress_api = self.sources.get("congress")
         if not congress_api:
             logger.error("Congressional data source not available")
             return pd.DataFrame()
@@ -427,28 +428,30 @@ class IngestionPipeline:
         trades = await congress_api.fetch_recent_trades(days=days)

         # Process through pipeline
-        df = await self.process_batch(trades,
+        df = await self.process_batch(trades, "politician_trades")

         logger.info(f"Fetched {len(df)} politician trades")
         return df

-    async def fetch_stock_data(
+    async def fetch_stock_data(
+        self, tickers: List[str], period: str = "1mo"
+    ) -> Dict[str, pd.DataFrame]:
         """Fetch stock data for multiple tickers"""
         stock_data = {}

         for ticker in tickers:
             # Try Yahoo Finance first
-            yahoo = self.sources.get(
+            yahoo = self.sources.get("yahoo")
             if yahoo:
                 try:
                     df = await yahoo.fetch_historical(ticker, period)
                     if not df.empty:
                         # Process through pipeline
-                        records = df.to_dict(
+                        records = df.to_dict("records")
                         for record in records:
-                            record[
+                            record["symbol"] = ticker

-                        processed = await self.process_batch(records,
+                        processed = await self.process_batch(records, "market_data")
                         stock_data[ticker] = processed
                 except Exception as e:
                     logger.error(f"Failed to fetch {ticker}: {e}")
@@ -463,16 +466,14 @@ class IngestionPipeline:

         # Initialize stream processor
         stream_config = StreamConfig(
-            buffer_size=self.config.batch_size,
-            batch_size=100,
-            flush_interval=5
+            buffer_size=self.config.batch_size, batch_size=100, flush_interval=5
         )

         self.stream_processor = StreamProcessor(stream_config)

         # Add processor for pipeline
         async def pipeline_processor(batch):
-            await self.process_batch(batch,
+            await self.process_batch(batch, "streaming_data")

         self.stream_processor.add_processor(pipeline_processor)

@@ -484,21 +485,21 @@ class IngestionPipeline:
         if self.stream_processor:
             await self.stream_processor.stop()

-    async def run(self, mode: str =
+    async def run(self, mode: str = "batch"):
         """Run ingestion pipeline"""
-        self.metrics[
+        self.metrics["start_time"] = datetime.now()

         try:
             # Initialize sources
             await self.initialize_sources()

-            if mode ==
+            if mode == "batch":
                 # Batch processing
                 await self.run_batch()
-            elif mode ==
+            elif mode == "streaming":
                 # Streaming mode
                 await self.start_streaming()
-            elif mode ==
+            elif mode == "hybrid":
                 # Both batch and streaming
                 batch_task = asyncio.create_task(self.run_batch())
                 stream_task = asyncio.create_task(self.start_streaming())
@@ -506,10 +507,10 @@ class IngestionPipeline:

         except Exception as e:
             logger.error(f"Pipeline error: {e}")
-            self.metrics[
+            self.metrics["errors"] += 1
             raise
         finally:
-            self.metrics[
+            self.metrics["last_update"] = datetime.now()

     async def run_batch(self):
         """Run batch processing"""
@@ -519,8 +520,8 @@ class IngestionPipeline:
         trades_df = await self.fetch_politician_trades(days=30)

         # Extract unique tickers
-        if not trades_df.empty and
-            tickers = trades_df[
+        if not trades_df.empty and "ticker" in trades_df.columns:
+            tickers = trades_df["ticker"].unique().tolist()

             # Fetch stock data for those tickers
             stock_data = await self.fetch_stock_data(tickers[:20])  # Limit to 20 for demo
@@ -532,36 +533,37 @@ class IngestionPipeline:
         metrics = self.metrics.copy()

         # Calculate throughput
-        if metrics[
-            elapsed = (datetime.now() - metrics[
+        if metrics["start_time"]:
+            elapsed = (datetime.now() - metrics["start_time"]).total_seconds()
             if elapsed > 0:
-                metrics[
+                metrics["throughput"] = metrics["records_processed"] / elapsed

         # Add validation stats
-        metrics[
+        metrics["validation_stats"] = self.validator.get_stats()

         return metrics


 # Example usage
 if __name__ == "__main__":
+
     async def main():
         # Configure pipeline
         config = PipelineConfig(
             data_dir=Path("data/ingestion"),
             enable_streaming=False,  # Batch mode for testing
             enable_validation=True,
-            enable_transformation=True
+            enable_transformation=True,
         )

         # Create pipeline
         pipeline = IngestionPipeline(config)

         # Run batch processing
-        await pipeline.run(mode=
+        await pipeline.run(mode="batch")

         # Get metrics
         metrics = pipeline.get_metrics()
         print(f"Pipeline metrics: {json.dumps(metrics, indent=2, default=str)}")

-    asyncio.run(main())
+    asyncio.run(main())