mcli-framework 7.1.0__py3-none-any.whl → 7.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic.

Files changed (94)
  1. mcli/app/completion_cmd.py +59 -49
  2. mcli/app/completion_helpers.py +60 -138
  3. mcli/app/logs_cmd.py +46 -13
  4. mcli/app/main.py +17 -14
  5. mcli/app/model_cmd.py +19 -4
  6. mcli/chat/chat.py +3 -2
  7. mcli/lib/search/cached_vectorizer.py +1 -0
  8. mcli/lib/services/data_pipeline.py +12 -5
  9. mcli/lib/services/lsh_client.py +69 -58
  10. mcli/ml/api/app.py +28 -36
  11. mcli/ml/api/middleware.py +8 -16
  12. mcli/ml/api/routers/admin_router.py +3 -1
  13. mcli/ml/api/routers/auth_router.py +32 -56
  14. mcli/ml/api/routers/backtest_router.py +3 -1
  15. mcli/ml/api/routers/data_router.py +3 -1
  16. mcli/ml/api/routers/model_router.py +35 -74
  17. mcli/ml/api/routers/monitoring_router.py +3 -1
  18. mcli/ml/api/routers/portfolio_router.py +3 -1
  19. mcli/ml/api/routers/prediction_router.py +60 -65
  20. mcli/ml/api/routers/trade_router.py +6 -2
  21. mcli/ml/api/routers/websocket_router.py +12 -9
  22. mcli/ml/api/schemas.py +10 -2
  23. mcli/ml/auth/auth_manager.py +49 -114
  24. mcli/ml/auth/models.py +30 -15
  25. mcli/ml/auth/permissions.py +12 -19
  26. mcli/ml/backtesting/backtest_engine.py +134 -108
  27. mcli/ml/backtesting/performance_metrics.py +142 -108
  28. mcli/ml/cache.py +12 -18
  29. mcli/ml/cli/main.py +37 -23
  30. mcli/ml/config/settings.py +29 -12
  31. mcli/ml/dashboard/app.py +122 -130
  32. mcli/ml/dashboard/app_integrated.py +283 -152
  33. mcli/ml/dashboard/app_supabase.py +176 -108
  34. mcli/ml/dashboard/app_training.py +212 -206
  35. mcli/ml/dashboard/cli.py +14 -5
  36. mcli/ml/data_ingestion/api_connectors.py +51 -81
  37. mcli/ml/data_ingestion/data_pipeline.py +127 -125
  38. mcli/ml/data_ingestion/stream_processor.py +72 -80
  39. mcli/ml/database/migrations/env.py +3 -2
  40. mcli/ml/database/models.py +112 -79
  41. mcli/ml/database/session.py +6 -5
  42. mcli/ml/experimentation/ab_testing.py +149 -99
  43. mcli/ml/features/ensemble_features.py +9 -8
  44. mcli/ml/features/political_features.py +6 -5
  45. mcli/ml/features/recommendation_engine.py +15 -14
  46. mcli/ml/features/stock_features.py +7 -6
  47. mcli/ml/features/test_feature_engineering.py +8 -7
  48. mcli/ml/logging.py +10 -15
  49. mcli/ml/mlops/data_versioning.py +57 -64
  50. mcli/ml/mlops/experiment_tracker.py +49 -41
  51. mcli/ml/mlops/model_serving.py +59 -62
  52. mcli/ml/mlops/pipeline_orchestrator.py +203 -149
  53. mcli/ml/models/base_models.py +8 -7
  54. mcli/ml/models/ensemble_models.py +6 -5
  55. mcli/ml/models/recommendation_models.py +7 -6
  56. mcli/ml/models/test_models.py +18 -14
  57. mcli/ml/monitoring/drift_detection.py +95 -74
  58. mcli/ml/monitoring/metrics.py +10 -22
  59. mcli/ml/optimization/portfolio_optimizer.py +172 -132
  60. mcli/ml/predictions/prediction_engine.py +235 -0
  61. mcli/ml/preprocessing/data_cleaners.py +6 -5
  62. mcli/ml/preprocessing/feature_extractors.py +7 -6
  63. mcli/ml/preprocessing/ml_pipeline.py +3 -2
  64. mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
  65. mcli/ml/preprocessing/test_preprocessing.py +4 -4
  66. mcli/ml/scripts/populate_sample_data.py +36 -16
  67. mcli/ml/tasks.py +82 -83
  68. mcli/ml/tests/test_integration.py +86 -76
  69. mcli/ml/tests/test_training_dashboard.py +169 -142
  70. mcli/mygroup/test_cmd.py +2 -1
  71. mcli/self/self_cmd.py +38 -18
  72. mcli/self/test_cmd.py +2 -1
  73. mcli/workflow/dashboard/dashboard_cmd.py +13 -6
  74. mcli/workflow/lsh_integration.py +46 -58
  75. mcli/workflow/politician_trading/commands.py +576 -427
  76. mcli/workflow/politician_trading/config.py +7 -7
  77. mcli/workflow/politician_trading/connectivity.py +35 -33
  78. mcli/workflow/politician_trading/data_sources.py +72 -71
  79. mcli/workflow/politician_trading/database.py +18 -16
  80. mcli/workflow/politician_trading/demo.py +4 -3
  81. mcli/workflow/politician_trading/models.py +5 -5
  82. mcli/workflow/politician_trading/monitoring.py +13 -13
  83. mcli/workflow/politician_trading/scrapers.py +332 -224
  84. mcli/workflow/politician_trading/scrapers_california.py +116 -94
  85. mcli/workflow/politician_trading/scrapers_eu.py +70 -71
  86. mcli/workflow/politician_trading/scrapers_uk.py +118 -90
  87. mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
  88. mcli/workflow/politician_trading/workflow.py +98 -71
  89. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +2 -2
  90. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -93
  91. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
  92. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
  93. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
  94. {mcli_framework-7.1.0.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
mcli/ml/data_ingestion/data_pipeline.py

@@ -1,25 +1,27 @@
 """Complete data ingestion pipeline with validation and transformation"""

 import asyncio
-from typing import Dict, Any, Optional, List, Union, Callable
+import json
+import logging
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
-import pandas as pd
-import numpy as np
-import logging
 from pathlib import Path
-import json
-from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import numpy as np
+import pandas as pd

 from .api_connectors import (
-    CongressionalDataAPI,
-    YahooFinanceConnector,
     AlphaVantageConnector,
+    CongressionalDataAPI,
+    DataAggregator,
     PolygonIOConnector,
     QuiverQuantConnector,
-    DataAggregator,
+    YahooFinanceConnector,
 )
-from .stream_processor import StreamProcessor, StreamConfig, DataAggregator as StreamAggregator
+from .stream_processor import DataAggregator as StreamAggregator
+from .stream_processor import StreamConfig, StreamProcessor

 logger = logging.getLogger(__name__)

@@ -27,6 +29,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class PipelineConfig:
     """Data pipeline configuration"""
+
     data_dir: Path = Path("data")
     batch_size: int = 1000
     enable_streaming: bool = True
@@ -43,20 +46,15 @@ class DataValidator:

     def __init__(self):
         self.validation_rules = {
-            'politician_trades': self._validate_politician_trade,
-            'stock_quotes': self._validate_stock_quote,
-            'market_data': self._validate_market_data
-        }
-        self.validation_stats = {
-            'total': 0,
-            'valid': 0,
-            'invalid': 0,
-            'errors': []
+            "politician_trades": self._validate_politician_trade,
+            "stock_quotes": self._validate_stock_quote,
+            "market_data": self._validate_market_data,
         }
+        self.validation_stats = {"total": 0, "valid": 0, "invalid": 0, "errors": []}

     def validate(self, data: Dict[str, Any], data_type: str) -> bool:
         """Validate data based on type"""
-        self.validation_stats['total'] += 1
+        self.validation_stats["total"] += 1

         if data_type not in self.validation_rules:
             logger.warning(f"Unknown data type: {data_type}")
@@ -65,19 +63,19 @@ class DataValidator:
         try:
             is_valid = self.validation_rules[data_type](data)
             if is_valid:
-                self.validation_stats['valid'] += 1
+                self.validation_stats["valid"] += 1
             else:
-                self.validation_stats['invalid'] += 1
+                self.validation_stats["invalid"] += 1
             return is_valid
         except Exception as e:
-            self.validation_stats['invalid'] += 1
-            self.validation_stats['errors'].append(str(e))
+            self.validation_stats["invalid"] += 1
+            self.validation_stats["errors"].append(str(e))
             logger.error(f"Validation error: {e}")
             return False

     def _validate_politician_trade(self, data: Dict[str, Any]) -> bool:
         """Validate politician trading data"""
-        required_fields = ['politician', 'ticker', 'transaction_type', 'amount', 'transaction_date']
+        required_fields = ["politician", "ticker", "transaction_type", "amount", "transaction_date"]

         # Check required fields
         for field in required_fields:
@@ -86,19 +84,19 @@ class DataValidator:
                 return False

         # Validate transaction type
-        if data['transaction_type'] not in ['buy', 'sell', 'exchange']:
+        if data["transaction_type"] not in ["buy", "sell", "exchange"]:
             logger.warning(f"Invalid transaction type: {data['transaction_type']}")
             return False

         # Validate amount
-        if not isinstance(data['amount'], (int, float)) or data['amount'] <= 0:
+        if not isinstance(data["amount"], (int, float)) or data["amount"] <= 0:
             logger.warning(f"Invalid amount: {data['amount']}")
             return False

         # Validate date
         try:
-            if isinstance(data['transaction_date'], str):
-                datetime.fromisoformat(data['transaction_date'])
+            if isinstance(data["transaction_date"], str):
+                datetime.fromisoformat(data["transaction_date"])
         except:
             logger.warning(f"Invalid date format: {data['transaction_date']}")
             return False
@@ -107,34 +105,34 @@ class DataValidator:

     def _validate_stock_quote(self, data: Dict[str, Any]) -> bool:
         """Validate stock quote data"""
-        required_fields = ['symbol', 'price', 'timestamp']
+        required_fields = ["symbol", "price", "timestamp"]

         for field in required_fields:
             if field not in data:
                 return False

         # Validate price
-        if not isinstance(data['price'], (int, float)) or data['price'] <= 0:
+        if not isinstance(data["price"], (int, float)) or data["price"] <= 0:
             return False

         return True

     def _validate_market_data(self, data: Dict[str, Any]) -> bool:
         """Validate market data"""
-        required_fields = ['symbol', 'close', 'volume', 'date']
+        required_fields = ["symbol", "close", "volume", "date"]

         for field in required_fields:
             if field not in data:
                 return False

         # Validate prices
-        for price_field in ['close', 'open', 'high', 'low']:
+        for price_field in ["close", "open", "high", "low"]:
             if price_field in data:
                 if not isinstance(data[price_field], (int, float)) or data[price_field] <= 0:
                     return False

         # Validate volume
-        if not isinstance(data['volume'], (int, float)) or data['volume'] < 0:
+        if not isinstance(data["volume"], (int, float)) or data["volume"] < 0:
             return False

         return True
@@ -149,13 +147,14 @@ class DataTransformer:

     def __init__(self):
         self.transformers = {
-            'politician_trades': self._transform_politician_trade,
-            'stock_quotes': self._transform_stock_quote,
-            'market_data': self._transform_market_data
+            "politician_trades": self._transform_politician_trade,
+            "stock_quotes": self._transform_stock_quote,
+            "market_data": self._transform_market_data,
         }

-    def transform(self, data: Union[Dict[str, Any], List[Dict[str, Any]]],
-                  data_type: str) -> Union[Dict[str, Any], pd.DataFrame]:
+    def transform(
+        self, data: Union[Dict[str, Any], List[Dict[str, Any]]], data_type: str
+    ) -> Union[Dict[str, Any], pd.DataFrame]:
         """Transform data based on type"""
         if data_type not in self.transformers:
             return data
@@ -171,30 +170,30 @@ class DataTransformer:
         transformed = data.copy()

         # Standardize politician name
-        transformed['politician_normalized'] = self._normalize_name(data.get('politician', ''))
+        transformed["politician_normalized"] = self._normalize_name(data.get("politician", ""))

         # Convert dates to datetime
-        if 'transaction_date' in data:
-            transformed['transaction_date'] = pd.to_datetime(data['transaction_date'])
+        if "transaction_date" in data:
+            transformed["transaction_date"] = pd.to_datetime(data["transaction_date"])

-        if 'disclosure_date' in data:
-            transformed['disclosure_date'] = pd.to_datetime(data['disclosure_date'])
+        if "disclosure_date" in data:
+            transformed["disclosure_date"] = pd.to_datetime(data["disclosure_date"])

         # Calculate disclosure delay
-        if 'transaction_date' in transformed:
-            delay = (transformed['disclosure_date'] - transformed['transaction_date']).days
-            transformed['disclosure_delay_days'] = max(0, delay)
+        if "transaction_date" in transformed:
+            delay = (transformed["disclosure_date"] - transformed["transaction_date"]).days
+            transformed["disclosure_delay_days"] = max(0, delay)

         # Normalize ticker
-        transformed['ticker'] = data.get('ticker', '').upper()
+        transformed["ticker"] = data.get("ticker", "").upper()

         # Categorize transaction amount
-        amount = data.get('amount', 0)
-        transformed['amount_category'] = self._categorize_amount(amount)
+        amount = data.get("amount", 0)
+        transformed["amount_category"] = self._categorize_amount(amount)

         # Add derived features
-        transformed['is_purchase'] = data.get('transaction_type') == 'buy'
-        transformed['is_sale'] = data.get('transaction_type') == 'sell'
+        transformed["is_purchase"] = data.get("transaction_type") == "buy"
+        transformed["is_sale"] = data.get("transaction_type") == "sell"

         return transformed

@@ -203,19 +202,19 @@ class DataTransformer:
         transformed = data.copy()

         # Normalize symbol
-        transformed['symbol'] = data.get('symbol', '').upper()
+        transformed["symbol"] = data.get("symbol", "").upper()

         # Convert timestamp
-        if 'timestamp' in data:
-            if isinstance(data['timestamp'], (int, float)):
-                transformed['timestamp'] = datetime.fromtimestamp(data['timestamp'])
+        if "timestamp" in data:
+            if isinstance(data["timestamp"], (int, float)):
+                transformed["timestamp"] = datetime.fromtimestamp(data["timestamp"])
             else:
-                transformed['timestamp'] = pd.to_datetime(data['timestamp'])
+                transformed["timestamp"] = pd.to_datetime(data["timestamp"])

         # Calculate spread if bid/ask available
-        if 'bid' in data and 'ask' in data:
-            transformed['spread'] = data['ask'] - data['bid']
-            transformed['spread_pct'] = (transformed['spread'] / data['ask']) * 100
+        if "bid" in data and "ask" in data:
+            transformed["spread"] = data["ask"] - data["bid"]
+            transformed["spread_pct"] = (transformed["spread"] / data["ask"]) * 100

         return transformed

@@ -224,34 +223,34 @@ class DataTransformer:
         transformed = data.copy()

         # Normalize symbol
-        transformed['symbol'] = data.get('symbol', '').upper()
+        transformed["symbol"] = data.get("symbol", "").upper()

         # Convert date
-        if 'date' in data:
-            transformed['date'] = pd.to_datetime(data['date'])
+        if "date" in data:
+            transformed["date"] = pd.to_datetime(data["date"])

         # Calculate OHLC metrics
-        if all(k in data for k in ['open', 'high', 'low', 'close']):
-            transformed['daily_range'] = data['high'] - data['low']
-            transformed['daily_return'] = (data['close'] - data['open']) / data['open']
-            transformed['intraday_volatility'] = transformed['daily_range'] / data['close']
+        if all(k in data for k in ["open", "high", "low", "close"]):
+            transformed["daily_range"] = data["high"] - data["low"]
+            transformed["daily_return"] = (data["close"] - data["open"]) / data["open"]
+            transformed["intraday_volatility"] = transformed["daily_range"] / data["close"]

         # Calculate volume metrics
-        if 'volume' in data and 'close' in data:
-            transformed['dollar_volume'] = data['volume'] * data['close']
+        if "volume" in data and "close" in data:
+            transformed["dollar_volume"] = data["volume"] * data["close"]

         return transformed

     def _normalize_name(self, name: str) -> str:
         """Normalize politician name"""
         # Remove titles
-        titles = ['Sen.', 'Senator', 'Rep.', 'Representative', 'Hon.', 'Dr.', 'Mr.', 'Mrs.', 'Ms.']
+        titles = ["Sen.", "Senator", "Rep.", "Representative", "Hon.", "Dr.", "Mr.", "Mrs.", "Ms."]
         normalized = name
         for title in titles:
-            normalized = normalized.replace(title, '')
+            normalized = normalized.replace(title, "")

         # Clean and standardize
-        normalized = ' '.join(normalized.split())  # Remove extra spaces
+        normalized = " ".join(normalized.split())  # Remove extra spaces
         normalized = normalized.strip()

         return normalized
@@ -259,17 +258,17 @@ class DataTransformer:
     def _categorize_amount(self, amount: float) -> str:
         """Categorize transaction amount"""
         if amount < 1000:
-            return 'micro'
+            return "micro"
         elif amount < 15000:
-            return 'small'
+            return "small"
         elif amount < 50000:
-            return 'medium'
+            return "medium"
         elif amount < 250000:
-            return 'large'
+            return "large"
         elif amount < 1000000:
-            return 'very_large'
+            return "very_large"
         else:
-            return 'mega'
+            return "mega"


 class DataLoader:
@@ -279,8 +278,9 @@ class DataLoader:
         self.data_dir = data_dir
         self.data_dir.mkdir(parents=True, exist_ok=True)

-    async def save_batch(self, data: pd.DataFrame, data_type: str,
-                         timestamp: Optional[datetime] = None):
+    async def save_batch(
+        self, data: pd.DataFrame, data_type: str, timestamp: Optional[datetime] = None
+    ):
         """Save batch of data"""
         if timestamp is None:
             timestamp = datetime.now()
@@ -294,13 +294,14 @@ class DataLoader:
         filepath = type_dir / filename

         # Save as parquet
-        data.to_parquet(filepath, compression='snappy')
+        data.to_parquet(filepath, compression="snappy")
         logger.info(f"Saved {len(data)} records to {filepath}")

         return filepath

-    async def save_json(self, data: Union[Dict, List], data_type: str,
-                        timestamp: Optional[datetime] = None):
+    async def save_json(
+        self, data: Union[Dict, List], data_type: str, timestamp: Optional[datetime] = None
+    ):
         """Save data as JSON"""
         if timestamp is None:
             timestamp = datetime.now()
@@ -314,7 +315,7 @@ class DataLoader:
         filepath = type_dir / filename

         # Save JSON
-        with open(filepath, 'w') as f:
+        with open(filepath, "w") as f:
             json.dump(data, f, indent=2, default=str)

         logger.info(f"Saved JSON to {filepath}")
@@ -357,13 +358,13 @@ class IngestionPipeline:

         # Pipeline metrics
         self.metrics = {
-            'records_processed': 0,
-            'records_validated': 0,
-            'records_transformed': 0,
-            'records_saved': 0,
-            'errors': 0,
-            'start_time': None,
-            'last_update': None
+            "records_processed": 0,
+            "records_validated": 0,
+            "records_transformed": 0,
+            "records_saved": 0,
+            "errors": 0,
+            "start_time": None,
+            "last_update": None,
         }

     def add_source(self, name: str, connector):
@@ -375,11 +376,11 @@ class IngestionPipeline:
         """Initialize all data sources"""
         # Congressional data
         congress_api = CongressionalDataAPI()
-        self.add_source('congress', congress_api)
+        self.add_source("congress", congress_api)

         # Stock data sources
         yahoo = YahooFinanceConnector()
-        self.add_source('yahoo', yahoo)
+        self.add_source("yahoo", yahoo)

         # Add more sources as needed
         logger.info(f"Initialized {len(self.sources)} data sources")
@@ -392,17 +393,17 @@ class IngestionPipeline:
             # Validate
             if self.config.enable_validation:
                 if not self.validator.validate(record, data_type):
-                    self.metrics['errors'] += 1
+                    self.metrics["errors"] += 1
                     continue
-                self.metrics['records_validated'] += 1
+                self.metrics["records_validated"] += 1

             # Transform
             if self.config.enable_transformation:
                 record = self.transformer.transform(record, data_type)
-                self.metrics['records_transformed'] += 1
+                self.metrics["records_transformed"] += 1

             processed_data.append(record)
-            self.metrics['records_processed'] += 1
+            self.metrics["records_processed"] += 1

         # Convert to DataFrame
         if processed_data:
@@ -410,7 +411,7 @@ class IngestionPipeline:

             # Save to storage
             await self.loader.save_batch(df, data_type)
-            self.metrics['records_saved'] += len(df)
+            self.metrics["records_saved"] += len(df)

             return df

@@ -418,7 +419,7 @@
     async def fetch_politician_trades(self, days: int = 30) -> pd.DataFrame:
         """Fetch recent politician trades"""
-        congress_api = self.sources.get('congress')
+        congress_api = self.sources.get("congress")
         if not congress_api:
             logger.error("Congressional data source not available")
             return pd.DataFrame()
@@ -427,28 +428,30 @@ class IngestionPipeline:
         trades = await congress_api.fetch_recent_trades(days=days)

         # Process through pipeline
-        df = await self.process_batch(trades, 'politician_trades')
+        df = await self.process_batch(trades, "politician_trades")

         logger.info(f"Fetched {len(df)} politician trades")
         return df

-    async def fetch_stock_data(self, tickers: List[str], period: str = '1mo') -> Dict[str, pd.DataFrame]:
+    async def fetch_stock_data(
+        self, tickers: List[str], period: str = "1mo"
+    ) -> Dict[str, pd.DataFrame]:
         """Fetch stock data for multiple tickers"""
         stock_data = {}

         for ticker in tickers:
             # Try Yahoo Finance first
-            yahoo = self.sources.get('yahoo')
+            yahoo = self.sources.get("yahoo")
             if yahoo:
                 try:
                     df = await yahoo.fetch_historical(ticker, period)
                     if not df.empty:
                         # Process through pipeline
-                        records = df.to_dict('records')
+                        records = df.to_dict("records")
                         for record in records:
-                            record['symbol'] = ticker
+                            record["symbol"] = ticker

-                        processed = await self.process_batch(records, 'market_data')
+                        processed = await self.process_batch(records, "market_data")
                         stock_data[ticker] = processed
                 except Exception as e:
                     logger.error(f"Failed to fetch {ticker}: {e}")
@@ -463,16 +466,14 @@ class IngestionPipeline:

         # Initialize stream processor
         stream_config = StreamConfig(
-            buffer_size=self.config.batch_size,
-            batch_size=100,
-            flush_interval=5
+            buffer_size=self.config.batch_size, batch_size=100, flush_interval=5
         )

         self.stream_processor = StreamProcessor(stream_config)

         # Add processor for pipeline
         async def pipeline_processor(batch):
-            await self.process_batch(batch, 'streaming_data')
+            await self.process_batch(batch, "streaming_data")

         self.stream_processor.add_processor(pipeline_processor)

@@ -484,21 +485,21 @@ class IngestionPipeline:
         if self.stream_processor:
             await self.stream_processor.stop()

-    async def run(self, mode: str = 'batch'):
+    async def run(self, mode: str = "batch"):
         """Run ingestion pipeline"""
-        self.metrics['start_time'] = datetime.now()
+        self.metrics["start_time"] = datetime.now()

         try:
             # Initialize sources
             await self.initialize_sources()

-            if mode == 'batch':
+            if mode == "batch":
                 # Batch processing
                 await self.run_batch()
-            elif mode == 'streaming':
+            elif mode == "streaming":
                 # Streaming mode
                 await self.start_streaming()
-            elif mode == 'hybrid':
+            elif mode == "hybrid":
                 # Both batch and streaming
                 batch_task = asyncio.create_task(self.run_batch())
                 stream_task = asyncio.create_task(self.start_streaming())
@@ -506,10 +507,10 @@ class IngestionPipeline:

         except Exception as e:
             logger.error(f"Pipeline error: {e}")
-            self.metrics['errors'] += 1
+            self.metrics["errors"] += 1
             raise
         finally:
-            self.metrics['last_update'] = datetime.now()
+            self.metrics["last_update"] = datetime.now()

     async def run_batch(self):
         """Run batch processing"""
@@ -519,8 +520,8 @@ class IngestionPipeline:
         trades_df = await self.fetch_politician_trades(days=30)

         # Extract unique tickers
-        if not trades_df.empty and 'ticker' in trades_df.columns:
-            tickers = trades_df['ticker'].unique().tolist()
+        if not trades_df.empty and "ticker" in trades_df.columns:
+            tickers = trades_df["ticker"].unique().tolist()

             # Fetch stock data for those tickers
             stock_data = await self.fetch_stock_data(tickers[:20])  # Limit to 20 for demo
@@ -532,36 +533,37 @@ class IngestionPipeline:
         metrics = self.metrics.copy()

         # Calculate throughput
-        if metrics['start_time']:
-            elapsed = (datetime.now() - metrics['start_time']).total_seconds()
+        if metrics["start_time"]:
+            elapsed = (datetime.now() - metrics["start_time"]).total_seconds()
             if elapsed > 0:
-                metrics['throughput'] = metrics['records_processed'] / elapsed
+                metrics["throughput"] = metrics["records_processed"] / elapsed

         # Add validation stats
-        metrics['validation_stats'] = self.validator.get_stats()
+        metrics["validation_stats"] = self.validator.get_stats()

         return metrics


 # Example usage
 if __name__ == "__main__":
+
     async def main():
         # Configure pipeline
         config = PipelineConfig(
             data_dir=Path("data/ingestion"),
             enable_streaming=False,  # Batch mode for testing
             enable_validation=True,
-            enable_transformation=True
+            enable_transformation=True,
         )

         # Create pipeline
         pipeline = IngestionPipeline(config)

         # Run batch processing
-        await pipeline.run(mode='batch')
+        await pipeline.run(mode="batch")

         # Get metrics
         metrics = pipeline.get_metrics()
         print(f"Pipeline metrics: {json.dumps(metrics, indent=2, default=str)}")

-    asyncio.run(main())
+    asyncio.run(main())