mcli_framework-7.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mcli-framework has been flagged as potentially problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/data_ingestion/data_pipeline.py
@@ -0,0 +1,567 @@
+ """Complete data ingestion pipeline with validation and transformation"""
+
+ import asyncio
+ from typing import Dict, Any, Optional, List, Union, Callable
+ from dataclasses import dataclass, field
+ from datetime import datetime, timedelta
+ import pandas as pd
+ import numpy as np
+ import logging
+ from pathlib import Path
+ import json
+ from abc import ABC, abstractmethod
+
+ from .api_connectors import (
+     CongressionalDataAPI,
+     YahooFinanceConnector,
+     AlphaVantageConnector,
+     PolygonIOConnector,
+     QuiverQuantConnector,
+     DataAggregator,
+ )
+ from .stream_processor import StreamProcessor, StreamConfig, DataAggregator as StreamAggregator
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class PipelineConfig:
+     """Data pipeline configuration"""
+     data_dir: Path = Path("data")
+     batch_size: int = 1000
+     enable_streaming: bool = True
+     enable_validation: bool = True
+     enable_transformation: bool = True
+     enable_caching: bool = True
+     cache_ttl: int = 300  # seconds
+     retry_count: int = 3
+     retry_delay: int = 1
+
+
+ class DataValidator:
+     """Validate incoming data"""
+
+     def __init__(self):
+         self.validation_rules = {
+             'politician_trades': self._validate_politician_trade,
+             'stock_quotes': self._validate_stock_quote,
+             'market_data': self._validate_market_data
+         }
+         self.validation_stats = {
+             'total': 0,
+             'valid': 0,
+             'invalid': 0,
+             'errors': []
+         }
+
+     def validate(self, data: Dict[str, Any], data_type: str) -> bool:
+         """Validate data based on type"""
+         self.validation_stats['total'] += 1
+
+         if data_type not in self.validation_rules:
+             logger.warning(f"Unknown data type: {data_type}")
+             return True
+
+         try:
+             is_valid = self.validation_rules[data_type](data)
+             if is_valid:
+                 self.validation_stats['valid'] += 1
+             else:
+                 self.validation_stats['invalid'] += 1
+             return is_valid
+         except Exception as e:
+             self.validation_stats['invalid'] += 1
+             self.validation_stats['errors'].append(str(e))
+             logger.error(f"Validation error: {e}")
+             return False
+
+     def _validate_politician_trade(self, data: Dict[str, Any]) -> bool:
+         """Validate politician trading data"""
+         required_fields = ['politician', 'ticker', 'transaction_type', 'amount', 'transaction_date']
+
+         # Check required fields
+         for field in required_fields:
+             if field not in data:
+                 logger.warning(f"Missing required field: {field}")
+                 return False
+
+         # Validate transaction type
+         if data['transaction_type'] not in ['buy', 'sell', 'exchange']:
+             logger.warning(f"Invalid transaction type: {data['transaction_type']}")
+             return False
+
+         # Validate amount
+         if not isinstance(data['amount'], (int, float)) or data['amount'] <= 0:
+             logger.warning(f"Invalid amount: {data['amount']}")
+             return False
+
+         # Validate date (catch only the ValueError fromisoformat raises, not a bare except)
+         try:
+             if isinstance(data['transaction_date'], str):
+                 datetime.fromisoformat(data['transaction_date'])
+         except ValueError:
+             logger.warning(f"Invalid date format: {data['transaction_date']}")
+             return False
+
+         return True
+
+     def _validate_stock_quote(self, data: Dict[str, Any]) -> bool:
+         """Validate stock quote data"""
+         required_fields = ['symbol', 'price', 'timestamp']
+
+         for field in required_fields:
+             if field not in data:
+                 return False
+
+         # Validate price
+         if not isinstance(data['price'], (int, float)) or data['price'] <= 0:
+             return False
+
+         return True
+
+     def _validate_market_data(self, data: Dict[str, Any]) -> bool:
+         """Validate market data"""
+         required_fields = ['symbol', 'close', 'volume', 'date']
+
+         for field in required_fields:
+             if field not in data:
+                 return False
+
+         # Validate prices
+         for price_field in ['close', 'open', 'high', 'low']:
+             if price_field in data:
+                 if not isinstance(data[price_field], (int, float)) or data[price_field] <= 0:
+                     return False
+
+         # Validate volume
+         if not isinstance(data['volume'], (int, float)) or data['volume'] < 0:
+             return False
+
+         return True
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get validation statistics"""
+         return self.validation_stats.copy()
+
+
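For orientation, a minimal usage sketch of the validator (illustrative, not part of the shipped diff; the record keys follow the required_fields lists above):

    validator = DataValidator()
    trade = {
        "politician": "Jane Doe",          # hypothetical sample record
        "ticker": "AAPL",
        "transaction_type": "buy",
        "amount": 15000,
        "transaction_date": "2024-01-15",
    }
    validator.validate(trade, "politician_trades")  # True
    validator.get_stats()  # {'total': 1, 'valid': 1, 'invalid': 0, 'errors': []}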
+ class DataTransformer:
+     """Transform and normalize data"""
+
+     def __init__(self):
+         self.transformers = {
+             'politician_trades': self._transform_politician_trade,
+             'stock_quotes': self._transform_stock_quote,
+             'market_data': self._transform_market_data
+         }
+
+     def transform(self, data: Union[Dict[str, Any], List[Dict[str, Any]]],
+                   data_type: str) -> Union[Dict[str, Any], pd.DataFrame]:
+         """Transform data based on type"""
+         if data_type not in self.transformers:
+             return data
+
+         if isinstance(data, list):
+             transformed = [self.transformers[data_type](item) for item in data]
+             return pd.DataFrame(transformed)
+         else:
+             return self.transformers[data_type](data)
+
+     def _transform_politician_trade(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Transform politician trading data"""
+         transformed = data.copy()
+
+         # Standardize politician name
+         transformed['politician_normalized'] = self._normalize_name(data.get('politician', ''))
+
+         # Convert dates to datetime
+         if 'transaction_date' in data:
+             transformed['transaction_date'] = pd.to_datetime(data['transaction_date'])
+
+         if 'disclosure_date' in data:
+             transformed['disclosure_date'] = pd.to_datetime(data['disclosure_date'])
+
+         # Calculate disclosure delay (guard both dates so a missing disclosure_date cannot raise KeyError)
+         if 'transaction_date' in transformed and 'disclosure_date' in transformed:
+             delay = (transformed['disclosure_date'] - transformed['transaction_date']).days
+             transformed['disclosure_delay_days'] = max(0, delay)
+
+         # Normalize ticker
+         transformed['ticker'] = data.get('ticker', '').upper()
+
+         # Categorize transaction amount
+         amount = data.get('amount', 0)
+         transformed['amount_category'] = self._categorize_amount(amount)
+
+         # Add derived features
+         transformed['is_purchase'] = data.get('transaction_type') == 'buy'
+         transformed['is_sale'] = data.get('transaction_type') == 'sell'
+
+         return transformed
+
+     def _transform_stock_quote(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Transform stock quote data"""
+         transformed = data.copy()
+
+         # Normalize symbol
+         transformed['symbol'] = data.get('symbol', '').upper()
+
+         # Convert timestamp
+         if 'timestamp' in data:
+             if isinstance(data['timestamp'], (int, float)):
+                 transformed['timestamp'] = datetime.fromtimestamp(data['timestamp'])
+             else:
+                 transformed['timestamp'] = pd.to_datetime(data['timestamp'])
+
+         # Calculate spread if bid/ask available
+         if 'bid' in data and 'ask' in data:
+             transformed['spread'] = data['ask'] - data['bid']
+             transformed['spread_pct'] = (transformed['spread'] / data['ask']) * 100
+
+         return transformed
+
+     def _transform_market_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """Transform market data"""
+         transformed = data.copy()
+
+         # Normalize symbol
+         transformed['symbol'] = data.get('symbol', '').upper()
+
+         # Convert date
+         if 'date' in data:
+             transformed['date'] = pd.to_datetime(data['date'])
+
+         # Calculate OHLC metrics
+         if all(k in data for k in ['open', 'high', 'low', 'close']):
+             transformed['daily_range'] = data['high'] - data['low']
+             transformed['daily_return'] = (data['close'] - data['open']) / data['open']
+             transformed['intraday_volatility'] = transformed['daily_range'] / data['close']
+
+         # Calculate volume metrics
+         if 'volume' in data and 'close' in data:
+             transformed['dollar_volume'] = data['volume'] * data['close']
+
+         return transformed
+
+     def _normalize_name(self, name: str) -> str:
+         """Normalize politician name"""
+         # Remove titles
+         titles = ['Sen.', 'Senator', 'Rep.', 'Representative', 'Hon.', 'Dr.', 'Mr.', 'Mrs.', 'Ms.']
+         normalized = name
+         for title in titles:
+             normalized = normalized.replace(title, '')
+
+         # Clean and standardize
+         normalized = ' '.join(normalized.split())  # Remove extra spaces
+         normalized = normalized.strip()
+
+         return normalized
+
+     def _categorize_amount(self, amount: float) -> str:
+         """Categorize transaction amount"""
+         if amount < 1000:
+             return 'micro'
+         elif amount < 15000:
+             return 'small'
+         elif amount < 50000:
+             return 'medium'
+         elif amount < 250000:
+             return 'large'
+         elif amount < 1000000:
+             return 'very_large'
+         else:
+             return 'mega'
+
+
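A companion sketch for the transformer (illustrative; the input is a hypothetical record shaped like the validator's politician_trades schema):

    transformer = DataTransformer()
    enriched = transformer.transform(
        {
            "politician": "Sen. Jane Doe",
            "ticker": "aapl",
            "transaction_type": "buy",
            "amount": 40000,
            "transaction_date": "2024-01-15",
            "disclosure_date": "2024-02-01",
        },
        "politician_trades",
    )
    # enriched["politician_normalized"] == "Jane Doe"; enriched["ticker"] == "AAPL"
    # enriched["amount_category"] == "medium"; enriched["disclosure_delay_days"] == 17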
+ class DataLoader:
+     """Load data to storage"""
+
+     def __init__(self, data_dir: Path):
+         self.data_dir = data_dir
+         self.data_dir.mkdir(parents=True, exist_ok=True)
+
+     async def save_batch(self, data: pd.DataFrame, data_type: str,
+                          timestamp: Optional[datetime] = None):
+         """Save batch of data"""
+         if timestamp is None:
+             timestamp = datetime.now()
+
+         # Create subdirectory for data type
+         type_dir = self.data_dir / data_type
+         type_dir.mkdir(exist_ok=True)
+
+         # Generate filename with timestamp
+         filename = f"{data_type}_{timestamp.strftime('%Y%m%d_%H%M%S')}.parquet"
+         filepath = type_dir / filename
+
+         # Save as parquet
+         data.to_parquet(filepath, compression='snappy')
+         logger.info(f"Saved {len(data)} records to {filepath}")
+
+         return filepath
+
+     async def save_json(self, data: Union[Dict, List], data_type: str,
+                         timestamp: Optional[datetime] = None):
+         """Save data as JSON"""
+         if timestamp is None:
+             timestamp = datetime.now()
+
+         # Create subdirectory
+         type_dir = self.data_dir / data_type
+         type_dir.mkdir(exist_ok=True)
+
+         # Generate filename
+         filename = f"{data_type}_{timestamp.strftime('%Y%m%d_%H%M%S')}.json"
+         filepath = type_dir / filename
+
+         # Save JSON
+         with open(filepath, 'w') as f:
+             json.dump(data, f, indent=2, default=str)
+
+         logger.info(f"Saved JSON to {filepath}")
+         return filepath
+
+     def load_latest(self, data_type: str, n_files: int = 1) -> pd.DataFrame:
+         """Load latest data files"""
+         type_dir = self.data_dir / data_type
+
+         if not type_dir.exists():
+             return pd.DataFrame()
+
+         # Find parquet files
+         files = sorted(type_dir.glob("*.parquet"), key=lambda x: x.stat().st_mtime, reverse=True)
+
+         if not files:
+             return pd.DataFrame()
+
+         # Load and concatenate
+         dfs = []
+         for file in files[:n_files]:
+             df = pd.read_parquet(file)
+             dfs.append(df)
+
+         return pd.concat(dfs, ignore_index=True)
+
+
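A round-trip sketch for the loader (illustrative; to_parquet assumes pyarrow or fastparquet is installed):

    import asyncio
    import pandas as pd
    from pathlib import Path

    loader = DataLoader(Path("data/demo"))  # hypothetical output directory
    df = pd.DataFrame([{"symbol": "AAPL", "close": 185.0, "volume": 1_000_000}])
    asyncio.run(loader.save_batch(df, "market_data"))
    latest = loader.load_latest("market_data", n_files=1)
    print(len(latest))  # 1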
+ class IngestionPipeline:
+     """Complete data ingestion pipeline"""
+
+     def __init__(self, config: PipelineConfig):
+         self.config = config
+         self.validator = DataValidator()
+         self.transformer = DataTransformer()
+         self.loader = DataLoader(config.data_dir)
+
+         # Initialize data sources
+         self.sources = {}
+         self.stream_processor = None
+
+         # Pipeline metrics
+         self.metrics = {
+             'records_processed': 0,
+             'records_validated': 0,
+             'records_transformed': 0,
+             'records_saved': 0,
+             'errors': 0,
+             'start_time': None,
+             'last_update': None
+         }
+
+     def add_source(self, name: str, connector):
+         """Add data source"""
+         self.sources[name] = connector
+         logger.info(f"Added data source: {name}")
+
+     async def initialize_sources(self):
+         """Initialize all data sources"""
+         # Congressional data
+         congress_api = CongressionalDataAPI()
+         self.add_source('congress', congress_api)
+
+         # Stock data sources
+         yahoo = YahooFinanceConnector()
+         self.add_source('yahoo', yahoo)
+
+         # Add more sources as needed
+         logger.info(f"Initialized {len(self.sources)} data sources")
+
+     async def process_batch(self, data: List[Dict[str, Any]], data_type: str) -> pd.DataFrame:
+         """Process batch of data through pipeline"""
+         processed_data = []
+
+         for record in data:
+             # Validate
+             if self.config.enable_validation:
+                 if not self.validator.validate(record, data_type):
+                     self.metrics['errors'] += 1
+                     continue
+                 self.metrics['records_validated'] += 1
+
+             # Transform
+             if self.config.enable_transformation:
+                 record = self.transformer.transform(record, data_type)
+                 self.metrics['records_transformed'] += 1
+
+             processed_data.append(record)
+             self.metrics['records_processed'] += 1
+
+         # Convert to DataFrame
+         if processed_data:
+             df = pd.DataFrame(processed_data)
+
+             # Save to storage
+             await self.loader.save_batch(df, data_type)
+             self.metrics['records_saved'] += len(df)
+
+             return df
+
+         return pd.DataFrame()
+
+     async def fetch_politician_trades(self, days: int = 30) -> pd.DataFrame:
+         """Fetch recent politician trades"""
+         congress_api = self.sources.get('congress')
+         if not congress_api:
+             logger.error("Congressional data source not available")
+             return pd.DataFrame()
+
+         # Fetch trades
+         trades = await congress_api.fetch_recent_trades(days=days)
+
+         # Process through pipeline
+         df = await self.process_batch(trades, 'politician_trades')
+
+         logger.info(f"Fetched {len(df)} politician trades")
+         return df
+
+     async def fetch_stock_data(self, tickers: List[str], period: str = '1mo') -> Dict[str, pd.DataFrame]:
+         """Fetch stock data for multiple tickers"""
+         stock_data = {}
+
+         for ticker in tickers:
+             # Try Yahoo Finance first
+             yahoo = self.sources.get('yahoo')
+             if yahoo:
+                 try:
+                     df = await yahoo.fetch_historical(ticker, period)
+                     if not df.empty:
+                         # Process through pipeline
+                         records = df.to_dict('records')
+                         for record in records:
+                             record['symbol'] = ticker
+
+                         processed = await self.process_batch(records, 'market_data')
+                         stock_data[ticker] = processed
+                 except Exception as e:
+                     logger.error(f"Failed to fetch {ticker}: {e}")
+
+         return stock_data
+
+     async def start_streaming(self):
+         """Start real-time streaming"""
+         if not self.config.enable_streaming:
+             logger.info("Streaming disabled")
+             return
+
+         # Initialize stream processor
+         stream_config = StreamConfig(
+             buffer_size=self.config.batch_size,
+             batch_size=100,
+             flush_interval=5
+         )
+
+         self.stream_processor = StreamProcessor(stream_config)
+
+         # Add processor for pipeline
+         async def pipeline_processor(batch):
+             await self.process_batch(batch, 'streaming_data')
+
+         self.stream_processor.add_processor(pipeline_processor)
+
+         # Start streaming
+         await self.stream_processor.start()
+
+     async def stop_streaming(self):
+         """Stop streaming"""
+         if self.stream_processor:
+             await self.stream_processor.stop()
+
+     async def run(self, mode: str = 'batch'):
+         """Run ingestion pipeline"""
+         self.metrics['start_time'] = datetime.now()
+
+         try:
+             # Initialize sources
+             await self.initialize_sources()
+
+             if mode == 'batch':
+                 # Batch processing
+                 await self.run_batch()
+             elif mode == 'streaming':
+                 # Streaming mode
+                 await self.start_streaming()
+             elif mode == 'hybrid':
+                 # Both batch and streaming
+                 batch_task = asyncio.create_task(self.run_batch())
+                 stream_task = asyncio.create_task(self.start_streaming())
+                 await asyncio.gather(batch_task, stream_task)
+
+         except Exception as e:
+             logger.error(f"Pipeline error: {e}")
+             self.metrics['errors'] += 1
+             raise
+         finally:
+             self.metrics['last_update'] = datetime.now()
+
+     async def run_batch(self):
+         """Run batch processing"""
+         logger.info("Starting batch processing...")
+
+         # Fetch politician trades
+         trades_df = await self.fetch_politician_trades(days=30)
+
+         # Extract unique tickers
+         if not trades_df.empty and 'ticker' in trades_df.columns:
+             tickers = trades_df['ticker'].unique().tolist()
+
+             # Fetch stock data for those tickers
+             stock_data = await self.fetch_stock_data(tickers[:20])  # Limit to 20 for demo
+
+             logger.info(f"Processed {len(trades_df)} trades and {len(stock_data)} stocks")
+
+     def get_metrics(self) -> Dict[str, Any]:
+         """Get pipeline metrics"""
+         metrics = self.metrics.copy()
+
+         # Calculate throughput
+         if metrics['start_time']:
+             elapsed = (datetime.now() - metrics['start_time']).total_seconds()
+             if elapsed > 0:
+                 metrics['throughput'] = metrics['records_processed'] / elapsed
+
+         # Add validation stats
+         metrics['validation_stats'] = self.validator.get_stats()
+
+         return metrics
+
+
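Beyond the batch path exercised in the example below, run() also supports 'streaming' and 'hybrid' modes. A minimal lifecycle sketch for streaming (illustrative only; StreamProcessor behavior lives in stream_processor.py, outside this hunk):

    async def stream_demo():
        pipeline = IngestionPipeline(PipelineConfig(enable_streaming=True))
        await pipeline.start_streaming()  # wires process_batch in as a stream processor
        await asyncio.sleep(10)           # hypothetical run window
        await pipeline.stop_streaming()
        print(pipeline.get_metrics())

    asyncio.run(stream_demo())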
+ # Example usage
+ if __name__ == "__main__":
+     async def main():
+         # Configure pipeline
+         config = PipelineConfig(
+             data_dir=Path("data/ingestion"),
+             enable_streaming=False,  # Batch mode for testing
+             enable_validation=True,
+             enable_transformation=True
+         )
+
+         # Create pipeline
+         pipeline = IngestionPipeline(config)
+
+         # Run batch processing
+         await pipeline.run(mode='batch')
+
+         # Get metrics
+         metrics = pipeline.get_metrics()
+         print(f"Pipeline metrics: {json.dumps(metrics, indent=2, default=str)}")
+
+     asyncio.run(main())