mcli_framework-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0

mcli/lib/services/data_pipeline.py

@@ -0,0 +1,460 @@
"""
Data Pipeline Service for mcli-LSH Integration
Handles ETL processes for data received from LSH daemon
"""

import asyncio
import json
import time
from typing import Any, Dict, List, Optional, Callable
from datetime import datetime, timezone
from pathlib import Path

from mcli.lib.logger.logger import get_logger
from .lsh_client import LSHClient, LSHEventProcessor

logger = get_logger(__name__)


class DataPipelineConfig:
    """Configuration for data pipeline"""

    def __init__(self):
        self.batch_size = 100
        self.batch_timeout = 30  # seconds
        self.retry_attempts = 3
        self.retry_delay = 5  # seconds
        self.output_dir = Path("./data/processed")
        self.enable_validation = True
        self.enable_enrichment = True

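DataPipelineConfig takes no constructor arguments, so callers tune the pipeline by assigning attributes after construction. A minimal sketch of how a consumer might override the defaults (not part of the package; the values and path are hypothetical, and mcli-framework must be installed):

from pathlib import Path

from mcli.lib.services.data_pipeline import DataPipelineConfig

# Hypothetical overrides: smaller batches, faster flushes, custom output path.
config = DataPipelineConfig()
config.batch_size = 25                           # flush after 25 records instead of 100
config.batch_timeout = 10                        # or once a batch has been pending 10 seconds
config.output_dir = Path("/tmp/mcli/processed")  # created by DataProcessor, not here
config.enable_enrichment = False                 # skip the placeholder enrichers

Note that output_dir is only created when a DataProcessor is constructed with this config; the config object itself touches no filesystem state.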
class DataValidator:
    """Validates incoming data"""

    def __init__(self):
        self.logger = get_logger(f"{__name__}.validator")

    async def validate_trading_record(self, record: Dict[str, Any]) -> bool:
        """Validate politician trading record"""
        required_fields = [
            "politician_name",
            "transaction_date",
            "transaction_type",
            "asset_name",
        ]

        for field in required_fields:
            if field not in record:
                self.logger.warning(f"Missing required field: {field}")
                return False

        # Validate transaction date
        if "transaction_date" in record:
            try:
                datetime.fromisoformat(record["transaction_date"])
            except ValueError:
                self.logger.warning(f"Invalid transaction date: {record['transaction_date']}")
                return False

        # Validate amount if present
        if "transaction_amount" in record:
            try:
                float(record["transaction_amount"])
            except (ValueError, TypeError):
                self.logger.warning(f"Invalid transaction amount: {record['transaction_amount']}")
                return False

        return True

    async def validate_supabase_record(self, table: str, record: Dict[str, Any]) -> bool:
        """Validate Supabase record based on table schema"""
        if not record:
            return False

        # Basic validation - can be extended with schema validation
        if "id" in record and not record["id"]:
            self.logger.warning("Record missing ID")
            return False

        return True

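The validator's contract, then, is: a record passes only if the four required fields are present, the date parses with datetime.fromisoformat, and any transaction_amount coerces to float. A quick sketch, assuming mcli-framework is installed (the record is hypothetical):

import asyncio

from mcli.lib.services.data_pipeline import DataValidator

async def main():
    validator = DataValidator()

    record = {
        "politician_name": "Jane Doe",     # hypothetical example record
        "transaction_date": "2024-01-15",  # must be ISO 8601
        "transaction_type": "purchase",
        "asset_name": "ACME Corp",
        "transaction_amount": "15000",     # optional, but must coerce to float
    }
    assert await validator.validate_trading_record(record)

    # Dropping a required field fails validation (and logs a warning).
    del record["asset_name"]
    assert not await validator.validate_trading_record(record)

asyncio.run(main())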
class DataEnricher:
    """Enriches data with additional information"""

    def __init__(self):
        self.logger = get_logger(f"{__name__}.enricher")

    async def enrich_trading_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Enrich trading record with additional data"""
        enriched = record.copy()

        # Add processing timestamp
        enriched["processed_at"] = datetime.now(timezone.utc).isoformat()

        # Add amount categorization
        if "transaction_amount" in record:
            amount = float(record["transaction_amount"])
            enriched["amount_category"] = self._categorize_amount(amount)
            enriched["amount_bucket"] = self._bucket_amount(amount)

        # Add politician party enrichment (placeholder)
        if "politician_name" in record:
            enriched["politician_metadata"] = await self._get_politician_metadata(
                record["politician_name"]
            )

        # Add market context (placeholder)
        if "asset_name" in record and "transaction_date" in record:
            enriched["market_context"] = await self._get_market_context(
                record["asset_name"], record["transaction_date"]
            )

        return enriched

    def _categorize_amount(self, amount: float) -> str:
        """Categorize transaction amount"""
        if amount < 1000:
            return "micro"
        elif amount < 15000:
            return "small"
        elif amount < 50000:
            return "medium"
        elif amount < 500000:
            return "large"
        else:
            return "mega"

    def _bucket_amount(self, amount: float) -> str:
        """Bucket amounts for analysis"""
        if amount < 1000:
            return "0-1K"
        elif amount < 10000:
            return "1K-10K"
        elif amount < 50000:
            return "10K-50K"
        elif amount < 100000:
            return "50K-100K"
        elif amount < 500000:
            return "100K-500K"
        elif amount < 1000000:
            return "500K-1M"
        else:
            return "1M+"

    async def _get_politician_metadata(self, politician_name: str) -> Dict[str, Any]:
        """Get politician metadata (placeholder for external API)"""
        # This would typically call an external API
        return {
            "enriched_at": datetime.now(timezone.utc).isoformat(),
            "source": "mcli_enricher",
            "name_normalized": politician_name.title(),
        }

    async def _get_market_context(self, asset_name: str, transaction_date: str) -> Dict[str, Any]:
        """Get market context for the transaction (placeholder)"""
        # This would typically call financial APIs
        return {
            "enriched_at": datetime.now(timezone.utc).isoformat(),
            "asset_normalized": asset_name.upper(),
            "transaction_date": transaction_date,
        }

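Worth noting: the categorizer and the bucketer use independent breakpoints ("small" runs to 15,000 while the "1K-10K" bucket ends at 10,000), so a 12,000 transaction lands in "small" but "10K-50K". A sketch that makes this visible, assuming mcli-framework is installed (the record is hypothetical):

import asyncio

from mcli.lib.services.data_pipeline import DataEnricher

async def main():
    enricher = DataEnricher()
    enriched = await enricher.enrich_trading_record({
        "politician_name": "jane doe",   # hypothetical input
        "asset_name": "acme",
        "transaction_date": "2024-01-15",
        "transaction_amount": "12000",
    })

    print(enriched["amount_category"])  # "small"    (category cutoff is 15000)
    print(enriched["amount_bucket"])    # "10K-50K"  (bucket cutoff is 10000)
    print(enriched["politician_metadata"]["name_normalized"])  # "Jane Doe"
    print(enriched["market_context"]["asset_normalized"])      # "ACME"

asyncio.run(main())

Also note that enrich_trading_record calls float() on transaction_amount without a guard, so with enable_validation turned off a malformed amount would raise rather than be skipped.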
class DataProcessor:
    """Main data processing engine"""

    def __init__(self, config: DataPipelineConfig):
        self.config = config
        self.logger = get_logger(f"{__name__}.processor")
        self.validator = DataValidator()
        self.enricher = DataEnricher()
        self.batch_buffer: List[Dict[str, Any]] = []
        self.last_batch_time = time.time()
        self._processing_lock = asyncio.Lock()

        # Ensure output directory exists
        self.config.output_dir.mkdir(parents=True, exist_ok=True)

    async def process_trading_data(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Process politician trading data"""
        processed_records = []

        for record in records:
            try:
                # Validate
                if self.config.enable_validation:
                    if not await self.validator.validate_trading_record(record):
                        self.logger.warning(f"Validation failed for record: {record.get('id', 'unknown')}")
                        continue

                # Enrich
                if self.config.enable_enrichment:
                    enriched_record = await self.enricher.enrich_trading_record(record)
                else:
                    enriched_record = record.copy()

                # Add processing metadata
                enriched_record["mcli_processed_at"] = datetime.now(timezone.utc).isoformat()
                enriched_record["mcli_pipeline_version"] = "1.0.0"

                processed_records.append(enriched_record)

            except Exception as e:
                self.logger.error(f"Error processing trading record: {e}")
                continue

        self.logger.info(f"Processed {len(processed_records)}/{len(records)} trading records")
        return processed_records

    async def process_supabase_sync(self, table: str, operation: str, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process Supabase sync data"""
        try:
            # Validate
            if self.config.enable_validation:
                if not await self.validator.validate_supabase_record(table, data):
                    self.logger.warning(f"Validation failed for {table} record")
                    return {}

            # Transform based on table and operation
            processed_data = await self._transform_supabase_data(table, operation, data)

            # Add processing metadata
            processed_data["mcli_processed_at"] = datetime.now(timezone.utc).isoformat()
            processed_data["mcli_source_table"] = table
            processed_data["mcli_operation"] = operation

            return processed_data

        except Exception as e:
            self.logger.error(f"Error processing Supabase sync: {e}")
            return {}

    async def _transform_supabase_data(self, table: str, operation: str, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform Supabase data based on table schema"""
        transformed = data.copy()

        # Apply table-specific transformations
        if "politician" in table.lower():
            transformed = await self._transform_politician_table(transformed)
        elif "trading" in table.lower():
            transformed = await self._transform_trading_table(transformed)

        return transformed

    async def _transform_politician_table(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform politician table data"""
        # Normalize names
        if "name" in data:
            data["name_normalized"] = data["name"].title()

        # Add derived fields
        if "party" in data:
            data["party_normalized"] = data["party"].upper()

        return data

    async def _transform_trading_table(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transform trading table data"""
        # Normalize asset names
        if "asset_name" in data:
            data["asset_name_normalized"] = data["asset_name"].upper()

        # Convert amounts to float
        if "amount" in data and isinstance(data["amount"], str):
            try:
                data["amount_float"] = float(data["amount"])
            except ValueError:
                pass

        return data

    async def add_to_batch(self, record: Dict[str, Any]):
        """Add record to batch for processing"""
        async with self._processing_lock:
            self.batch_buffer.append(record)

            # Check if batch should be processed
            current_time = time.time()
            time_since_last_batch = current_time - self.last_batch_time

            if (
                len(self.batch_buffer) >= self.config.batch_size
                or time_since_last_batch >= self.config.batch_timeout
            ):
                await self._process_batch()

    async def _process_batch(self):
        """Process accumulated batch"""
        if not self.batch_buffer:
            return

        batch = self.batch_buffer.copy()
        self.batch_buffer.clear()
        self.last_batch_time = time.time()

        self.logger.info(f"Processing batch of {len(batch)} records")

        try:
            # Process batch
            processed_batch = await self.process_trading_data(batch)

            # Save to file
            await self._save_batch(processed_batch)

            # Emit completion event
            await self._emit_batch_completed(processed_batch)

        except Exception as e:
            self.logger.error(f"Batch processing failed: {e}")
            # Re-add to buffer for retry (simplified)
            self.batch_buffer.extend(batch)

    async def _save_batch(self, batch: List[Dict[str, Any]]):
        """Save processed batch to file"""
        if not batch:
            return

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"processed_batch_{timestamp}.jsonl"
        filepath = self.config.output_dir / filename

        try:
            with open(filepath, "w") as f:
                for record in batch:
                    f.write(json.dumps(record) + "\n")

            self.logger.info(f"Saved {len(batch)} records to {filepath}")

        except Exception as e:
            self.logger.error(f"Failed to save batch: {e}")

    async def _emit_batch_completed(self, batch: List[Dict[str, Any]]):
        """Emit batch completion event"""
        self.logger.info(f"Batch processing completed: {len(batch)} records")

    async def flush_batch(self):
        """Force process current batch"""
        async with self._processing_lock:
            if self.batch_buffer:
                await self._process_batch()

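One design consequence of add_to_batch: the size and timeout conditions are only checked when a record arrives (or on an explicit flush_batch), so a record added to an otherwise idle pipeline can sit in the buffer until the next add or flush; there is no background timer. A minimal driver, assuming mcli-framework is installed (records hypothetical):

import asyncio

from mcli.lib.services.data_pipeline import DataPipelineConfig, DataProcessor

async def main():
    config = DataPipelineConfig()
    config.batch_size = 2                 # hypothetical: flush after two records

    processor = DataProcessor(config)     # creates ./data/processed if missing

    record = {
        "politician_name": "Jane Doe",
        "transaction_date": "2024-01-15",
        "transaction_type": "purchase",
        "asset_name": "ACME Corp",
    }
    await processor.add_to_batch(record)        # buffered (1 of 2)
    await processor.add_to_batch(dict(record))  # reaches batch_size; validates,
                                                # enriches, and writes a JSONL file
    await processor.flush_batch()               # no-op here: buffer already empty

asyncio.run(main())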
class LSHDataPipeline:
    """Main integration service for LSH-mcli data pipeline"""

    def __init__(self, lsh_client: LSHClient, config: Optional[DataPipelineConfig] = None):
        self.lsh_client = lsh_client
        self.config = config or DataPipelineConfig()
        self.processor = DataProcessor(self.config)
        self.event_processor = LSHEventProcessor(lsh_client)
        self.logger = get_logger(__name__)
        self._is_running = False

        # Setup event handlers
        self._setup_pipeline_handlers()

    def _setup_pipeline_handlers(self):
        """Setup event handlers for pipeline processing"""
        self.lsh_client.on("lsh.job.completed", self._handle_job_completed)
        self.lsh_client.on("lsh.supabase.sync", self._handle_supabase_sync)
        self.lsh_client.on("trading.data.processed", self._handle_trading_data)

    async def _handle_job_completed(self, event_data: Dict[str, Any]):
        """Handle LSH job completion"""
        job_name = event_data.get("job_name", "")
        job_id = event_data.get("job_id", "")

        self.logger.info(f"Processing completed job: {job_name}")

        # Check if this is a trading-related job
        if "trading" in job_name.lower() or "politician" in job_name.lower():
            stdout = event_data.get("stdout", "")
            if stdout.strip():
                await self._process_job_output(job_id, stdout)

    async def _handle_supabase_sync(self, event_data: Dict[str, Any]):
        """Handle Supabase sync event"""
        table = event_data.get("table", "")
        operation = event_data.get("operation", "")
        data = event_data.get("data", {})

        self.logger.info(f"Processing Supabase sync: {operation} on {table}")

        processed_data = await self.processor.process_supabase_sync(table, operation, data)
        if processed_data:
            await self.processor.add_to_batch(processed_data)

    async def _handle_trading_data(self, event_data: Dict[str, Any]):
        """Handle processed trading data"""
        records = event_data.get("records", [])

        self.logger.info(f"Received {len(records)} trading records for pipeline processing")

        for record in records:
            await self.processor.add_to_batch(record)

    async def _process_job_output(self, job_id: str, output: str):
        """Process job output data"""
        try:
            # Parse output lines as JSON
            records = []
            for line in output.strip().split("\n"):
                if line.strip():
                    try:
                        record = json.loads(line)
                        record["source_job_id"] = job_id
                        records.append(record)
                    except json.JSONDecodeError:
                        continue

            if records:
                processed_records = await self.processor.process_trading_data(records)
                for record in processed_records:
                    await self.processor.add_to_batch(record)

        except Exception as e:
            self.logger.error(f"Error processing job output: {e}")

    async def start(self):
        """Start the data pipeline"""
        if self._is_running:
            self.logger.warning("Pipeline already running")
            return

        self.logger.info("Starting LSH data pipeline...")
        self._is_running = True

        try:
            # Start LSH event processing
            await self.event_processor.start_processing()

        except Exception as e:
            self.logger.error(f"Pipeline error: {e}")
            self._is_running = False
            raise

    async def stop(self):
        """Stop the data pipeline"""
        if not self._is_running:
            return

        self.logger.info("Stopping LSH data pipeline...")
        self._is_running = False

        # Flush any remaining batches
        await self.processor.flush_batch()

    async def get_stats(self) -> Dict[str, Any]:
        """Get pipeline statistics"""
        return {
            "is_running": self._is_running,
            "batch_buffer_size": len(self.processor.batch_buffer),
            "last_batch_time": self.processor.last_batch_time,
            "config": {
                "batch_size": self.config.batch_size,
                "batch_timeout": self.config.batch_timeout,
                "output_dir": str(self.config.output_dir),
            },
        }
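Putting it together, a sketch of how the pipeline appears to be wired up. LSHClient's constructor arguments are not shown in this diff, so the zero-argument construction below is an assumption; see mcli/lib/services/lsh_client.py (+441 in the file list above) for the actual API:

import asyncio

from mcli.lib.services.data_pipeline import DataPipelineConfig, LSHDataPipeline
from mcli.lib.services.lsh_client import LSHClient

async def main():
    client = LSHClient()   # assumption: defaults suffice; check lsh_client.py
    pipeline = LSHDataPipeline(client, DataPipelineConfig())

    try:
        # The constructor already registered handlers for lsh.job.completed,
        # lsh.supabase.sync, and trading.data.processed; start() hands control
        # to LSHEventProcessor.start_processing() to consume LSH events.
        await pipeline.start()
    finally:
        await pipeline.stop()             # flushes any buffered records
        print(await pipeline.get_stats())

asyncio.run(main())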