mcli_framework-7.0.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Potentially problematic release: this version of mcli-framework has been flagged as potentially problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/preprocessing/ml_pipeline.py (new file)

@@ -0,0 +1,382 @@
"""ML Data Pipeline Integration"""

import asyncio
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, asdict

import pandas as pd

from mcli.lib.services.data_pipeline import LSHDataPipeline, DataPipelineConfig
from mcli.lib.services.lsh_client import LSHClient
from mcli.ml.configs.mlops_manager import get_mlops_manager
from .politician_trading_preprocessor import (
    PoliticianTradingPreprocessor,
    PreprocessingConfig,
    PreprocessingResults,
)

logger = logging.getLogger(__name__)


@dataclass
class MLDataPipelineConfig:
    """Configuration for ML data pipeline"""

    # Data ingestion
    batch_size: int = 50
    batch_timeout: int = 60  # seconds
    max_buffer_size: int = 1000

    # Preprocessing
    preprocessing_config: Optional[PreprocessingConfig] = None
    auto_retrain_threshold: int = 100  # New records needed to trigger retraining

    # Storage
    processed_data_dir: Path = Path("./data/ml_ready")
    model_training_data_dir: Path = Path("./data/training")

    # MLOps integration
    enable_mlflow_logging: bool = True
    experiment_name: str = "politician_trading_preprocessing"

    def __post_init__(self):
        if self.preprocessing_config is None:
            self.preprocessing_config = PreprocessingConfig()

        self.processed_data_dir.mkdir(parents=True, exist_ok=True)
        self.model_training_data_dir.mkdir(parents=True, exist_ok=True)


class MLDataPipeline:
    """ML-focused data pipeline for politician trading data"""

    def __init__(self, lsh_client: LSHClient, config: Optional[MLDataPipelineConfig] = None):
        self.lsh_client = lsh_client
        self.config = config or MLDataPipelineConfig()

        # Initialize components
        self.base_pipeline = LSHDataPipeline(lsh_client, DataPipelineConfig())
        self.preprocessor = PoliticianTradingPreprocessor(self.config.preprocessing_config)
        self.mlops_manager = get_mlops_manager()

        # Data buffers
        self.raw_data_buffer: List[Dict[str, Any]] = []
        self.processed_data_buffer: List[Dict[str, Any]] = []

        # State tracking
        self._is_running = False
        self._last_preprocessing_time = None
        self._total_records_processed = 0

        # Setup event handlers
        self._setup_ml_handlers()

    def _setup_ml_handlers(self):
        """Setup ML-specific event handlers"""
        self.lsh_client.on("trading.data.received", self._handle_trading_data_for_ml)
        self.lsh_client.on("politician.data.updated", self._handle_politician_update)
        self.lsh_client.on("market.data.sync", self._handle_market_data)

    async def start(self):
        """Start the ML data pipeline"""
        if self._is_running:
            logger.warning("ML pipeline already running")
            return

        logger.info("Starting ML data pipeline")
        self._is_running = True

        # Setup MLOps infrastructure
        self.mlops_manager.setup()

        if self.config.enable_mlflow_logging:
            # Start MLflow experiment
            self.mlops_manager.start_experiment_run(
                run_name=f"preprocessing_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                tags={"component": "data_preprocessing", "pipeline_version": "1.0.0"},
                description="Politician trading data preprocessing pipeline",
            )

        # Start base pipeline
        await self.base_pipeline.start()

        # Start periodic processing
        asyncio.create_task(self._periodic_processing())

    async def stop(self):
        """Stop the ML data pipeline"""
        if not self._is_running:
            return

        logger.info("Stopping ML data pipeline")
        self._is_running = False

        # Process any remaining data
        await self._process_accumulated_data()

        # Stop base pipeline
        await self.base_pipeline.stop()

        # End MLflow run
        if self.config.enable_mlflow_logging:
            self.mlops_manager.end_run()

    async def _handle_trading_data_for_ml(self, event_data: Dict[str, Any]):
        """Handle trading data for ML processing"""
        records = event_data.get("records", [])

        if not records:
            return

        logger.info(f"Received {len(records)} trading records for ML processing")

        # Add to buffer
        self.raw_data_buffer.extend(records)

        # Check if we should process
        if (
            len(self.raw_data_buffer) >= self.config.batch_size
            or len(self.raw_data_buffer) >= self.config.max_buffer_size
        ):
            await self._process_accumulated_data()

    async def _handle_politician_update(self, event_data: Dict[str, Any]):
        """Handle politician metadata updates"""
        politician_data = event_data.get("politician", {})
        logger.info(f"Received politician update: {politician_data.get('name', 'unknown')}")

        # This could trigger reprocessing of related records
        # For now, just log the update

    async def _handle_market_data(self, event_data: Dict[str, Any]):
        """Handle market data updates"""
        market_data = event_data.get("market", {})
        logger.info(f"Received market data update")

        # This could be used to enrich existing records
        # For now, just log the update

    async def _periodic_processing(self):
        """Periodic processing of accumulated data"""
        while self._is_running:
            try:
                # Wait for timeout period
                await asyncio.sleep(self.config.batch_timeout)

                # Process if we have data
                if self.raw_data_buffer:
                    await self._process_accumulated_data()

                # Check if we need to retrain models
                if self._should_trigger_retraining():
                    await self._trigger_model_retraining()

            except Exception as e:
                logger.error(f"Error in periodic processing: {e}")

    async def _process_accumulated_data(self):
        """Process accumulated raw data through ML preprocessing"""
        if not self.raw_data_buffer:
            return

        logger.info(f"Processing {len(self.raw_data_buffer)} accumulated records")

        try:
            # Take snapshot of buffer and clear it
            records_to_process = self.raw_data_buffer.copy()
            self.raw_data_buffer.clear()

            # Run preprocessing
            preprocessing_results = await self._run_preprocessing(records_to_process)

            if preprocessing_results:
                # Save processed data
                await self._save_processed_data(preprocessing_results)

                # Log to MLOps
                if self.config.enable_mlflow_logging:
                    await self._log_preprocessing_metrics(preprocessing_results)

                # Update state
                self._total_records_processed += len(records_to_process)
                self._last_preprocessing_time = datetime.now()

                logger.info(f"Successfully processed {len(records_to_process)} records")

        except Exception as e:
            logger.error(f"Error processing accumulated data: {e}")
            # Re-add records to buffer for retry
            self.raw_data_buffer.extend(records_to_process)

    async def _run_preprocessing(
        self, records: List[Dict[str, Any]]
    ) -> Optional[PreprocessingResults]:
        """Run the preprocessing pipeline"""
        if not records:
            return None

        try:
            # Convert to DataFrame
            raw_df = pd.DataFrame(records)

            # Add metadata
            raw_df["pipeline_batch_id"] = datetime.now().strftime("%Y%m%d_%H%M%S")
            raw_df["pipeline_version"] = "1.0.0"

            # Run preprocessing
            results = self.preprocessor.preprocess(raw_df)

            return results

        except Exception as e:
            logger.error(f"Preprocessing failed: {e}")
            return None

    async def _save_processed_data(self, results: PreprocessingResults):
        """Save processed data to files"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save train/val/test splits
        train_path = self.config.model_training_data_dir / f"train_{timestamp}.parquet"
        val_path = self.config.model_training_data_dir / f"val_{timestamp}.parquet"
        test_path = self.config.model_training_data_dir / f"test_{timestamp}.parquet"

        results.train_data.to_parquet(train_path)
        results.val_data.to_parquet(val_path)
        results.test_data.to_parquet(test_path)

        # Save processed data for inference
        processed_path = self.config.processed_data_dir / f"processed_{timestamp}.parquet"
        all_data = pd.concat([results.train_data, results.val_data, results.test_data])
        all_data.to_parquet(processed_path)

        # Save metadata
        metadata = {
            "timestamp": timestamp,
            "feature_names": results.feature_names,
            "categorical_features": results.categorical_features,
            "numerical_features": results.numerical_features,
            "target_columns": results.target_columns,
            "original_shape": results.original_shape,
            "final_shape": results.final_shape,
            "feature_count": results.feature_count,
            "cleaning_stats": asdict(results.cleaning_stats),
        }

        metadata_path = self.config.processed_data_dir / f"metadata_{timestamp}.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2, default=str)

        logger.info(f"Saved processed data to {processed_path}")

    async def _log_preprocessing_metrics(self, results: PreprocessingResults):
        """Log preprocessing metrics to MLOps"""
        try:
            # Log parameters
            params = {
                "batch_size": self.config.batch_size,
                "preprocessing_version": "1.0.0",
                "original_records": results.original_shape[0],
                "final_records": results.final_shape[0],
                "feature_count": results.feature_count,
                "target_count": len(results.target_columns),
            }
            self.mlops_manager.log_parameters(params)

            # Log metrics
            metrics = {
                "data_retention_rate": results.final_shape[0] / results.original_shape[0],
                "feature_extraction_ratio": results.feature_count / results.original_shape[1],
                "cleaning_success_rate": results.cleaning_stats.cleaned_records
                / results.cleaning_stats.total_records,
                "outliers_detected": results.cleaning_stats.outliers_detected,
                "missing_values_filled": results.cleaning_stats.missing_values_filled,
            }
            self.mlops_manager.log_metrics(metrics)

            # Log artifacts
            if results.feature_metadata_path and results.feature_metadata_path.exists():
                self.mlops_manager.log_artifacts(results.feature_metadata_path)

        except Exception as e:
            logger.error(f"Failed to log preprocessing metrics: {e}")

    def _should_trigger_retraining(self) -> bool:
        """Check if we should trigger model retraining"""
        if self._total_records_processed >= self.config.auto_retrain_threshold:
            # Reset counter
            self._total_records_processed = 0
            return True
        return False

    async def _trigger_model_retraining(self):
        """Trigger model retraining"""
        logger.info("Triggering model retraining due to data threshold")

        # This would integrate with the model training pipeline
        # For now, just emit an event
        self.lsh_client.emit(
            "ml.retrain.triggered",
            {
                "trigger_reason": "data_threshold",
                "timestamp": datetime.now().isoformat(),
                "records_processed": self._total_records_processed,
            },
        )

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get pipeline processing statistics"""
        return {
            "is_running": self._is_running,
            "raw_buffer_size": len(self.raw_data_buffer),
            "processed_buffer_size": len(self.processed_data_buffer),
            "total_records_processed": self._total_records_processed,
            "last_preprocessing_time": (
                self._last_preprocessing_time.isoformat() if self._last_preprocessing_time else None
            ),
            "config": {
                "batch_size": self.config.batch_size,
                "batch_timeout": self.config.batch_timeout,
                "auto_retrain_threshold": self.config.auto_retrain_threshold,
            },
        }

    async def force_preprocessing(self) -> bool:
        """Force preprocessing of current buffer"""
        if not self.raw_data_buffer:
            logger.warning("No data in buffer to process")
            return False

        await self._process_accumulated_data()
        return True

    async def load_historical_data(self, data_path: Path) -> bool:
        """Load and process historical data"""
        try:
            if data_path.suffix == ".parquet":
                df = pd.read_parquet(data_path)
            elif data_path.suffix == ".csv":
                df = pd.read_csv(data_path)
            elif data_path.suffix == ".json":
                df = pd.read_json(data_path)
            else:
                logger.error(f"Unsupported file format: {data_path.suffix}")
                return False

            # Convert to records and process
            records = df.to_dict("records")
            self.raw_data_buffer.extend(records)

            # Process immediately
            await self._process_accumulated_data()

            logger.info(f"Loaded and processed {len(records)} historical records from {data_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to load historical data: {e}")
            return False
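
For orientation, below is a minimal usage sketch of the MLDataPipeline class added in this file. It is hypothetical wiring, not part of the package: it assumes LSHClient can be constructed with no arguments (its real signature lives in mcli/lib/services/lsh_client.py and is not shown in this diff), that the module is importable as mcli.ml.preprocessing.ml_pipeline (inferred from the RECORD listing above), and the data file path is purely illustrative.

import asyncio
from pathlib import Path

from mcli.lib.services.lsh_client import LSHClient
from mcli.ml.preprocessing.ml_pipeline import MLDataPipeline, MLDataPipelineConfig


async def main():
    # Assumption: LSHClient supports default construction.
    client = LSHClient()

    # Smaller batches and a shorter flush interval than the defaults (50 / 60s).
    config = MLDataPipelineConfig(batch_size=25, batch_timeout=30)
    pipeline = MLDataPipeline(client, config)

    # start() registers the MLflow run, starts the base LSH pipeline, and
    # spawns the periodic-processing task; from here on, incoming
    # "trading.data.received" events fill the buffer automatically.
    await pipeline.start()

    # Backfill from a historical export (hypothetical path); this loads the
    # file into the buffer and processes it immediately.
    await pipeline.load_historical_data(Path("./data/historical_trades.parquet"))

    print(await pipeline.get_processing_stats())

    # stop() flushes any remaining buffered records and ends the MLflow run.
    await pipeline.stop()


asyncio.run(main())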