mcli_framework-7.0.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of mcli-framework might be problematic.

Files changed (186)
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
mcli/ml/preprocessing/ml_pipeline.py
@@ -0,0 +1,382 @@
+"""ML Data Pipeline Integration"""
+
+import asyncio
+import json
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+from dataclasses import dataclass, asdict
+
+import pandas as pd
+
+from mcli.lib.services.data_pipeline import LSHDataPipeline, DataPipelineConfig
+from mcli.lib.services.lsh_client import LSHClient
+from mcli.ml.configs.mlops_manager import get_mlops_manager
+from .politician_trading_preprocessor import (
+    PoliticianTradingPreprocessor,
+    PreprocessingConfig,
+    PreprocessingResults,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class MLDataPipelineConfig:
+    """Configuration for ML data pipeline"""
+
+    # Data ingestion
+    batch_size: int = 50
+    batch_timeout: int = 60  # seconds
+    max_buffer_size: int = 1000
+
+    # Preprocessing
+    preprocessing_config: Optional[PreprocessingConfig] = None
+    auto_retrain_threshold: int = 100  # New records needed to trigger retraining
+
+    # Storage
+    processed_data_dir: Path = Path("./data/ml_ready")
+    model_training_data_dir: Path = Path("./data/training")
+
+    # MLOps integration
+    enable_mlflow_logging: bool = True
+    experiment_name: str = "politician_trading_preprocessing"
+
+    def __post_init__(self):
+        if self.preprocessing_config is None:
+            self.preprocessing_config = PreprocessingConfig()
+
+        self.processed_data_dir.mkdir(parents=True, exist_ok=True)
+        self.model_training_data_dir.mkdir(parents=True, exist_ok=True)
+
+
+class MLDataPipeline:
+    """ML-focused data pipeline for politician trading data"""
+
+    def __init__(self, lsh_client: LSHClient, config: Optional[MLDataPipelineConfig] = None):
+        self.lsh_client = lsh_client
+        self.config = config or MLDataPipelineConfig()
+
+        # Initialize components
+        self.base_pipeline = LSHDataPipeline(lsh_client, DataPipelineConfig())
+        self.preprocessor = PoliticianTradingPreprocessor(self.config.preprocessing_config)
+        self.mlops_manager = get_mlops_manager()
+
+        # Data buffers
+        self.raw_data_buffer: List[Dict[str, Any]] = []
+        self.processed_data_buffer: List[Dict[str, Any]] = []
+
+        # State tracking
+        self._is_running = False
+        self._last_preprocessing_time = None
+        self._total_records_processed = 0
+
+        # Setup event handlers
+        self._setup_ml_handlers()
+
+    def _setup_ml_handlers(self):
+        """Setup ML-specific event handlers"""
+        self.lsh_client.on("trading.data.received", self._handle_trading_data_for_ml)
+        self.lsh_client.on("politician.data.updated", self._handle_politician_update)
+        self.lsh_client.on("market.data.sync", self._handle_market_data)
+
+    async def start(self):
+        """Start the ML data pipeline"""
+        if self._is_running:
+            logger.warning("ML pipeline already running")
+            return
+
+        logger.info("Starting ML data pipeline")
+        self._is_running = True
+
+        # Setup MLOps infrastructure
+        self.mlops_manager.setup()
+
+        if self.config.enable_mlflow_logging:
+            # Start MLflow experiment
+            self.mlops_manager.start_experiment_run(
+                run_name=f"preprocessing_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
+                tags={"component": "data_preprocessing", "pipeline_version": "1.0.0"},
+                description="Politician trading data preprocessing pipeline",
+            )
+
+        # Start base pipeline
+        await self.base_pipeline.start()
+
+        # Start periodic processing
+        asyncio.create_task(self._periodic_processing())
+
+    async def stop(self):
+        """Stop the ML data pipeline"""
+        if not self._is_running:
+            return
+
+        logger.info("Stopping ML data pipeline")
+        self._is_running = False
+
+        # Process any remaining data
+        await self._process_accumulated_data()
+
+        # Stop base pipeline
+        await self.base_pipeline.stop()
+
+        # End MLflow run
+        if self.config.enable_mlflow_logging:
+            self.mlops_manager.end_run()
+
+    async def _handle_trading_data_for_ml(self, event_data: Dict[str, Any]):
+        """Handle trading data for ML processing"""
+        records = event_data.get("records", [])
+
+        if not records:
+            return
+
+        logger.info(f"Received {len(records)} trading records for ML processing")
+
+        # Add to buffer
+        self.raw_data_buffer.extend(records)
+
+        # Check if we should process
+        if (
+            len(self.raw_data_buffer) >= self.config.batch_size
+            or len(self.raw_data_buffer) >= self.config.max_buffer_size
+        ):
+            await self._process_accumulated_data()
+
+    async def _handle_politician_update(self, event_data: Dict[str, Any]):
+        """Handle politician metadata updates"""
+        politician_data = event_data.get("politician", {})
+        logger.info(f"Received politician update: {politician_data.get('name', 'unknown')}")
+
+        # This could trigger reprocessing of related records
+        # For now, just log the update
+
+    async def _handle_market_data(self, event_data: Dict[str, Any]):
+        """Handle market data updates"""
+        market_data = event_data.get("market", {})
+        logger.info("Received market data update")
+
+        # This could be used to enrich existing records
+        # For now, just log the update
+
+    async def _periodic_processing(self):
+        """Periodic processing of accumulated data"""
+        while self._is_running:
+            try:
+                # Wait for timeout period
+                await asyncio.sleep(self.config.batch_timeout)
+
+                # Process if we have data
+                if self.raw_data_buffer:
+                    await self._process_accumulated_data()
+
+                # Check if we need to retrain models
+                if self._should_trigger_retraining():
+                    await self._trigger_model_retraining()
+
+            except Exception as e:
+                logger.error(f"Error in periodic processing: {e}")
+
+    async def _process_accumulated_data(self):
+        """Process accumulated raw data through ML preprocessing"""
+        if not self.raw_data_buffer:
+            return
+
+        logger.info(f"Processing {len(self.raw_data_buffer)} accumulated records")
+
+        # Take snapshot of buffer and clear it; done before the try so the handler can restore it
+        records_to_process = self.raw_data_buffer.copy()
+        self.raw_data_buffer.clear()
+
+        try:
+            # Run preprocessing
+            preprocessing_results = await self._run_preprocessing(records_to_process)
+
+            if preprocessing_results:
+                # Save processed data
+                await self._save_processed_data(preprocessing_results)
+
+                # Log to MLOps
+                if self.config.enable_mlflow_logging:
+                    await self._log_preprocessing_metrics(preprocessing_results)
+
+            # Update state
+            self._total_records_processed += len(records_to_process)
+            self._last_preprocessing_time = datetime.now()

+            logger.info(f"Successfully processed {len(records_to_process)} records")
+
+        except Exception as e:
+            logger.error(f"Error processing accumulated data: {e}")
+            # Re-add records to buffer for retry
+            self.raw_data_buffer.extend(records_to_process)
+
+    async def _run_preprocessing(
+        self, records: List[Dict[str, Any]]
+    ) -> Optional[PreprocessingResults]:
+        """Run the preprocessing pipeline"""
+        if not records:
+            return None
+
+        try:
+            # Convert to DataFrame
+            raw_df = pd.DataFrame(records)
+
+            # Add metadata
+            raw_df["pipeline_batch_id"] = datetime.now().strftime("%Y%m%d_%H%M%S")
+            raw_df["pipeline_version"] = "1.0.0"
+
+            # Run preprocessing
+            results = self.preprocessor.preprocess(raw_df)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Preprocessing failed: {e}")
+            return None
+
+    async def _save_processed_data(self, results: PreprocessingResults):
+        """Save processed data to files"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # Save train/val/test splits
+        train_path = self.config.model_training_data_dir / f"train_{timestamp}.parquet"
+        val_path = self.config.model_training_data_dir / f"val_{timestamp}.parquet"
+        test_path = self.config.model_training_data_dir / f"test_{timestamp}.parquet"
+
+        results.train_data.to_parquet(train_path)
+        results.val_data.to_parquet(val_path)
+        results.test_data.to_parquet(test_path)
+
+        # Save processed data for inference
+        processed_path = self.config.processed_data_dir / f"processed_{timestamp}.parquet"
+        all_data = pd.concat([results.train_data, results.val_data, results.test_data])
+        all_data.to_parquet(processed_path)
+
+        # Save metadata
+        metadata = {
+            "timestamp": timestamp,
+            "feature_names": results.feature_names,
+            "categorical_features": results.categorical_features,
+            "numerical_features": results.numerical_features,
+            "target_columns": results.target_columns,
+            "original_shape": results.original_shape,
+            "final_shape": results.final_shape,
+            "feature_count": results.feature_count,
+            "cleaning_stats": asdict(results.cleaning_stats),
+        }
+
+        metadata_path = self.config.processed_data_dir / f"metadata_{timestamp}.json"
+        with open(metadata_path, "w") as f:
+            json.dump(metadata, f, indent=2, default=str)
+
+        logger.info(f"Saved processed data to {processed_path}")
+
+    async def _log_preprocessing_metrics(self, results: PreprocessingResults):
+        """Log preprocessing metrics to MLOps"""
+        try:
+            # Log parameters
+            params = {
+                "batch_size": self.config.batch_size,
+                "preprocessing_version": "1.0.0",
+                "original_records": results.original_shape[0],
+                "final_records": results.final_shape[0],
+                "feature_count": results.feature_count,
+                "target_count": len(results.target_columns),
+            }
+            self.mlops_manager.log_parameters(params)
+
+            # Log metrics
+            metrics = {
+                "data_retention_rate": results.final_shape[0] / results.original_shape[0],
+                "feature_extraction_ratio": results.feature_count / results.original_shape[1],
+                "cleaning_success_rate": results.cleaning_stats.cleaned_records
+                / results.cleaning_stats.total_records,
+                "outliers_detected": results.cleaning_stats.outliers_detected,
+                "missing_values_filled": results.cleaning_stats.missing_values_filled,
+            }
+            self.mlops_manager.log_metrics(metrics)
+
+            # Log artifacts
+            if results.feature_metadata_path and results.feature_metadata_path.exists():
+                self.mlops_manager.log_artifacts(results.feature_metadata_path)
+
+        except Exception as e:
+            logger.error(f"Failed to log preprocessing metrics: {e}")
+
+    def _should_trigger_retraining(self) -> bool:
+        """Check if we should trigger model retraining"""
+        if self._total_records_processed >= self.config.auto_retrain_threshold:
+            # Reset counter
+            self._total_records_processed = 0
+            return True
+        return False
+
+    async def _trigger_model_retraining(self):
+        """Trigger model retraining"""
+        logger.info("Triggering model retraining due to data threshold")
+
+        # This would integrate with the model training pipeline
+        # For now, just emit an event
+        self.lsh_client.emit(
+            "ml.retrain.triggered",
+            {
+                "trigger_reason": "data_threshold",
+                "timestamp": datetime.now().isoformat(),
+                "records_processed": self._total_records_processed,
+            },
+        )
+
+    async def get_processing_stats(self) -> Dict[str, Any]:
+        """Get pipeline processing statistics"""
+        return {
+            "is_running": self._is_running,
+            "raw_buffer_size": len(self.raw_data_buffer),
+            "processed_buffer_size": len(self.processed_data_buffer),
+            "total_records_processed": self._total_records_processed,
+            "last_preprocessing_time": (
+                self._last_preprocessing_time.isoformat() if self._last_preprocessing_time else None
+            ),
+            "config": {
+                "batch_size": self.config.batch_size,
+                "batch_timeout": self.config.batch_timeout,
+                "auto_retrain_threshold": self.config.auto_retrain_threshold,
+            },
+        }
+
+    async def force_preprocessing(self) -> bool:
+        """Force preprocessing of current buffer"""
+        if not self.raw_data_buffer:
+            logger.warning("No data in buffer to process")
+            return False
+
+        await self._process_accumulated_data()
+        return True
+
+    async def load_historical_data(self, data_path: Path) -> bool:
+        """Load and process historical data"""
+        try:
+            if data_path.suffix == ".parquet":
+                df = pd.read_parquet(data_path)
+            elif data_path.suffix == ".csv":
+                df = pd.read_csv(data_path)
+            elif data_path.suffix == ".json":
+                df = pd.read_json(data_path)
+            else:
+                logger.error(f"Unsupported file format: {data_path.suffix}")
+                return False
+
+            # Convert to records and process
+            records = df.to_dict("records")
+            self.raw_data_buffer.extend(records)
+
+            # Process immediately
+            await self._process_accumulated_data()
+
+            logger.info(f"Loaded and processed {len(records)} historical records from {data_path}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to load historical data: {e}")
+            return False