mcli_framework-7.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/chat_cmd.py +42 -0
- mcli/app/commands_cmd.py +226 -0
- mcli/app/completion_cmd.py +216 -0
- mcli/app/completion_helpers.py +288 -0
- mcli/app/cron_test_cmd.py +697 -0
- mcli/app/logs_cmd.py +419 -0
- mcli/app/main.py +492 -0
- mcli/app/model/model.py +1060 -0
- mcli/app/model_cmd.py +227 -0
- mcli/app/redis_cmd.py +269 -0
- mcli/app/video/video.py +1114 -0
- mcli/app/visual_cmd.py +303 -0
- mcli/chat/chat.py +2409 -0
- mcli/chat/command_rag.py +514 -0
- mcli/chat/enhanced_chat.py +652 -0
- mcli/chat/system_controller.py +1010 -0
- mcli/chat/system_integration.py +1016 -0
- mcli/cli.py +25 -0
- mcli/config.toml +20 -0
- mcli/lib/api/api.py +586 -0
- mcli/lib/api/daemon_client.py +203 -0
- mcli/lib/api/daemon_client_local.py +44 -0
- mcli/lib/api/daemon_decorator.py +217 -0
- mcli/lib/api/mcli_decorators.py +1032 -0
- mcli/lib/auth/auth.py +85 -0
- mcli/lib/auth/aws_manager.py +85 -0
- mcli/lib/auth/azure_manager.py +91 -0
- mcli/lib/auth/credential_manager.py +192 -0
- mcli/lib/auth/gcp_manager.py +93 -0
- mcli/lib/auth/key_manager.py +117 -0
- mcli/lib/auth/mcli_manager.py +93 -0
- mcli/lib/auth/token_manager.py +75 -0
- mcli/lib/auth/token_util.py +1011 -0
- mcli/lib/config/config.py +47 -0
- mcli/lib/discovery/__init__.py +1 -0
- mcli/lib/discovery/command_discovery.py +274 -0
- mcli/lib/erd/erd.py +1345 -0
- mcli/lib/erd/generate_graph.py +453 -0
- mcli/lib/files/files.py +76 -0
- mcli/lib/fs/fs.py +109 -0
- mcli/lib/lib.py +29 -0
- mcli/lib/logger/logger.py +611 -0
- mcli/lib/performance/optimizer.py +409 -0
- mcli/lib/performance/rust_bridge.py +502 -0
- mcli/lib/performance/uvloop_config.py +154 -0
- mcli/lib/pickles/pickles.py +50 -0
- mcli/lib/search/cached_vectorizer.py +479 -0
- mcli/lib/services/data_pipeline.py +460 -0
- mcli/lib/services/lsh_client.py +441 -0
- mcli/lib/services/redis_service.py +387 -0
- mcli/lib/shell/shell.py +137 -0
- mcli/lib/toml/toml.py +33 -0
- mcli/lib/ui/styling.py +47 -0
- mcli/lib/ui/visual_effects.py +634 -0
- mcli/lib/watcher/watcher.py +185 -0
- mcli/ml/api/app.py +215 -0
- mcli/ml/api/middleware.py +224 -0
- mcli/ml/api/routers/admin_router.py +12 -0
- mcli/ml/api/routers/auth_router.py +244 -0
- mcli/ml/api/routers/backtest_router.py +12 -0
- mcli/ml/api/routers/data_router.py +12 -0
- mcli/ml/api/routers/model_router.py +302 -0
- mcli/ml/api/routers/monitoring_router.py +12 -0
- mcli/ml/api/routers/portfolio_router.py +12 -0
- mcli/ml/api/routers/prediction_router.py +267 -0
- mcli/ml/api/routers/trade_router.py +12 -0
- mcli/ml/api/routers/websocket_router.py +76 -0
- mcli/ml/api/schemas.py +64 -0
- mcli/ml/auth/auth_manager.py +425 -0
- mcli/ml/auth/models.py +154 -0
- mcli/ml/auth/permissions.py +302 -0
- mcli/ml/backtesting/backtest_engine.py +502 -0
- mcli/ml/backtesting/performance_metrics.py +393 -0
- mcli/ml/cache.py +400 -0
- mcli/ml/cli/main.py +398 -0
- mcli/ml/config/settings.py +394 -0
- mcli/ml/configs/dvc_config.py +230 -0
- mcli/ml/configs/mlflow_config.py +131 -0
- mcli/ml/configs/mlops_manager.py +293 -0
- mcli/ml/dashboard/app.py +532 -0
- mcli/ml/dashboard/app_integrated.py +738 -0
- mcli/ml/dashboard/app_supabase.py +560 -0
- mcli/ml/dashboard/app_training.py +615 -0
- mcli/ml/dashboard/cli.py +51 -0
- mcli/ml/data_ingestion/api_connectors.py +501 -0
- mcli/ml/data_ingestion/data_pipeline.py +567 -0
- mcli/ml/data_ingestion/stream_processor.py +512 -0
- mcli/ml/database/migrations/env.py +94 -0
- mcli/ml/database/models.py +667 -0
- mcli/ml/database/session.py +200 -0
- mcli/ml/experimentation/ab_testing.py +845 -0
- mcli/ml/features/ensemble_features.py +607 -0
- mcli/ml/features/political_features.py +676 -0
- mcli/ml/features/recommendation_engine.py +809 -0
- mcli/ml/features/stock_features.py +573 -0
- mcli/ml/features/test_feature_engineering.py +346 -0
- mcli/ml/logging.py +85 -0
- mcli/ml/mlops/data_versioning.py +518 -0
- mcli/ml/mlops/experiment_tracker.py +377 -0
- mcli/ml/mlops/model_serving.py +481 -0
- mcli/ml/mlops/pipeline_orchestrator.py +614 -0
- mcli/ml/models/base_models.py +324 -0
- mcli/ml/models/ensemble_models.py +675 -0
- mcli/ml/models/recommendation_models.py +474 -0
- mcli/ml/models/test_models.py +487 -0
- mcli/ml/monitoring/drift_detection.py +676 -0
- mcli/ml/monitoring/metrics.py +45 -0
- mcli/ml/optimization/portfolio_optimizer.py +834 -0
- mcli/ml/preprocessing/data_cleaners.py +451 -0
- mcli/ml/preprocessing/feature_extractors.py +491 -0
- mcli/ml/preprocessing/ml_pipeline.py +382 -0
- mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
- mcli/ml/preprocessing/test_preprocessing.py +294 -0
- mcli/ml/scripts/populate_sample_data.py +200 -0
- mcli/ml/tasks.py +400 -0
- mcli/ml/tests/test_integration.py +429 -0
- mcli/ml/tests/test_training_dashboard.py +387 -0
- mcli/public/oi/oi.py +15 -0
- mcli/public/public.py +4 -0
- mcli/self/self_cmd.py +1246 -0
- mcli/workflow/daemon/api_daemon.py +800 -0
- mcli/workflow/daemon/async_command_database.py +681 -0
- mcli/workflow/daemon/async_process_manager.py +591 -0
- mcli/workflow/daemon/client.py +530 -0
- mcli/workflow/daemon/commands.py +1196 -0
- mcli/workflow/daemon/daemon.py +905 -0
- mcli/workflow/daemon/daemon_api.py +59 -0
- mcli/workflow/daemon/enhanced_daemon.py +571 -0
- mcli/workflow/daemon/process_cli.py +244 -0
- mcli/workflow/daemon/process_manager.py +439 -0
- mcli/workflow/daemon/test_daemon.py +275 -0
- mcli/workflow/dashboard/dashboard_cmd.py +113 -0
- mcli/workflow/docker/docker.py +0 -0
- mcli/workflow/file/file.py +100 -0
- mcli/workflow/gcloud/config.toml +21 -0
- mcli/workflow/gcloud/gcloud.py +58 -0
- mcli/workflow/git_commit/ai_service.py +328 -0
- mcli/workflow/git_commit/commands.py +430 -0
- mcli/workflow/lsh_integration.py +355 -0
- mcli/workflow/model_service/client.py +594 -0
- mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
- mcli/workflow/model_service/lightweight_embedder.py +397 -0
- mcli/workflow/model_service/lightweight_model_server.py +714 -0
- mcli/workflow/model_service/lightweight_test.py +241 -0
- mcli/workflow/model_service/model_service.py +1955 -0
- mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
- mcli/workflow/model_service/pdf_processor.py +386 -0
- mcli/workflow/model_service/test_efficient_runner.py +234 -0
- mcli/workflow/model_service/test_example.py +315 -0
- mcli/workflow/model_service/test_integration.py +131 -0
- mcli/workflow/model_service/test_new_features.py +149 -0
- mcli/workflow/openai/openai.py +99 -0
- mcli/workflow/politician_trading/commands.py +1790 -0
- mcli/workflow/politician_trading/config.py +134 -0
- mcli/workflow/politician_trading/connectivity.py +490 -0
- mcli/workflow/politician_trading/data_sources.py +395 -0
- mcli/workflow/politician_trading/database.py +410 -0
- mcli/workflow/politician_trading/demo.py +248 -0
- mcli/workflow/politician_trading/models.py +165 -0
- mcli/workflow/politician_trading/monitoring.py +413 -0
- mcli/workflow/politician_trading/scrapers.py +966 -0
- mcli/workflow/politician_trading/scrapers_california.py +412 -0
- mcli/workflow/politician_trading/scrapers_eu.py +377 -0
- mcli/workflow/politician_trading/scrapers_uk.py +350 -0
- mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
- mcli/workflow/politician_trading/supabase_functions.py +354 -0
- mcli/workflow/politician_trading/workflow.py +852 -0
- mcli/workflow/registry/registry.py +180 -0
- mcli/workflow/repo/repo.py +223 -0
- mcli/workflow/scheduler/commands.py +493 -0
- mcli/workflow/scheduler/cron_parser.py +238 -0
- mcli/workflow/scheduler/job.py +182 -0
- mcli/workflow/scheduler/monitor.py +139 -0
- mcli/workflow/scheduler/persistence.py +324 -0
- mcli/workflow/scheduler/scheduler.py +679 -0
- mcli/workflow/sync/sync_cmd.py +437 -0
- mcli/workflow/sync/test_cmd.py +314 -0
- mcli/workflow/videos/videos.py +242 -0
- mcli/workflow/wakatime/wakatime.py +11 -0
- mcli/workflow/workflow.py +37 -0
- mcli_framework-7.0.0.dist-info/METADATA +479 -0
- mcli_framework-7.0.0.dist-info/RECORD +186 -0
- mcli_framework-7.0.0.dist-info/WHEEL +5 -0
- mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
- mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
- mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
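
For reference, the listing above can be reproduced locally with only the standard library, assuming the wheel has first been downloaded (e.g. via `python -m pip download mcli-framework==7.0.0 --no-deps -d wheels/`); the `wheels/` path here is an assumption, not part of the registry page:

# Hypothetical sketch: list a downloaded wheel's contents with zipfile.
import zipfile

with zipfile.ZipFile("wheels/mcli_framework-7.0.0-py3-none-any.whl") as whl:
    for name in whl.namelist():
        print(name)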
mcli/ml/mlops/data_versioning.py
@@ -0,0 +1,518 @@
"""DVC integration for data versioning and pipeline management"""

import subprocess
import json
import yaml
from pathlib import Path
from typing import Dict, Any, Optional, List, Union
from dataclasses import dataclass
import hashlib
import logging
from datetime import datetime
import pandas as pd

logger = logging.getLogger(__name__)


@dataclass
class DVCConfig:
    """DVC configuration"""
    project_root: Path = Path(".")
    remote_storage: str = "s3://my-bucket/dvc-storage"  # or local path
    cache_dir: Path = Path(".dvc/cache")
    auto_commit: bool = True
    verbose: bool = True


class DataVersionControl:
    """DVC wrapper for data versioning"""

    def __init__(self, config: DVCConfig):
        self.config = config
        self.project_root = config.project_root
        self._ensure_dvc_initialized()

    def _ensure_dvc_initialized(self):
        """Ensure DVC is initialized in project"""
        dvc_dir = self.project_root / ".dvc"

        if not dvc_dir.exists():
            logger.info("Initializing DVC...")
            self._run_command("dvc init")

            # Configure remote storage
            if self.config.remote_storage:
                self._run_command(f"dvc remote add -d storage {self.config.remote_storage}")

    def _run_command(self, command: str) -> str:
        """Run DVC command"""
        try:
            # Naive whitespace split: assumes no argument contains spaces
            result = subprocess.run(
                command.split(),
                capture_output=True,
                text=True,
                cwd=self.project_root
            )

            if result.returncode != 0:
                logger.error(f"DVC command failed: {result.stderr}")
                raise RuntimeError(f"DVC command failed: {command}")

            if self.config.verbose:
                logger.debug(f"DVC: {command} -> {result.stdout}")

            return result.stdout
        except Exception as e:
            logger.error(f"Failed to run DVC command: {e}")
            raise

    def add_data(self, data_path: Union[str, Path],
                 description: Optional[str] = None) -> str:
        """Add data file or directory to DVC tracking"""
        data_path = Path(data_path)

        if not data_path.exists():
            raise FileNotFoundError(f"Data path not found: {data_path}")

        # Add to DVC
        self._run_command(f"dvc add {data_path}")

        # Generate metadata
        metadata = self._generate_metadata(data_path, description)
        metadata_path = data_path.with_suffix(".meta.json")

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Commit if auto-commit enabled
        if self.config.auto_commit:
            self._commit_changes(f"Add data: {data_path.name}")

        logger.info(f"Added {data_path} to DVC tracking")
        return str(data_path) + ".dvc"

    def push_data(self):
        """Push data to remote storage"""
        logger.info("Pushing data to remote...")
        self._run_command("dvc push")

    def pull_data(self):
        """Pull data from remote storage"""
        logger.info("Pulling data from remote...")
        self._run_command("dvc pull")

    def checkout(self, version: Optional[str] = None):
        """Checkout specific data version"""
        if version:
            self._run_command(f"git checkout {version}")

        self._run_command("dvc checkout")
        logger.info(f"Checked out data version: {version or 'latest'}")

    def get_data_status(self) -> Dict[str, Any]:
        """Get status of tracked data"""
        status_output = self._run_command("dvc status")

        # Parse status
        status = {
            "modified": [],
            "not_in_cache": [],
            "deleted": []
        }

        for line in status_output.split('\n'):
            if "modified:" in line:
                status["modified"].append(line.split(":")[-1].strip())
            elif "not in cache:" in line:
                status["not_in_cache"].append(line.split(":")[-1].strip())
            elif "deleted:" in line:
                status["deleted"].append(line.split(":")[-1].strip())

        return status

    def _generate_metadata(self, data_path: Path,
                           description: Optional[str] = None) -> Dict[str, Any]:
        """Generate metadata for data file"""
        stat = data_path.stat()

        metadata = {
            "path": str(data_path),
            "size": stat.st_size,
            "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": self._calculate_hash(data_path),
            "description": description or "",
            "type": "directory" if data_path.is_dir() else "file"
        }

        # Add data-specific metadata (best effort: skipped if the file cannot be parsed)
        if data_path.suffix in ['.csv', '.parquet']:
            try:
                df = pd.read_csv(data_path) if data_path.suffix == '.csv' else pd.read_parquet(data_path)
                metadata["rows"] = len(df)
                metadata["columns"] = len(df.columns)
                metadata["column_names"] = df.columns.tolist()
            except Exception:
                pass

        return metadata

    def _calculate_hash(self, file_path: Path) -> str:
        """Calculate file hash"""
        if file_path.is_dir():
            return "directory"

        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _commit_changes(self, message: str):
        """Commit changes to git"""
        subprocess.run(["git", "add", "-A"], cwd=self.project_root)
        subprocess.run(["git", "commit", "-m", message], cwd=self.project_root)


class DVCPipeline:
    """DVC pipeline management"""

    def __init__(self, config: DVCConfig):
        self.config = config
        self.dvc = DataVersionControl(config)
        self.pipeline_file = config.project_root / "dvc.yaml"
        self.params_file = config.project_root / "params.yaml"

    def create_pipeline(self, stages: List[Dict[str, Any]]):
        """Create DVC pipeline"""
        pipeline = {"stages": {}}

        for stage in stages:
            stage_name = stage["name"]
            pipeline["stages"][stage_name] = {
                "cmd": stage["cmd"],
                "deps": stage.get("deps", []),
                "params": stage.get("params", []),
                "outs": stage.get("outs", []),
                "metrics": stage.get("metrics", []),
                "plots": stage.get("plots", [])
            }

        # Save pipeline
        with open(self.pipeline_file, 'w') as f:
            yaml.dump(pipeline, f, default_flow_style=False)

        logger.info(f"Created DVC pipeline with {len(stages)} stages")

    def add_stage(self, name: str, cmd: str,
                  deps: Optional[List[str]] = None,
                  params: Optional[List[str]] = None,
                  outs: Optional[List[str]] = None,
                  metrics: Optional[List[str]] = None):
        """Add stage to pipeline"""
        stage_config = {
            "cmd": cmd,
            "deps": deps or [],
            "params": params or [],
            "outs": outs or [],
            "metrics": metrics or []
        }

        # Load existing pipeline
        if self.pipeline_file.exists():
            with open(self.pipeline_file, 'r') as f:
                pipeline = yaml.safe_load(f) or {"stages": {}}
        else:
            pipeline = {"stages": {}}

        # Add stage
        pipeline["stages"][name] = stage_config

        # Save pipeline
        with open(self.pipeline_file, 'w') as f:
            yaml.dump(pipeline, f, default_flow_style=False)

        logger.info(f"Added stage '{name}' to pipeline")

    def run_pipeline(self, stage: Optional[str] = None):
        """Run DVC pipeline"""
        if stage:
            cmd = f"dvc repro {stage}"
        else:
            cmd = "dvc repro"

        logger.info(f"Running DVC pipeline: {cmd}")
        self.dvc._run_command(cmd)

    def get_metrics(self) -> Dict[str, Any]:
        """Get pipeline metrics"""
        metrics_output = self.dvc._run_command("dvc metrics show")

        # Parse metrics (simplified): numeric values become floats, the rest stay strings
        metrics = {}
        for line in metrics_output.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                try:
                    metrics[key.strip()] = float(value.strip())
                except ValueError:
                    metrics[key.strip()] = value.strip()

        return metrics

    def create_ml_pipeline(self):
        """Create standard ML pipeline"""
        stages = [
            {
                "name": "data_preparation",
                "cmd": "python src/prepare_data.py",
                "deps": ["data/raw"],
                "outs": ["data/processed"],
                "params": ["prepare.test_split", "prepare.seed"]
            },
            {
                "name": "feature_engineering",
                "cmd": "python src/featurize.py",
                "deps": ["data/processed"],
                "outs": ["data/features"],
                "params": ["featurize.max_features", "featurize.ngrams"]
            },
            {
                "name": "train",
                "cmd": "python src/train.py",
                "deps": ["data/features"],
                "outs": ["models/model.pkl"],
                "params": ["train.epochs", "train.learning_rate"],
                "metrics": [{"metrics.json": {"cache": False}}]
            },
            {
                "name": "evaluate",
                "cmd": "python src/evaluate.py",
                "deps": ["models/model.pkl", "data/features"],
                "metrics": [{"eval/metrics.json": {"cache": False}}],
                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}]
            }
        ]

        self.create_pipeline(stages)

        # Create default params file
        params = {
            "prepare": {
                "test_split": 0.2,
                "seed": 42
            },
            "featurize": {
                "max_features": 100,
                "ngrams": 2
            },
            "train": {
                "epochs": 10,
                "learning_rate": 0.001
            }
        }

        with open(self.params_file, 'w') as f:
            yaml.dump(params, f, default_flow_style=False)

        logger.info("Created ML pipeline with DVC")


class DataRegistry:
    """Central registry for versioned datasets"""

    def __init__(self, registry_path: Path = Path("data_registry.json")):
        self.registry_path = registry_path
        self.registry = self._load_registry()

    def _load_registry(self) -> Dict[str, Any]:
        """Load data registry"""
        if self.registry_path.exists():
            with open(self.registry_path, 'r') as f:
                return json.load(f)
        return {"datasets": {}}

    def _save_registry(self):
        """Save data registry"""
        with open(self.registry_path, 'w') as f:
            json.dump(self.registry, f, indent=2)

    def register_dataset(self, name: str, path: str,
                         version: str, metadata: Dict[str, Any]):
        """Register new dataset version"""
        if name not in self.registry["datasets"]:
            self.registry["datasets"][name] = {"versions": {}}

        self.registry["datasets"][name]["versions"][version] = {
            "path": path,
            "metadata": metadata,
            "registered": datetime.now().isoformat()
        }

        self.registry["datasets"][name]["latest"] = version
        self._save_registry()

        logger.info(f"Registered dataset '{name}' version '{version}'")

    def get_dataset(self, name: str, version: Optional[str] = None) -> Dict[str, Any]:
        """Get dataset information"""
        if name not in self.registry["datasets"]:
            raise ValueError(f"Dataset '{name}' not found")

        dataset = self.registry["datasets"][name]
        version = version or dataset["latest"]

        if version not in dataset["versions"]:
            raise ValueError(f"Version '{version}' not found for dataset '{name}'")

        return dataset["versions"][version]

    def list_datasets(self) -> List[str]:
        """List all registered datasets"""
        return list(self.registry["datasets"].keys())

    def list_versions(self, name: str) -> List[str]:
        """List all versions of a dataset"""
        if name not in self.registry["datasets"]:
            raise ValueError(f"Dataset '{name}' not found")

        return list(self.registry["datasets"][name]["versions"].keys())


def create_dvc_config():
    """Create DVC configuration files"""

    # Create .dvc/.gitignore
    dvc_gitignore = """
/config.local
/tmp
/cache
"""

    # Create .dvcignore
    dvcignore = """
# Python
__pycache__
*.pyc
*.pyo
*.pyd
.pytest_cache
.coverage
htmlcov

# Jupyter
.ipynb_checkpoints
*.ipynb

# IDE
.vscode
.idea
*.swp
.DS_Store

# Temporary files
/tmp
/temp
*.tmp
"""

    # Create dvc.yaml template
    dvc_yaml = """
stages:
  prepare_data:
    cmd: python src/ml/preprocessing/prepare_data.py
    deps:
      - src/ml/preprocessing/prepare_data.py
      - data/raw
    outs:
      - data/processed
    params:
      - prepare.split_ratio
      - prepare.random_seed

  train_model:
    cmd: python src/ml/models/train.py
    deps:
      - src/ml/models/train.py
      - data/processed
    outs:
      - models/model.pkl
    params:
      - train.epochs
      - train.batch_size
      - train.learning_rate
    metrics:
      - metrics/train_metrics.json:
          cache: false

  evaluate:
    cmd: python src/ml/evaluate.py
    deps:
      - src/ml/evaluate.py
      - models/model.pkl
      - data/processed
    metrics:
      - metrics/eval_metrics.json:
          cache: false
    plots:
      - metrics/confusion_matrix.csv:
          template: confusion
          x: actual
          y: predicted
"""

    # Create params.yaml template
    params_yaml = """
prepare:
  split_ratio: 0.2
  random_seed: 42

train:
  epochs: 100
  batch_size: 32
  learning_rate: 0.001
  dropout_rate: 0.3

evaluate:
  confidence_threshold: 0.6
  metrics:
    - accuracy
    - precision
    - recall
    - f1_score
"""

    return {
        ".dvc/.gitignore": dvc_gitignore,
        ".dvcignore": dvcignore,
        "dvc.yaml": dvc_yaml,
        "params.yaml": params_yaml
    }


# Example usage
if __name__ == "__main__":
    # Initialize DVC
    config = DVCConfig()
    dvc = DataVersionControl(config)

    # Create data registry
    registry = DataRegistry()

    # Add some data
    dvc.add_data("data/politician_trades.csv", "Politician trading data v1")

    # Register in registry
    registry.register_dataset(
        name="politician_trades",
        path="data/politician_trades.csv",
        version="v1.0",
        metadata={"source": "congress", "records": 10000}
    )

    # Create ML pipeline
    pipeline = DVCPipeline(config)
    pipeline.create_ml_pipeline()

    logger.info("DVC setup complete")