mcli-framework 7.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcli-framework might be problematic. Click here for more details.

Files changed (186) hide show
  1. mcli/app/chat_cmd.py +42 -0
  2. mcli/app/commands_cmd.py +226 -0
  3. mcli/app/completion_cmd.py +216 -0
  4. mcli/app/completion_helpers.py +288 -0
  5. mcli/app/cron_test_cmd.py +697 -0
  6. mcli/app/logs_cmd.py +419 -0
  7. mcli/app/main.py +492 -0
  8. mcli/app/model/model.py +1060 -0
  9. mcli/app/model_cmd.py +227 -0
  10. mcli/app/redis_cmd.py +269 -0
  11. mcli/app/video/video.py +1114 -0
  12. mcli/app/visual_cmd.py +303 -0
  13. mcli/chat/chat.py +2409 -0
  14. mcli/chat/command_rag.py +514 -0
  15. mcli/chat/enhanced_chat.py +652 -0
  16. mcli/chat/system_controller.py +1010 -0
  17. mcli/chat/system_integration.py +1016 -0
  18. mcli/cli.py +25 -0
  19. mcli/config.toml +20 -0
  20. mcli/lib/api/api.py +586 -0
  21. mcli/lib/api/daemon_client.py +203 -0
  22. mcli/lib/api/daemon_client_local.py +44 -0
  23. mcli/lib/api/daemon_decorator.py +217 -0
  24. mcli/lib/api/mcli_decorators.py +1032 -0
  25. mcli/lib/auth/auth.py +85 -0
  26. mcli/lib/auth/aws_manager.py +85 -0
  27. mcli/lib/auth/azure_manager.py +91 -0
  28. mcli/lib/auth/credential_manager.py +192 -0
  29. mcli/lib/auth/gcp_manager.py +93 -0
  30. mcli/lib/auth/key_manager.py +117 -0
  31. mcli/lib/auth/mcli_manager.py +93 -0
  32. mcli/lib/auth/token_manager.py +75 -0
  33. mcli/lib/auth/token_util.py +1011 -0
  34. mcli/lib/config/config.py +47 -0
  35. mcli/lib/discovery/__init__.py +1 -0
  36. mcli/lib/discovery/command_discovery.py +274 -0
  37. mcli/lib/erd/erd.py +1345 -0
  38. mcli/lib/erd/generate_graph.py +453 -0
  39. mcli/lib/files/files.py +76 -0
  40. mcli/lib/fs/fs.py +109 -0
  41. mcli/lib/lib.py +29 -0
  42. mcli/lib/logger/logger.py +611 -0
  43. mcli/lib/performance/optimizer.py +409 -0
  44. mcli/lib/performance/rust_bridge.py +502 -0
  45. mcli/lib/performance/uvloop_config.py +154 -0
  46. mcli/lib/pickles/pickles.py +50 -0
  47. mcli/lib/search/cached_vectorizer.py +479 -0
  48. mcli/lib/services/data_pipeline.py +460 -0
  49. mcli/lib/services/lsh_client.py +441 -0
  50. mcli/lib/services/redis_service.py +387 -0
  51. mcli/lib/shell/shell.py +137 -0
  52. mcli/lib/toml/toml.py +33 -0
  53. mcli/lib/ui/styling.py +47 -0
  54. mcli/lib/ui/visual_effects.py +634 -0
  55. mcli/lib/watcher/watcher.py +185 -0
  56. mcli/ml/api/app.py +215 -0
  57. mcli/ml/api/middleware.py +224 -0
  58. mcli/ml/api/routers/admin_router.py +12 -0
  59. mcli/ml/api/routers/auth_router.py +244 -0
  60. mcli/ml/api/routers/backtest_router.py +12 -0
  61. mcli/ml/api/routers/data_router.py +12 -0
  62. mcli/ml/api/routers/model_router.py +302 -0
  63. mcli/ml/api/routers/monitoring_router.py +12 -0
  64. mcli/ml/api/routers/portfolio_router.py +12 -0
  65. mcli/ml/api/routers/prediction_router.py +267 -0
  66. mcli/ml/api/routers/trade_router.py +12 -0
  67. mcli/ml/api/routers/websocket_router.py +76 -0
  68. mcli/ml/api/schemas.py +64 -0
  69. mcli/ml/auth/auth_manager.py +425 -0
  70. mcli/ml/auth/models.py +154 -0
  71. mcli/ml/auth/permissions.py +302 -0
  72. mcli/ml/backtesting/backtest_engine.py +502 -0
  73. mcli/ml/backtesting/performance_metrics.py +393 -0
  74. mcli/ml/cache.py +400 -0
  75. mcli/ml/cli/main.py +398 -0
  76. mcli/ml/config/settings.py +394 -0
  77. mcli/ml/configs/dvc_config.py +230 -0
  78. mcli/ml/configs/mlflow_config.py +131 -0
  79. mcli/ml/configs/mlops_manager.py +293 -0
  80. mcli/ml/dashboard/app.py +532 -0
  81. mcli/ml/dashboard/app_integrated.py +738 -0
  82. mcli/ml/dashboard/app_supabase.py +560 -0
  83. mcli/ml/dashboard/app_training.py +615 -0
  84. mcli/ml/dashboard/cli.py +51 -0
  85. mcli/ml/data_ingestion/api_connectors.py +501 -0
  86. mcli/ml/data_ingestion/data_pipeline.py +567 -0
  87. mcli/ml/data_ingestion/stream_processor.py +512 -0
  88. mcli/ml/database/migrations/env.py +94 -0
  89. mcli/ml/database/models.py +667 -0
  90. mcli/ml/database/session.py +200 -0
  91. mcli/ml/experimentation/ab_testing.py +845 -0
  92. mcli/ml/features/ensemble_features.py +607 -0
  93. mcli/ml/features/political_features.py +676 -0
  94. mcli/ml/features/recommendation_engine.py +809 -0
  95. mcli/ml/features/stock_features.py +573 -0
  96. mcli/ml/features/test_feature_engineering.py +346 -0
  97. mcli/ml/logging.py +85 -0
  98. mcli/ml/mlops/data_versioning.py +518 -0
  99. mcli/ml/mlops/experiment_tracker.py +377 -0
  100. mcli/ml/mlops/model_serving.py +481 -0
  101. mcli/ml/mlops/pipeline_orchestrator.py +614 -0
  102. mcli/ml/models/base_models.py +324 -0
  103. mcli/ml/models/ensemble_models.py +675 -0
  104. mcli/ml/models/recommendation_models.py +474 -0
  105. mcli/ml/models/test_models.py +487 -0
  106. mcli/ml/monitoring/drift_detection.py +676 -0
  107. mcli/ml/monitoring/metrics.py +45 -0
  108. mcli/ml/optimization/portfolio_optimizer.py +834 -0
  109. mcli/ml/preprocessing/data_cleaners.py +451 -0
  110. mcli/ml/preprocessing/feature_extractors.py +491 -0
  111. mcli/ml/preprocessing/ml_pipeline.py +382 -0
  112. mcli/ml/preprocessing/politician_trading_preprocessor.py +569 -0
  113. mcli/ml/preprocessing/test_preprocessing.py +294 -0
  114. mcli/ml/scripts/populate_sample_data.py +200 -0
  115. mcli/ml/tasks.py +400 -0
  116. mcli/ml/tests/test_integration.py +429 -0
  117. mcli/ml/tests/test_training_dashboard.py +387 -0
  118. mcli/public/oi/oi.py +15 -0
  119. mcli/public/public.py +4 -0
  120. mcli/self/self_cmd.py +1246 -0
  121. mcli/workflow/daemon/api_daemon.py +800 -0
  122. mcli/workflow/daemon/async_command_database.py +681 -0
  123. mcli/workflow/daemon/async_process_manager.py +591 -0
  124. mcli/workflow/daemon/client.py +530 -0
  125. mcli/workflow/daemon/commands.py +1196 -0
  126. mcli/workflow/daemon/daemon.py +905 -0
  127. mcli/workflow/daemon/daemon_api.py +59 -0
  128. mcli/workflow/daemon/enhanced_daemon.py +571 -0
  129. mcli/workflow/daemon/process_cli.py +244 -0
  130. mcli/workflow/daemon/process_manager.py +439 -0
  131. mcli/workflow/daemon/test_daemon.py +275 -0
  132. mcli/workflow/dashboard/dashboard_cmd.py +113 -0
  133. mcli/workflow/docker/docker.py +0 -0
  134. mcli/workflow/file/file.py +100 -0
  135. mcli/workflow/gcloud/config.toml +21 -0
  136. mcli/workflow/gcloud/gcloud.py +58 -0
  137. mcli/workflow/git_commit/ai_service.py +328 -0
  138. mcli/workflow/git_commit/commands.py +430 -0
  139. mcli/workflow/lsh_integration.py +355 -0
  140. mcli/workflow/model_service/client.py +594 -0
  141. mcli/workflow/model_service/download_and_run_efficient_models.py +288 -0
  142. mcli/workflow/model_service/lightweight_embedder.py +397 -0
  143. mcli/workflow/model_service/lightweight_model_server.py +714 -0
  144. mcli/workflow/model_service/lightweight_test.py +241 -0
  145. mcli/workflow/model_service/model_service.py +1955 -0
  146. mcli/workflow/model_service/ollama_efficient_runner.py +425 -0
  147. mcli/workflow/model_service/pdf_processor.py +386 -0
  148. mcli/workflow/model_service/test_efficient_runner.py +234 -0
  149. mcli/workflow/model_service/test_example.py +315 -0
  150. mcli/workflow/model_service/test_integration.py +131 -0
  151. mcli/workflow/model_service/test_new_features.py +149 -0
  152. mcli/workflow/openai/openai.py +99 -0
  153. mcli/workflow/politician_trading/commands.py +1790 -0
  154. mcli/workflow/politician_trading/config.py +134 -0
  155. mcli/workflow/politician_trading/connectivity.py +490 -0
  156. mcli/workflow/politician_trading/data_sources.py +395 -0
  157. mcli/workflow/politician_trading/database.py +410 -0
  158. mcli/workflow/politician_trading/demo.py +248 -0
  159. mcli/workflow/politician_trading/models.py +165 -0
  160. mcli/workflow/politician_trading/monitoring.py +413 -0
  161. mcli/workflow/politician_trading/scrapers.py +966 -0
  162. mcli/workflow/politician_trading/scrapers_california.py +412 -0
  163. mcli/workflow/politician_trading/scrapers_eu.py +377 -0
  164. mcli/workflow/politician_trading/scrapers_uk.py +350 -0
  165. mcli/workflow/politician_trading/scrapers_us_states.py +438 -0
  166. mcli/workflow/politician_trading/supabase_functions.py +354 -0
  167. mcli/workflow/politician_trading/workflow.py +852 -0
  168. mcli/workflow/registry/registry.py +180 -0
  169. mcli/workflow/repo/repo.py +223 -0
  170. mcli/workflow/scheduler/commands.py +493 -0
  171. mcli/workflow/scheduler/cron_parser.py +238 -0
  172. mcli/workflow/scheduler/job.py +182 -0
  173. mcli/workflow/scheduler/monitor.py +139 -0
  174. mcli/workflow/scheduler/persistence.py +324 -0
  175. mcli/workflow/scheduler/scheduler.py +679 -0
  176. mcli/workflow/sync/sync_cmd.py +437 -0
  177. mcli/workflow/sync/test_cmd.py +314 -0
  178. mcli/workflow/videos/videos.py +242 -0
  179. mcli/workflow/wakatime/wakatime.py +11 -0
  180. mcli/workflow/workflow.py +37 -0
  181. mcli_framework-7.0.0.dist-info/METADATA +479 -0
  182. mcli_framework-7.0.0.dist-info/RECORD +186 -0
  183. mcli_framework-7.0.0.dist-info/WHEEL +5 -0
  184. mcli_framework-7.0.0.dist-info/entry_points.txt +7 -0
  185. mcli_framework-7.0.0.dist-info/licenses/LICENSE +21 -0
  186. mcli_framework-7.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,518 @@
1
+ """DVC integration for data versioning and pipeline management"""
2
+
3
import hashlib
import json
import logging
import os
import shlex
import subprocess
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import pandas as pd
import yaml
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class DVCConfig:
    """DVC configuration

    Settings consumed by DataVersionControl and DVCPipeline; all DVC/git
    commands are executed with ``project_root`` as the working directory.
    """
    # Root of the git/DVC project.
    project_root: Path = Path(".")
    # Default DVC remote configured on first init; an s3:// URL here,
    # but a local filesystem path works too.
    remote_storage: str = "s3://my-bucket/dvc-storage"  # or local path
    # Location of the DVC object cache.
    cache_dir: Path = Path(".dvc/cache")
    # When True, add_data() git-commits after tracking new data.
    auto_commit: bool = True
    # When True, every DVC command's stdout is logged at DEBUG level.
    verbose: bool = True
26
+
27
+
28
class DataVersionControl:
    """DVC wrapper for data versioning.

    Thin wrapper around the ``dvc`` command-line tool. All commands run with
    ``config.project_root`` as the working directory; requires ``dvc`` (and
    ``git`` when ``auto_commit`` is enabled) on PATH.
    """

    def __init__(self, config: DVCConfig):
        self.config = config
        self.project_root = config.project_root
        self._ensure_dvc_initialized()

    def _ensure_dvc_initialized(self):
        """Ensure DVC is initialized in project (runs ``dvc init`` once)."""
        dvc_dir = self.project_root / ".dvc"

        if not dvc_dir.exists():
            logger.info("Initializing DVC...")
            self._run_command("dvc init")

            # Configure remote storage as the default ("-d") remote.
            if self.config.remote_storage:
                self._run_command(f"dvc remote add -d storage {self.config.remote_storage}")

    def _run_command(self, command: str) -> str:
        """Run a DVC command string and return its stdout.

        Args:
            command: Full command line, e.g. ``"dvc push"``. Tokenized with
                shlex.split so quoted arguments survive; the previous plain
                str.split broke any path containing spaces.

        Raises:
            RuntimeError: If the command exits with a nonzero status.
        """
        try:
            result = subprocess.run(
                shlex.split(command),
                capture_output=True,
                text=True,
                cwd=self.project_root,
            )

            if result.returncode != 0:
                logger.error(f"DVC command failed: {result.stderr}")
                raise RuntimeError(f"DVC command failed: {command}")

            if self.config.verbose:
                logger.debug(f"DVC: {command} -> {result.stdout}")

            return result.stdout
        except Exception as e:
            logger.error(f"Failed to run DVC command: {e}")
            raise

    def add_data(self, data_path: Union[str, Path],
                 description: Optional[str] = None) -> str:
        """Add data file or directory to DVC tracking.

        Writes a ``<path>.meta.json`` sidecar with basic metadata and, when
        ``auto_commit`` is set, commits the change to git.

        Returns:
            Path of the generated ``.dvc`` pointer file, as a string.

        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
        """
        data_path = Path(data_path)

        if not data_path.exists():
            raise FileNotFoundError(f"Data path not found: {data_path}")

        # Quote the path so names with spaces reach DVC as one argument.
        self._run_command(f"dvc add {shlex.quote(str(data_path))}")

        # Generate metadata. Append ".meta.json" instead of using
        # with_suffix(), which dropped the original extension and made
        # "a.csv" and "a.parquet" collide on the same sidecar file.
        metadata = self._generate_metadata(data_path, description)
        metadata_path = Path(str(data_path) + ".meta.json")

        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Commit if auto-commit enabled
        if self.config.auto_commit:
            self._commit_changes(f"Add data: {data_path.name}")

        logger.info(f"Added {data_path} to DVC tracking")
        return str(data_path) + ".dvc"

    def push_data(self):
        """Push data to remote storage"""
        logger.info("Pushing data to remote...")
        self._run_command("dvc push")

    def pull_data(self):
        """Pull data from remote storage"""
        logger.info("Pulling data from remote...")
        self._run_command("dvc pull")

    def checkout(self, version: Optional[str] = None):
        """Checkout specific data version.

        When ``version`` is given it is passed to ``git checkout`` first so
        the ``.dvc`` pointer files match, then ``dvc checkout`` restores the
        corresponding data from cache.
        """
        if version:
            self._run_command(f"git checkout {version}")

        self._run_command("dvc checkout")
        logger.info(f"Checked out data version: {version or 'latest'}")

    def get_data_status(self) -> Dict[str, Any]:
        """Get status of tracked data.

        Parses ``dvc status`` text output into lists of paths under the keys
        "modified", "not_in_cache" and "deleted".
        """
        status_output = self._run_command("dvc status")

        status: Dict[str, Any] = {
            "modified": [],
            "not_in_cache": [],
            "deleted": [],
        }

        for line in status_output.split('\n'):
            if "modified:" in line:
                status["modified"].append(line.split(":")[-1].strip())
            elif "not in cache:" in line:
                status["not_in_cache"].append(line.split(":")[-1].strip())
            elif "deleted:" in line:
                status["deleted"].append(line.split(":")[-1].strip())

        return status

    def _generate_metadata(self, data_path: Path,
                           description: Optional[str] = None) -> Dict[str, Any]:
        """Generate metadata for data file (size, timestamps, hash, type)."""
        stat = data_path.stat()

        metadata = {
            "path": str(data_path),
            "size": stat.st_size,
            "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "hash": self._calculate_hash(data_path),
            "description": description or "",
            "type": "directory" if data_path.is_dir() else "file",
        }

        # Tabular files additionally get row/column statistics. Best-effort:
        # an unreadable or corrupt file still produces the base metadata.
        if data_path.suffix in ['.csv', '.parquet']:
            try:
                df = pd.read_csv(data_path) if data_path.suffix == '.csv' else pd.read_parquet(data_path)
                metadata["rows"] = len(df)
                metadata["columns"] = len(df.columns)
                metadata["column_names"] = df.columns.tolist()
            except Exception as e:  # was a bare except, which also hid KeyboardInterrupt/SystemExit
                logger.debug(f"Could not extract tabular metadata for {data_path}: {e}")

        return metadata

    def _calculate_hash(self, file_path: Path) -> str:
        """Calculate file hash (MD5 content fingerprint; not for security).

        Directories are not hashed and return the literal string "directory".
        """
        if file_path.is_dir():
            return "directory"

        hash_md5 = hashlib.md5()
        with open(file_path, "rb") as f:
            # Stream in 4 KiB chunks so large files don't load into memory.
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _commit_changes(self, message: str):
        """Commit changes to git (best-effort: failures such as 'nothing to
        commit' are intentionally not raised)."""
        subprocess.run(["git", "add", "-A"], cwd=self.project_root)
        subprocess.run(["git", "commit", "-m", message], cwd=self.project_root)
176
+
177
+
178
class DVCPipeline:
    """DVC pipeline management.

    Creates and runs DVC pipelines by writing ``dvc.yaml`` / ``params.yaml``
    under the configured project root and delegating execution to the
    ``dvc`` CLI via DataVersionControl.
    """

    def __init__(self, config: DVCConfig):
        self.config = config
        self.dvc = DataVersionControl(config)
        self.pipeline_file = config.project_root / "dvc.yaml"
        self.params_file = config.project_root / "params.yaml"

    def create_pipeline(self, stages: List[Dict[str, Any]]):
        """Create DVC pipeline, overwriting any existing ``dvc.yaml``.

        Each stage dict requires "name" and "cmd"; "deps", "params", "outs",
        "metrics" and "plots" are optional and default to empty lists.
        """
        pipeline = {"stages": {}}

        for stage in stages:
            stage_name = stage["name"]
            pipeline["stages"][stage_name] = {
                "cmd": stage["cmd"],
                "deps": stage.get("deps", []),
                "params": stage.get("params", []),
                "outs": stage.get("outs", []),
                "metrics": stage.get("metrics", []),
                "plots": stage.get("plots", []),
            }

        # Save pipeline
        with open(self.pipeline_file, 'w') as f:
            yaml.dump(pipeline, f, default_flow_style=False)

        logger.info(f"Created DVC pipeline with {len(stages)} stages")

    def add_stage(self, name: str, cmd: str,
                  deps: Optional[List[str]] = None,
                  params: Optional[List[str]] = None,
                  outs: Optional[List[str]] = None,
                  metrics: Optional[List[str]] = None,
                  plots: Optional[List[str]] = None):
        """Add (or replace) a single stage in ``dvc.yaml``.

        ``plots`` was previously missing here although create_pipeline()
        supports it; it defaults to None (key omitted) so existing callers
        produce byte-identical output.
        """
        stage_config = {
            "cmd": cmd,
            "deps": deps or [],
            "params": params or [],
            "outs": outs or [],
            "metrics": metrics or [],
        }
        if plots is not None:
            stage_config["plots"] = plots

        # Load existing pipeline, tolerating a missing/empty file or a file
        # that lacks a top-level "stages" mapping.
        if self.pipeline_file.exists():
            with open(self.pipeline_file, 'r') as f:
                pipeline = yaml.safe_load(f) or {"stages": {}}
        else:
            pipeline = {"stages": {}}
        pipeline.setdefault("stages", {})

        # Add stage
        pipeline["stages"][name] = stage_config

        # Save pipeline
        with open(self.pipeline_file, 'w') as f:
            yaml.dump(pipeline, f, default_flow_style=False)

        logger.info(f"Added stage '{name}' to pipeline")

    def run_pipeline(self, stage: Optional[str] = None):
        """Run DVC pipeline via ``dvc repro``; a single stage when given."""
        cmd = f"dvc repro {stage}" if stage else "dvc repro"

        logger.info(f"Running DVC pipeline: {cmd}")
        self.dvc._run_command(cmd)

    def get_metrics(self) -> Dict[str, Any]:
        """Get pipeline metrics.

        Parses ``dvc metrics show`` line-by-line into a flat dict; values
        are converted to float where possible, otherwise kept as strings.
        """
        metrics_output = self.dvc._run_command("dvc metrics show")

        metrics: Dict[str, Any] = {}
        for line in metrics_output.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                try:
                    metrics[key.strip()] = float(value.strip())
                except ValueError:  # non-numeric metric value; was a bare except
                    metrics[key.strip()] = value.strip()

        return metrics

    def create_ml_pipeline(self):
        """Create standard ML pipeline (prepare -> featurize -> train ->
        evaluate) plus a matching default ``params.yaml``."""
        stages = [
            {
                "name": "data_preparation",
                "cmd": "python src/prepare_data.py",
                "deps": ["data/raw"],
                "outs": ["data/processed"],
                "params": ["prepare.test_split", "prepare.seed"]
            },
            {
                "name": "feature_engineering",
                "cmd": "python src/featurize.py",
                "deps": ["data/processed"],
                "outs": ["data/features"],
                "params": ["featurize.max_features", "featurize.ngrams"]
            },
            {
                "name": "train",
                "cmd": "python src/train.py",
                "deps": ["data/features"],
                "outs": ["models/model.pkl"],
                "params": ["train.epochs", "train.learning_rate"],
                "metrics": [{"metrics.json": {"cache": False}}]
            },
            {
                "name": "evaluate",
                "cmd": "python src/evaluate.py",
                "deps": ["models/model.pkl", "data/features"],
                "metrics": [{"eval/metrics.json": {"cache": False}}],
                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}]
            }
        ]

        self.create_pipeline(stages)

        # Create default params file matching the stage "params" entries.
        params = {
            "prepare": {
                "test_split": 0.2,
                "seed": 42
            },
            "featurize": {
                "max_features": 100,
                "ngrams": 2
            },
            "train": {
                "epochs": 10,
                "learning_rate": 0.001
            }
        }

        with open(self.params_file, 'w') as f:
            yaml.dump(params, f, default_flow_style=False)

        logger.info("Created ML pipeline with DVC")
320
+
321
+
322
class DataRegistry:
    """Central registry for versioned datasets"""

    def __init__(self, registry_path: Path = Path("data_registry.json")):
        self.registry_path = registry_path
        self.registry = self._load_registry()

    def _load_registry(self) -> Dict[str, Any]:
        """Load data registry"""
        # Guard clause: a missing file means a fresh, empty registry.
        if not self.registry_path.exists():
            return {"datasets": {}}
        with open(self.registry_path, 'r') as f:
            return json.load(f)

    def _save_registry(self):
        """Save data registry"""
        with open(self.registry_path, 'w') as f:
            json.dump(self.registry, f, indent=2)

    def register_dataset(self, name: str, path: str,
                         version: str, metadata: Dict[str, Any]):
        """Register new dataset version"""
        # setdefault creates the dataset entry on first registration.
        entry = self.registry["datasets"].setdefault(name, {"versions": {}})
        entry["versions"][version] = {
            "path": path,
            "metadata": metadata,
            "registered": datetime.now().isoformat(),
        }
        entry["latest"] = version
        self._save_registry()

        logger.info(f"Registered dataset '{name}' version '{version}'")

    def get_dataset(self, name: str, version: Optional[str] = None) -> Dict[str, Any]:
        """Get dataset information"""
        datasets = self.registry["datasets"]
        if name not in datasets:
            raise ValueError(f"Dataset '{name}' not found")

        entry = datasets[name]
        # Fall back to the most recently registered version.
        version = version or entry["latest"]

        if version not in entry["versions"]:
            raise ValueError(f"Version '{version}' not found for dataset '{name}'")

        return entry["versions"][version]

    def list_datasets(self) -> List[str]:
        """List all registered datasets"""
        return [dataset_name for dataset_name in self.registry["datasets"]]

    def list_versions(self, name: str) -> List[str]:
        """List all versions of a dataset"""
        datasets = self.registry["datasets"]
        if name not in datasets:
            raise ValueError(f"Dataset '{name}' not found")

        return [v for v in datasets[name]["versions"]]
381
+
382
+
383
def create_dvc_config():
    """Create DVC configuration files.

    Returns:
        Dict mapping relative file path -> file content for the standard DVC
        support files. Callers are expected to write each entry to disk.
        The YAML templates carry proper two-space indentation so they parse
        as valid YAML.
    """

    # .dvc/.gitignore: keep DVC's local config and caches out of git.
    dvc_gitignore = """
/config.local
/tmp
/cache
"""

    # .dvcignore: paths DVC should skip when scanning the workspace.
    dvcignore = """
# Python
__pycache__
*.pyc
*.pyo
*.pyd
.pytest_cache
.coverage
htmlcov

# Jupyter
.ipynb_checkpoints
*.ipynb

# IDE
.vscode
.idea
*.swp
.DS_Store

# Temporary files
/tmp
/temp
*.tmp
"""

    # dvc.yaml template: prepare -> train -> evaluate pipeline.
    dvc_yaml = """
stages:
  prepare_data:
    cmd: python src/ml/preprocessing/prepare_data.py
    deps:
      - src/ml/preprocessing/prepare_data.py
      - data/raw
    outs:
      - data/processed
    params:
      - prepare.split_ratio
      - prepare.random_seed

  train_model:
    cmd: python src/ml/models/train.py
    deps:
      - src/ml/models/train.py
      - data/processed
    outs:
      - models/model.pkl
    params:
      - train.epochs
      - train.batch_size
      - train.learning_rate
    metrics:
      - metrics/train_metrics.json:
          cache: false

  evaluate:
    cmd: python src/ml/evaluate.py
    deps:
      - src/ml/evaluate.py
      - models/model.pkl
      - data/processed
    metrics:
      - metrics/eval_metrics.json:
          cache: false
    plots:
      - metrics/confusion_matrix.csv:
          template: confusion
          x: actual
          y: predicted
"""

    # params.yaml template: defaults referenced by the dvc.yaml stages.
    params_yaml = """
prepare:
  split_ratio: 0.2
  random_seed: 42

train:
  epochs: 100
  batch_size: 32
  learning_rate: 0.001
  dropout_rate: 0.3

evaluate:
  confidence_threshold: 0.6
  metrics:
    - accuracy
    - precision
    - recall
    - f1_score
"""

    return {
        ".dvc/.gitignore": dvc_gitignore,
        ".dvcignore": dvcignore,
        "dvc.yaml": dvc_yaml,
        "params.yaml": params_yaml,
    }
492
+
493
+
494
# Example usage
# NOTE(review): running this requires the `dvc` and `git` executables on
# PATH and the referenced data file to exist; it mutates the current repo
# (dvc init, git commits, dvc.yaml/params.yaml creation).
if __name__ == "__main__":
    # Initialize DVC
    config = DVCConfig()
    dvc = DataVersionControl(config)

    # Create data registry
    registry = DataRegistry()

    # Add some data
    dvc.add_data("data/politician_trades.csv", "Politician trading data v1")

    # Register in registry
    registry.register_dataset(
        name="politician_trades",
        path="data/politician_trades.csv",
        version="v1.0",
        metadata={"source": "congress", "records": 10000}
    )

    # Create ML pipeline
    pipeline = DVCPipeline(config)
    pipeline.create_ml_pipeline()

    logger.info("DVC setup complete")