mcli-framework 7.1.1-py3-none-any.whl → 7.1.2-py3-none-any.whl

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of mcli-framework has been flagged for review on its registry page.

Files changed (94)
  1. mcli/app/completion_cmd.py +59 -49
  2. mcli/app/completion_helpers.py +60 -138
  3. mcli/app/logs_cmd.py +6 -2
  4. mcli/app/main.py +17 -14
  5. mcli/app/model_cmd.py +19 -4
  6. mcli/chat/chat.py +3 -2
  7. mcli/lib/search/cached_vectorizer.py +1 -0
  8. mcli/lib/services/data_pipeline.py +12 -5
  9. mcli/lib/services/lsh_client.py +68 -57
  10. mcli/ml/api/app.py +28 -36
  11. mcli/ml/api/middleware.py +8 -16
  12. mcli/ml/api/routers/admin_router.py +3 -1
  13. mcli/ml/api/routers/auth_router.py +32 -56
  14. mcli/ml/api/routers/backtest_router.py +3 -1
  15. mcli/ml/api/routers/data_router.py +3 -1
  16. mcli/ml/api/routers/model_router.py +35 -74
  17. mcli/ml/api/routers/monitoring_router.py +3 -1
  18. mcli/ml/api/routers/portfolio_router.py +3 -1
  19. mcli/ml/api/routers/prediction_router.py +60 -65
  20. mcli/ml/api/routers/trade_router.py +6 -2
  21. mcli/ml/api/routers/websocket_router.py +12 -9
  22. mcli/ml/api/schemas.py +10 -2
  23. mcli/ml/auth/auth_manager.py +49 -114
  24. mcli/ml/auth/models.py +30 -15
  25. mcli/ml/auth/permissions.py +12 -19
  26. mcli/ml/backtesting/backtest_engine.py +134 -108
  27. mcli/ml/backtesting/performance_metrics.py +142 -108
  28. mcli/ml/cache.py +12 -18
  29. mcli/ml/cli/main.py +37 -23
  30. mcli/ml/config/settings.py +29 -12
  31. mcli/ml/dashboard/app.py +122 -130
  32. mcli/ml/dashboard/app_integrated.py +216 -150
  33. mcli/ml/dashboard/app_supabase.py +176 -108
  34. mcli/ml/dashboard/app_training.py +212 -206
  35. mcli/ml/dashboard/cli.py +14 -5
  36. mcli/ml/data_ingestion/api_connectors.py +51 -81
  37. mcli/ml/data_ingestion/data_pipeline.py +127 -125
  38. mcli/ml/data_ingestion/stream_processor.py +72 -80
  39. mcli/ml/database/migrations/env.py +3 -2
  40. mcli/ml/database/models.py +112 -79
  41. mcli/ml/database/session.py +6 -5
  42. mcli/ml/experimentation/ab_testing.py +149 -99
  43. mcli/ml/features/ensemble_features.py +9 -8
  44. mcli/ml/features/political_features.py +6 -5
  45. mcli/ml/features/recommendation_engine.py +15 -14
  46. mcli/ml/features/stock_features.py +7 -6
  47. mcli/ml/features/test_feature_engineering.py +8 -7
  48. mcli/ml/logging.py +10 -15
  49. mcli/ml/mlops/data_versioning.py +57 -64
  50. mcli/ml/mlops/experiment_tracker.py +49 -41
  51. mcli/ml/mlops/model_serving.py +59 -62
  52. mcli/ml/mlops/pipeline_orchestrator.py +203 -149
  53. mcli/ml/models/base_models.py +8 -7
  54. mcli/ml/models/ensemble_models.py +6 -5
  55. mcli/ml/models/recommendation_models.py +7 -6
  56. mcli/ml/models/test_models.py +18 -14
  57. mcli/ml/monitoring/drift_detection.py +95 -74
  58. mcli/ml/monitoring/metrics.py +10 -22
  59. mcli/ml/optimization/portfolio_optimizer.py +172 -132
  60. mcli/ml/predictions/prediction_engine.py +62 -50
  61. mcli/ml/preprocessing/data_cleaners.py +6 -5
  62. mcli/ml/preprocessing/feature_extractors.py +7 -6
  63. mcli/ml/preprocessing/ml_pipeline.py +3 -2
  64. mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
  65. mcli/ml/preprocessing/test_preprocessing.py +4 -4
  66. mcli/ml/scripts/populate_sample_data.py +36 -16
  67. mcli/ml/tasks.py +82 -83
  68. mcli/ml/tests/test_integration.py +86 -76
  69. mcli/ml/tests/test_training_dashboard.py +169 -142
  70. mcli/mygroup/test_cmd.py +2 -1
  71. mcli/self/self_cmd.py +31 -16
  72. mcli/self/test_cmd.py +2 -1
  73. mcli/workflow/dashboard/dashboard_cmd.py +13 -6
  74. mcli/workflow/lsh_integration.py +46 -58
  75. mcli/workflow/politician_trading/commands.py +576 -427
  76. mcli/workflow/politician_trading/config.py +7 -7
  77. mcli/workflow/politician_trading/connectivity.py +35 -33
  78. mcli/workflow/politician_trading/data_sources.py +72 -71
  79. mcli/workflow/politician_trading/database.py +18 -16
  80. mcli/workflow/politician_trading/demo.py +4 -3
  81. mcli/workflow/politician_trading/models.py +5 -5
  82. mcli/workflow/politician_trading/monitoring.py +13 -13
  83. mcli/workflow/politician_trading/scrapers.py +332 -224
  84. mcli/workflow/politician_trading/scrapers_california.py +116 -94
  85. mcli/workflow/politician_trading/scrapers_eu.py +70 -71
  86. mcli/workflow/politician_trading/scrapers_uk.py +118 -90
  87. mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
  88. mcli/workflow/politician_trading/workflow.py +98 -71
  89. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +1 -1
  90. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -94
  91. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
  92. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
  93. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
  94. {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
mcli/ml/features/recommendation_engine.py CHANGED
@@ -1,28 +1,29 @@
 """Stock recommendation engine that combines all feature engineering components"""
 
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Tuple, Union
-from dataclasses import dataclass, asdict
 import logging
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import joblib
+import numpy as np
+import pandas as pd
 
-from .stock_features import (
-    StockRecommendationFeatures,
-    TechnicalIndicatorFeatures,
-    MarketRegimeFeatures,
+from .ensemble_features import (
+    DynamicFeatureSelector,
+    EnsembleFeatureBuilder,
+    FeatureInteractionEngine,
 )
 from .political_features import (
-    PoliticalInfluenceFeatures,
     CongressionalTrackingFeatures,
     PolicyImpactFeatures,
+    PoliticalInfluenceFeatures,
 )
-from .ensemble_features import (
-    EnsembleFeatureBuilder,
-    FeatureInteractionEngine,
-    DynamicFeatureSelector,
+from .stock_features import (
+    MarketRegimeFeatures,
+    StockRecommendationFeatures,
+    TechnicalIndicatorFeatures,
 )
 
 logger = logging.getLogger(__name__)
mcli/ml/features/stock_features.py CHANGED
@@ -1,13 +1,14 @@
 """Stock-specific feature engineering for recommendation models"""
 
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Tuple, Union
-from dataclasses import dataclass
 import logging
-from collections import defaultdict
 import warnings
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
 
 logger = logging.getLogger(__name__)
 
mcli/ml/features/test_feature_engineering.py CHANGED
@@ -1,14 +1,15 @@
 """Test script for feature engineering system"""
 
-import sys
 import os
+import sys
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
 
-import pandas as pd
-import numpy as np
-from datetime import datetime, timedelta
 import logging
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -209,7 +210,7 @@ def test_recommendation_engine():
     """Test the full recommendation engine"""
     logger.info("Testing recommendation engine...")
 
-    from recommendation_engine import StockRecommendationEngine, RecommendationConfig
+    from recommendation_engine import RecommendationConfig, StockRecommendationEngine
 
     # Generate comprehensive test data
     trading_data = generate_mock_trading_data(100)
@@ -259,9 +260,9 @@ def test_feature_integration():
     """Test integration of all feature components"""
     logger.info("Testing feature integration...")
 
-    from stock_features import StockRecommendationFeatures
-    from political_features import PoliticalInfluenceFeatures
     from ensemble_features import EnsembleFeatureBuilder
+    from political_features import PoliticalInfluenceFeatures
+    from stock_features import StockRecommendationFeatures
 
     # Generate test data
     trading_data = generate_mock_trading_data(30)
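Note: the three hunks above are pure import reordering, with no behavioral change: isort-style grouping puts plain import statements before from-imports, standard library before third-party before local packages, each block alphabetized and separated by a blank line. A minimal sketch of the same reordering via isort's Python API; the black-compatible profile is an assumption (the ordering in these hunks is consistent with it, but the project's actual isort settings are not part of this diff):

import isort  # pip install isort

messy = (
    "import numpy as np\n"
    "import pandas as pd\n"
    "from datetime import datetime, timedelta\n"
    "import logging\n"
)

# Prints the stdlib block (import logging, then the datetime from-import),
# a blank line, then the third-party block (numpy, pandas) -- the same
# shape as the reordered module headers above.
print(isort.code(messy, profile="black"))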
mcli/ml/logging.py CHANGED
@@ -1,11 +1,11 @@
 """Logging configuration for ML system"""
 
+import json
 import logging
 import sys
-from pathlib import Path
-from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
-import json
 from datetime import datetime
+from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
+from pathlib import Path
 
 from mcli.ml.config import settings
 
@@ -24,11 +24,11 @@ class StructuredFormatter(logging.Formatter):
             "line": record.lineno,
         }
 
-        if hasattr(record, 'request_id'):
-            log_obj['request_id'] = record.request_id
+        if hasattr(record, "request_id"):
+            log_obj["request_id"] = record.request_id
 
         if record.exc_info:
-            log_obj['exception'] = self.formatException(record.exc_info)
+            log_obj["exception"] = self.formatException(record.exc_info)
 
         return json.dumps(log_obj)
 
@@ -52,7 +52,7 @@ def setup_logging():
         console_formatter = StructuredFormatter()
     else:
         console_formatter = logging.Formatter(
-            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
 
     console_handler.setFormatter(console_formatter)
@@ -60,10 +60,7 @@ def setup_logging():
 
     # File handler for all logs
    file_handler = TimedRotatingFileHandler(
-        log_dir / "mcli_ml.log",
-        when="midnight",
-        interval=1,
-        backupCount=30
+        log_dir / "mcli_ml.log", when="midnight", interval=1, backupCount=30
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(StructuredFormatter())
@@ -71,9 +68,7 @@ def setup_logging():
 
    # Error file handler
    error_handler = RotatingFileHandler(
-        log_dir / "mcli_ml_errors.log",
-        maxBytes=10 * 1024 * 1024,  # 10MB
-        backupCount=5
+        log_dir / "mcli_ml_errors.log", maxBytes=10 * 1024 * 1024, backupCount=5  # 10MB
    )
    error_handler.setLevel(logging.ERROR)
    error_handler.setFormatter(StructuredFormatter())
@@ -82,4 +77,4 @@
 
 def get_logger(name: str) -> logging.Logger:
     """Get a logger instance"""
-    return logging.getLogger(name)
+    return logging.getLogger(name)
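For context on the reformatted StructuredFormatter: it serializes each record to one JSON line. A runnable sketch of the behavior visible in this hunk; only the "line", "request_id", and "exception" fields appear in the diff, so the remaining fields are illustrative assumptions:

import json
import logging


class StructuredFormatter(logging.Formatter):
    def format(self, record):
        log_obj = {
            "logger": record.name,  # assumed field, not shown in the hunk
            "level": record.levelname,  # assumed field, not shown in the hunk
            "message": record.getMessage(),  # assumed field, not shown in the hunk
            "line": record.lineno,
        }
        # extra={"request_id": ...} on a log call lands on the record:
        if hasattr(record, "request_id"):
            log_obj["request_id"] = record.request_id
        if record.exc_info:
            log_obj["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_obj)


handler = logging.StreamHandler()
handler.setFormatter(StructuredFormatter())
logger = logging.getLogger("demo")
logger.addHandler(handler)
logger.warning("cache miss", extra={"request_id": "abc-123"})
# -> {"logger": "demo", "level": "WARNING", "message": "cache miss", "line": <lineno>, "request_id": "abc-123"}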
mcli/ml/mlops/data_versioning.py CHANGED
@@ -1,16 +1,17 @@
 """DVC integration for data versioning and pipeline management"""
 
-import subprocess
-import json
-import yaml
-from pathlib import Path
-from typing import Dict, Any, Optional, List, Union
-from dataclasses import dataclass
 import hashlib
+import json
 import logging
+import os
+import subprocess
+from dataclasses import dataclass
 from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import pandas as pd
-import os
+import yaml
 
 logger = logging.getLogger(__name__)
 
@@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class DVCConfig:
     """DVC configuration"""
+
     project_root: Path = Path(".")
     remote_storage: str = "s3://my-bucket/dvc-storage"  # or local path
     cache_dir: Path = Path(".dvc/cache")
@@ -49,10 +51,7 @@ class DataVersionControl:
         """Run DVC command"""
         try:
             result = subprocess.run(
-                command.split(),
-                capture_output=True,
-                text=True,
-                cwd=self.project_root
+                command.split(), capture_output=True, text=True, cwd=self.project_root
             )
 
             if result.returncode != 0:
@@ -67,8 +66,7 @@ class DataVersionControl:
             logger.error(f"Failed to run DVC command: {e}")
             raise
 
-    def add_data(self, data_path: Union[str, Path],
-                 description: Optional[str] = None) -> str:
+    def add_data(self, data_path: Union[str, Path], description: Optional[str] = None) -> str:
         """Add data file or directory to DVC tracking"""
         data_path = Path(data_path)
 
@@ -82,7 +80,7 @@ class DataVersionControl:
         metadata = self._generate_metadata(data_path, description)
         metadata_path = data_path.with_suffix(".meta.json")
 
-        with open(metadata_path, 'w') as f:
+        with open(metadata_path, "w") as f:
             json.dump(metadata, f, indent=2)
 
         # Commit if auto-commit enabled
@@ -115,13 +113,9 @@ class DataVersionControl:
         status_output = self._run_command("dvc status")
 
         # Parse status
-        status = {
-            "modified": [],
-            "not_in_cache": [],
-            "deleted": []
-        }
+        status = {"modified": [], "not_in_cache": [], "deleted": []}
 
-        for line in status_output.split('\n'):
+        for line in status_output.split("\n"):
             if "modified:" in line:
                 status["modified"].append(line.split(":")[-1].strip())
             elif "not in cache:" in line:
@@ -131,8 +125,9 @@ class DataVersionControl:
 
         return status
 
-    def _generate_metadata(self, data_path: Path,
-                           description: Optional[str] = None) -> Dict[str, Any]:
+    def _generate_metadata(
+        self, data_path: Path, description: Optional[str] = None
+    ) -> Dict[str, Any]:
         """Generate metadata for data file"""
         stat = data_path.stat()
 
@@ -143,13 +138,17 @@ class DataVersionControl:
             "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
             "hash": self._calculate_hash(data_path),
             "description": description or "",
-            "type": "directory" if data_path.is_dir() else "file"
+            "type": "directory" if data_path.is_dir() else "file",
         }
 
         # Add data-specific metadata
-        if data_path.suffix in ['.csv', '.parquet']:
+        if data_path.suffix in [".csv", ".parquet"]:
             try:
-                df = pd.read_csv(data_path) if data_path.suffix == '.csv' else pd.read_parquet(data_path)
+                df = (
+                    pd.read_csv(data_path)
+                    if data_path.suffix == ".csv"
+                    else pd.read_parquet(data_path)
+                )
                 metadata["rows"] = len(df)
                 metadata["columns"] = len(df.columns)
                 metadata["column_names"] = df.columns.tolist()
@@ -196,32 +195,36 @@ class DVCPipeline:
                 "params": stage.get("params", []),
                 "outs": stage.get("outs", []),
                 "metrics": stage.get("metrics", []),
-                "plots": stage.get("plots", [])
+                "plots": stage.get("plots", []),
             }
 
         # Save pipeline
-        with open(self.pipeline_file, 'w') as f:
+        with open(self.pipeline_file, "w") as f:
             yaml.dump(pipeline, f, default_flow_style=False)
 
         logger.info(f"Created DVC pipeline with {len(stages)} stages")
 
-    def add_stage(self, name: str, cmd: str,
-                  deps: Optional[List[str]] = None,
-                  params: Optional[List[str]] = None,
-                  outs: Optional[List[str]] = None,
-                  metrics: Optional[List[str]] = None):
+    def add_stage(
+        self,
+        name: str,
+        cmd: str,
+        deps: Optional[List[str]] = None,
+        params: Optional[List[str]] = None,
+        outs: Optional[List[str]] = None,
+        metrics: Optional[List[str]] = None,
+    ):
         """Add stage to pipeline"""
         stage_config = {
             "cmd": cmd,
             "deps": deps or [],
             "params": params or [],
             "outs": outs or [],
-            "metrics": metrics or []
+            "metrics": metrics or [],
         }
 
         # Load existing pipeline
         if self.pipeline_file.exists():
-            with open(self.pipeline_file, 'r') as f:
+            with open(self.pipeline_file, "r") as f:
                 pipeline = yaml.safe_load(f) or {"stages": {}}
         else:
             pipeline = {"stages": {}}
@@ -230,7 +233,7 @@ class DVCPipeline:
         pipeline["stages"][name] = stage_config
 
         # Save pipeline
-        with open(self.pipeline_file, 'w') as f:
+        with open(self.pipeline_file, "w") as f:
             yaml.dump(pipeline, f, default_flow_style=False)
 
         logger.info(f"Added stage '{name}' to pipeline")
@@ -251,9 +254,9 @@ class DVCPipeline:
 
         # Parse metrics (simplified)
         metrics = {}
-        for line in metrics_output.split('\n'):
-            if ':' in line:
-                key, value = line.split(':', 1)
+        for line in metrics_output.split("\n"):
+            if ":" in line:
+                key, value = line.split(":", 1)
                 try:
                     metrics[key.strip()] = float(value.strip())
                 except:
@@ -269,14 +272,14 @@ class DVCPipeline:
                 "cmd": "python src/prepare_data.py",
                 "deps": ["data/raw"],
                 "outs": ["data/processed"],
-                "params": ["prepare.test_split", "prepare.seed"]
+                "params": ["prepare.test_split", "prepare.seed"],
             },
             {
                 "name": "feature_engineering",
                 "cmd": "python src/featurize.py",
                 "deps": ["data/processed"],
                 "outs": ["data/features"],
-                "params": ["featurize.max_features", "featurize.ngrams"]
+                "params": ["featurize.max_features", "featurize.ngrams"],
             },
             {
                 "name": "train",
@@ -284,36 +287,27 @@ class DVCPipeline:
                 "deps": ["data/features"],
                 "outs": ["models/model.pkl"],
                 "params": ["train.epochs", "train.learning_rate"],
-                "metrics": [{"metrics.json": {"cache": False}}]
+                "metrics": [{"metrics.json": {"cache": False}}],
             },
             {
                 "name": "evaluate",
                 "cmd": "python src/evaluate.py",
                 "deps": ["models/model.pkl", "data/features"],
                 "metrics": [{"eval/metrics.json": {"cache": False}}],
-                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}]
-            }
+                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}],
+            },
         ]
 
         self.create_pipeline(stages)
 
         # Create default params file
         params = {
-            "prepare": {
-                "test_split": 0.2,
-                "seed": 42
-            },
-            "featurize": {
-                "max_features": 100,
-                "ngrams": 2
-            },
-            "train": {
-                "epochs": 10,
-                "learning_rate": 0.001
-            }
+            "prepare": {"test_split": 0.2, "seed": 42},
+            "featurize": {"max_features": 100, "ngrams": 2},
+            "train": {"epochs": 10, "learning_rate": 0.001},
         }
 
-        with open(self.params_file, 'w') as f:
+        with open(self.params_file, "w") as f:
             yaml.dump(params, f, default_flow_style=False)
 
         logger.info("Created ML pipeline with DVC")
@@ -329,17 +323,16 @@ class DataRegistry:
     def _load_registry(self) -> Dict[str, Any]:
         """Load data registry"""
         if self.registry_path.exists():
-            with open(self.registry_path, 'r') as f:
+            with open(self.registry_path, "r") as f:
                 return json.load(f)
         return {"datasets": {}}
 
     def _save_registry(self):
         """Save data registry"""
-        with open(self.registry_path, 'w') as f:
+        with open(self.registry_path, "w") as f:
             json.dump(self.registry, f, indent=2)
 
-    def register_dataset(self, name: str, path: str,
-                         version: str, metadata: Dict[str, Any]):
+    def register_dataset(self, name: str, path: str, version: str, metadata: Dict[str, Any]):
         """Register new dataset version"""
         if name not in self.registry["datasets"]:
             self.registry["datasets"][name] = {"versions": {}}
@@ -347,7 +340,7 @@ class DataRegistry:
         self.registry["datasets"][name]["versions"][version] = {
             "path": path,
             "metadata": metadata,
-            "registered": datetime.now().isoformat()
+            "registered": datetime.now().isoformat(),
         }
 
         self.registry["datasets"][name]["latest"] = version
@@ -487,7 +480,7 @@ evaluate:
         ".dvc/.gitignore": dvc_gitignore,
         ".dvcignore": dvcignore,
         "dvc.yaml": dvc_yaml,
-        "params.yaml": params_yaml
+        "params.yaml": params_yaml,
     }
 
 
@@ -508,11 +501,11 @@ if __name__ == "__main__":
         name="politician_trades",
         path="data/politician_trades.csv",
         version="v1.0",
-        metadata={"source": "congress", "records": 10000}
+        metadata={"source": "congress", "records": 10000},
     )
 
     # Create ML pipeline
     pipeline = DVCPipeline(config)
     pipeline.create_ml_pipeline()
 
-    logger.info("DVC setup complete")
+    logger.info("DVC setup complete")
mcli/ml/mlops/experiment_tracker.py CHANGED
@@ -1,19 +1,20 @@
 """MLflow experiment tracking and model registry"""
 
+import json
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import mlflow
 import mlflow.pytorch
 import mlflow.sklearn
-from mlflow.tracking import MlflowClient
-from mlflow.models.signature import ModelSignature, infer_signature
-import torch
 import numpy as np
 import pandas as pd
-from typing import Dict, Any, Optional, List, Union
-from dataclasses import dataclass
-from pathlib import Path
-import json
-import logging
-from datetime import datetime
+import torch
+from mlflow.models.signature import ModelSignature, infer_signature
+from mlflow.tracking import MlflowClient
 
 logger = logging.getLogger(__name__)
 
@@ -21,6 +22,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class MLflowConfig:
     """Configuration for MLflow tracking"""
+
     tracking_uri: str = "sqlite:///mlruns.db"
     experiment_name: str = "politician-trading-predictions"
     artifact_location: Optional[str] = None
@@ -32,13 +34,14 @@ class MLflowConfig:
         self.tags = {
             "project": "politician-trading",
             "framework": "pytorch",
-            "type": "stock-recommendation"
+            "type": "stock-recommendation",
         }
 
 
 @dataclass
 class ExperimentRun:
     """Container for experiment run information"""
+
     run_id: str
     experiment_id: str
     run_name: str
@@ -73,7 +76,7 @@ class ExperimentTracker:
             experiment_id = mlflow.create_experiment(
                 self.config.experiment_name,
                 artifact_location=self.config.artifact_location,
-                tags=self.config.tags
+                tags=self.config.tags,
             )
         else:
             experiment_id = experiment.experiment_id
@@ -105,7 +108,7 @@ class ExperimentTracker:
             metrics={},
             params={},
             artifacts=[],
-            start_time=datetime.now()
+            start_time=datetime.now(),
         )
 
         logger.info(f"Started MLflow run: {run_name} (ID: {run.info.run_id})")
@@ -156,11 +159,15 @@ class ExperimentTracker:
         self.current_run.artifacts.append(str(artifact_path))
         logger.debug(f"Logged artifact: {artifact_path}")
 
-    def log_model(self, model: Any, model_name: str,
-                  input_example: Optional[Union[np.ndarray, pd.DataFrame]] = None,
-                  signature: Optional[ModelSignature] = None,
-                  conda_env: Optional[Dict] = None,
-                  pip_requirements: Optional[List[str]] = None):
+    def log_model(
+        self,
+        model: Any,
+        model_name: str,
+        input_example: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+        signature: Optional[ModelSignature] = None,
+        conda_env: Optional[Dict] = None,
+        pip_requirements: Optional[List[str]] = None,
+    ):
         """Log model to current run"""
         if not self.current_run:
             raise ValueError("No active MLflow run. Call start_run() first.")
@@ -196,7 +203,7 @@ class ExperimentTracker:
                 signature=signature,
                 input_example=input_example,
                 conda_env=conda_env,
-                pip_requirements=pip_requirements
+                pip_requirements=pip_requirements,
             )
             framework = "pytorch"
         else:
@@ -207,7 +214,7 @@ class ExperimentTracker:
                 signature=signature,
                 input_example=input_example,
                 conda_env=conda_env,
-                pip_requirements=pip_requirements
+                pip_requirements=pip_requirements,
             )
             framework = "sklearn"
 
@@ -245,8 +252,10 @@ class ExperimentTracker:
         mlflow.end_run(status=status)
 
         duration = (self.current_run.end_time - self.current_run.start_time).total_seconds()
-        logger.info(f"Ended MLflow run {self.current_run.run_name} "
-                    f"(Duration: {duration:.2f}s, Status: {status})")
+        logger.info(
+            f"Ended MLflow run {self.current_run.run_name} "
+            f"(Duration: {duration:.2f}s, Status: {status})"
+        )
 
         current_run = self.current_run
         self.current_run = None
@@ -256,17 +265,17 @@ class ExperimentTracker:
         """Get run by ID"""
         return self.client.get_run(run_id)
 
-    def search_runs(self, filter_string: str = "",
-                    max_results: int = 100) -> List[mlflow.entities.Run]:
+    def search_runs(
+        self, filter_string: str = "", max_results: int = 100
+    ) -> List[mlflow.entities.Run]:
         """Search for runs in experiment"""
         return self.client.search_runs(
             experiment_ids=[self.experiment_id],
             filter_string=filter_string,
-            max_results=max_results
+            max_results=max_results,
        )
 
-    def compare_runs(self, run_ids: List[str],
-                     metrics: Optional[List[str]] = None) -> pd.DataFrame:
+    def compare_runs(self, run_ids: List[str], metrics: Optional[List[str]] = None) -> pd.DataFrame:
         """Compare multiple runs"""
         runs_data = []
 
@@ -307,15 +316,14 @@ class ModelRegistry:
         if config.registry_uri:
             mlflow.set_registry_uri(config.registry_uri)
 
-    def register_model(self, model_uri: str, model_name: str,
-                       tags: Optional[Dict[str, str]] = None) -> str:
+    def register_model(
+        self, model_uri: str, model_name: str, tags: Optional[Dict[str, str]] = None
+    ) -> str:
         """Register model in MLflow registry"""
         try:
             # Create registered model if it doesn't exist
             self.client.create_registered_model(
-                model_name,
-                tags=tags or {},
-                description=f"Model for {model_name}"
+                model_name, tags=tags or {}, description=f"Model for {model_name}"
             )
         except Exception as e:
             logger.debug(f"Model {model_name} already exists: {e}")
@@ -325,27 +333,28 @@ class ModelRegistry:
             name=model_name,
             source=model_uri,
             run_id=model_uri.split("/")[1] if "runs:/" in model_uri else None,
-            tags=tags or {}
+            tags=tags or {},
        )
 
         logger.info(f"Registered model {model_name} version {model_version.version}")
         return f"models:/{model_name}/{model_version.version}"
 
-    def transition_model_stage(self, model_name: str, version: int,
-                               stage: str, archive_existing: bool = True):
+    def transition_model_stage(
+        self, model_name: str, version: int, stage: str, archive_existing: bool = True
+    ):
         """Transition model version to new stage"""
         self.client.transition_model_version_stage(
             name=model_name,
             version=version,
             stage=stage,
-            archive_existing_versions=archive_existing
+            archive_existing_versions=archive_existing,
        )
 
         logger.info(f"Transitioned {model_name} v{version} to {stage}")
 
-    def load_model(self, model_name: str,
-                   version: Optional[int] = None,
-                   stage: Optional[str] = None) -> Any:
+    def load_model(
+        self, model_name: str, version: Optional[int] = None, stage: Optional[str] = None
+    ) -> Any:
         """Load model from registry"""
         if version:
             model_uri = f"models:/{model_name}/{version}"
@@ -362,8 +371,7 @@ class ModelRegistry:
         """Get specific model version details"""
         return self.client.get_model_version(model_name, version)
 
-    def get_latest_versions(self, model_name: str,
-                            stages: Optional[List[str]] = None):
+    def get_latest_versions(self, model_name: str, stages: Optional[List[str]] = None):
         """Get latest model versions for given stages"""
         return self.client.get_latest_versions(model_name, stages=stages)
 
@@ -374,4 +382,4 @@
 
     def search_models(self, filter_string: str = "") -> List:
         """Search registered models"""
-        return self.client.search_registered_models(filter_string=filter_string)
+        return self.client.search_registered_models(filter_string=filter_string)
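Finally, for orientation on what ExperimentTracker wraps: a minimal sketch of the underlying MLflow calls, using the defaults visible in MLflowConfig above (local SQLite tracking store). Requires mlflow to be installed; the run name and logged values are hypothetical:

import mlflow

mlflow.set_tracking_uri("sqlite:///mlruns.db")
mlflow.set_experiment("politician-trading-predictions")

# start_run()/end_run() bracket what ExperimentTracker.start_run() and
# end_run() manage; params and metrics land in the same tracking store
# that search_runs() and compare_runs() later query.
with mlflow.start_run(run_name="demo-run") as run:
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_metric("val_loss", 0.42)
    print(f"run_id: {run.info.run_id}")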