mcli-framework 7.1.1-py3-none-any.whl → 7.1.2-py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of mcli-framework might be problematic.
- mcli/app/completion_cmd.py +59 -49
- mcli/app/completion_helpers.py +60 -138
- mcli/app/logs_cmd.py +6 -2
- mcli/app/main.py +17 -14
- mcli/app/model_cmd.py +19 -4
- mcli/chat/chat.py +3 -2
- mcli/lib/search/cached_vectorizer.py +1 -0
- mcli/lib/services/data_pipeline.py +12 -5
- mcli/lib/services/lsh_client.py +68 -57
- mcli/ml/api/app.py +28 -36
- mcli/ml/api/middleware.py +8 -16
- mcli/ml/api/routers/admin_router.py +3 -1
- mcli/ml/api/routers/auth_router.py +32 -56
- mcli/ml/api/routers/backtest_router.py +3 -1
- mcli/ml/api/routers/data_router.py +3 -1
- mcli/ml/api/routers/model_router.py +35 -74
- mcli/ml/api/routers/monitoring_router.py +3 -1
- mcli/ml/api/routers/portfolio_router.py +3 -1
- mcli/ml/api/routers/prediction_router.py +60 -65
- mcli/ml/api/routers/trade_router.py +6 -2
- mcli/ml/api/routers/websocket_router.py +12 -9
- mcli/ml/api/schemas.py +10 -2
- mcli/ml/auth/auth_manager.py +49 -114
- mcli/ml/auth/models.py +30 -15
- mcli/ml/auth/permissions.py +12 -19
- mcli/ml/backtesting/backtest_engine.py +134 -108
- mcli/ml/backtesting/performance_metrics.py +142 -108
- mcli/ml/cache.py +12 -18
- mcli/ml/cli/main.py +37 -23
- mcli/ml/config/settings.py +29 -12
- mcli/ml/dashboard/app.py +122 -130
- mcli/ml/dashboard/app_integrated.py +216 -150
- mcli/ml/dashboard/app_supabase.py +176 -108
- mcli/ml/dashboard/app_training.py +212 -206
- mcli/ml/dashboard/cli.py +14 -5
- mcli/ml/data_ingestion/api_connectors.py +51 -81
- mcli/ml/data_ingestion/data_pipeline.py +127 -125
- mcli/ml/data_ingestion/stream_processor.py +72 -80
- mcli/ml/database/migrations/env.py +3 -2
- mcli/ml/database/models.py +112 -79
- mcli/ml/database/session.py +6 -5
- mcli/ml/experimentation/ab_testing.py +149 -99
- mcli/ml/features/ensemble_features.py +9 -8
- mcli/ml/features/political_features.py +6 -5
- mcli/ml/features/recommendation_engine.py +15 -14
- mcli/ml/features/stock_features.py +7 -6
- mcli/ml/features/test_feature_engineering.py +8 -7
- mcli/ml/logging.py +10 -15
- mcli/ml/mlops/data_versioning.py +57 -64
- mcli/ml/mlops/experiment_tracker.py +49 -41
- mcli/ml/mlops/model_serving.py +59 -62
- mcli/ml/mlops/pipeline_orchestrator.py +203 -149
- mcli/ml/models/base_models.py +8 -7
- mcli/ml/models/ensemble_models.py +6 -5
- mcli/ml/models/recommendation_models.py +7 -6
- mcli/ml/models/test_models.py +18 -14
- mcli/ml/monitoring/drift_detection.py +95 -74
- mcli/ml/monitoring/metrics.py +10 -22
- mcli/ml/optimization/portfolio_optimizer.py +172 -132
- mcli/ml/predictions/prediction_engine.py +62 -50
- mcli/ml/preprocessing/data_cleaners.py +6 -5
- mcli/ml/preprocessing/feature_extractors.py +7 -6
- mcli/ml/preprocessing/ml_pipeline.py +3 -2
- mcli/ml/preprocessing/politician_trading_preprocessor.py +11 -10
- mcli/ml/preprocessing/test_preprocessing.py +4 -4
- mcli/ml/scripts/populate_sample_data.py +36 -16
- mcli/ml/tasks.py +82 -83
- mcli/ml/tests/test_integration.py +86 -76
- mcli/ml/tests/test_training_dashboard.py +169 -142
- mcli/mygroup/test_cmd.py +2 -1
- mcli/self/self_cmd.py +31 -16
- mcli/self/test_cmd.py +2 -1
- mcli/workflow/dashboard/dashboard_cmd.py +13 -6
- mcli/workflow/lsh_integration.py +46 -58
- mcli/workflow/politician_trading/commands.py +576 -427
- mcli/workflow/politician_trading/config.py +7 -7
- mcli/workflow/politician_trading/connectivity.py +35 -33
- mcli/workflow/politician_trading/data_sources.py +72 -71
- mcli/workflow/politician_trading/database.py +18 -16
- mcli/workflow/politician_trading/demo.py +4 -3
- mcli/workflow/politician_trading/models.py +5 -5
- mcli/workflow/politician_trading/monitoring.py +13 -13
- mcli/workflow/politician_trading/scrapers.py +332 -224
- mcli/workflow/politician_trading/scrapers_california.py +116 -94
- mcli/workflow/politician_trading/scrapers_eu.py +70 -71
- mcli/workflow/politician_trading/scrapers_uk.py +118 -90
- mcli/workflow/politician_trading/scrapers_us_states.py +125 -92
- mcli/workflow/politician_trading/workflow.py +98 -71
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/METADATA +1 -1
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/RECORD +94 -94
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/WHEEL +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/entry_points.txt +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/licenses/LICENSE +0 -0
- {mcli_framework-7.1.1.dist-info → mcli_framework-7.1.2.dist-info}/top_level.txt +0 -0
mcli/ml/features/recommendation_engine.py
CHANGED

@@ -1,28 +1,29 @@
 """Stock recommendation engine that combines all feature engineering components"""
 
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Tuple, Union
-from dataclasses import dataclass, asdict
 import logging
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 import joblib
+import numpy as np
+import pandas as pd
 
-from .
-
-
-
+from .ensemble_features import (
+    DynamicFeatureSelector,
+    EnsembleFeatureBuilder,
+    FeatureInteractionEngine,
 )
 from .political_features import (
-    PoliticalInfluenceFeatures,
     CongressionalTrackingFeatures,
     PolicyImpactFeatures,
+    PoliticalInfluenceFeatures,
 )
-from .
-
-
-
+from .stock_features import (
+    MarketRegimeFeatures,
+    StockRecommendationFeatures,
+    TechnicalIndicatorFeatures,
 )
 
 logger = logging.getLogger(__name__)
mcli/ml/features/stock_features.py
CHANGED

@@ -1,13 +1,14 @@
 """Stock-specific feature engineering for recommendation models"""
 
-import numpy as np
-import pandas as pd
-from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Tuple, Union
-from dataclasses import dataclass
 import logging
-from collections import defaultdict
 import warnings
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
 
 logger = logging.getLogger(__name__)
 
mcli/ml/features/test_feature_engineering.py
CHANGED

@@ -1,14 +1,15 @@
 """Test script for feature engineering system"""
 
-import sys
 import os
+import sys
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../.."))
 
-import pandas as pd
-import numpy as np
-from datetime import datetime, timedelta
 import logging
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -209,7 +210,7 @@ def test_recommendation_engine():
     """Test the full recommendation engine"""
     logger.info("Testing recommendation engine...")
 
-    from recommendation_engine import
+    from recommendation_engine import RecommendationConfig, StockRecommendationEngine
 
     # Generate comprehensive test data
     trading_data = generate_mock_trading_data(100)
@@ -259,9 +260,9 @@ def test_feature_integration():
     """Test integration of all feature components"""
     logger.info("Testing feature integration...")
 
-    from stock_features import StockRecommendationFeatures
-    from political_features import PoliticalInfluenceFeatures
     from ensemble_features import EnsembleFeatureBuilder
+    from political_features import PoliticalInfluenceFeatures
+    from stock_features import StockRecommendationFeatures
 
     # Generate test data
     trading_data = generate_mock_trading_data(30)
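Every hunk above is a pure import reorder in isort's Black-compatible style: standard-library imports first, then third-party packages, then local modules, each group alphabetized and separated by one blank line. A minimal sketch of the convention these diffs converge on (module names are illustrative):

    # Standard library, alphabetized
    import logging
    from datetime import datetime, timedelta

    # Third-party packages, after one blank line
    import numpy as np
    import pandas as pd

    # Local / relative imports last
    from .stock_features import StockRecommendationFeatures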
mcli/ml/logging.py
CHANGED
@@ -1,11 +1,11 @@
 """Logging configuration for ML system"""
 
+import json
 import logging
 import sys
-from pathlib import Path
-from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
-import json
 from datetime import datetime
+from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
+from pathlib import Path
 
 from mcli.ml.config import settings
 
@@ -24,11 +24,11 @@ class StructuredFormatter(logging.Formatter):
             "line": record.lineno,
         }
 
-        if hasattr(record,
-            log_obj[
+        if hasattr(record, "request_id"):
+            log_obj["request_id"] = record.request_id
 
         if record.exc_info:
-            log_obj[
+            log_obj["exception"] = self.formatException(record.exc_info)
 
         return json.dumps(log_obj)
 
@@ -52,7 +52,7 @@ def setup_logging():
         console_formatter = StructuredFormatter()
     else:
         console_formatter = logging.Formatter(
-
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
 
     console_handler.setFormatter(console_formatter)
@@ -60,10 +60,7 @@ def setup_logging():
 
     # File handler for all logs
     file_handler = TimedRotatingFileHandler(
-        log_dir / "mcli_ml.log",
-        when="midnight",
-        interval=1,
-        backupCount=30
+        log_dir / "mcli_ml.log", when="midnight", interval=1, backupCount=30
     )
     file_handler.setLevel(logging.DEBUG)
     file_handler.setFormatter(StructuredFormatter())
@@ -71,9 +68,7 @@ def setup_logging():
 
     # Error file handler
     error_handler = RotatingFileHandler(
-        log_dir / "mcli_ml_errors.log",
-        maxBytes=10 * 1024 * 1024,  # 10MB
-        backupCount=5
+        log_dir / "mcli_ml_errors.log", maxBytes=10 * 1024 * 1024, backupCount=5  # 10MB
     )
     error_handler.setLevel(logging.ERROR)
     error_handler.setFormatter(StructuredFormatter())
@@ -82,4 +77,4 @@ def setup_logging():
 
 def get_logger(name: str) -> logging.Logger:
     """Get a logger instance"""
-    return logging.getLogger(name)
+    return logging.getLogger(name)
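For context, the StructuredFormatter touched in these hunks serializes each log record to a single JSON object. A self-contained sketch of the same pattern, assuming the field layout visible in the diff ("line", "request_id", "exception"); the surrounding field names are illustrative:

    import json
    import logging

    class StructuredFormatter(logging.Formatter):
        # Serialize each record as one JSON object per line.
        def format(self, record: logging.LogRecord) -> str:
            log_obj = {
                "logger": record.name,
                "level": record.levelname,
                "message": record.getMessage(),
                "line": record.lineno,
            }
            # Optional request correlation id, as in the patched code
            if hasattr(record, "request_id"):
                log_obj["request_id"] = record.request_id
            if record.exc_info:
                log_obj["exception"] = self.formatException(record.exc_info)
            return json.dumps(log_obj)

    handler = logging.StreamHandler()
    handler.setFormatter(StructuredFormatter())
    logging.getLogger("demo").addHandler(handler)
    logging.getLogger("demo").warning("structured logging works")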
mcli/ml/mlops/data_versioning.py
CHANGED
@@ -1,16 +1,17 @@
 """DVC integration for data versioning and pipeline management"""
 
-import subprocess
-import json
-import yaml
-from pathlib import Path
-from typing import Dict, Any, Optional, List, Union
-from dataclasses import dataclass
 import hashlib
+import json
 import logging
+import os
+import subprocess
+from dataclasses import dataclass
 from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import pandas as pd
-import
+import yaml
 
 logger = logging.getLogger(__name__)
 
@@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class DVCConfig:
     """DVC configuration"""
+
     project_root: Path = Path(".")
     remote_storage: str = "s3://my-bucket/dvc-storage"  # or local path
     cache_dir: Path = Path(".dvc/cache")
@@ -49,10 +51,7 @@ class DataVersionControl:
         """Run DVC command"""
         try:
             result = subprocess.run(
-                command.split(),
-                capture_output=True,
-                text=True,
-                cwd=self.project_root
+                command.split(), capture_output=True, text=True, cwd=self.project_root
             )
 
             if result.returncode != 0:
@@ -67,8 +66,7 @@ class DataVersionControl:
             logger.error(f"Failed to run DVC command: {e}")
             raise
 
-    def add_data(self, data_path: Union[str, Path],
-                 description: Optional[str] = None) -> str:
+    def add_data(self, data_path: Union[str, Path], description: Optional[str] = None) -> str:
         """Add data file or directory to DVC tracking"""
         data_path = Path(data_path)
 
@@ -82,7 +80,7 @@ class DataVersionControl:
         metadata = self._generate_metadata(data_path, description)
         metadata_path = data_path.with_suffix(".meta.json")
 
-        with open(metadata_path,
+        with open(metadata_path, "w") as f:
             json.dump(metadata, f, indent=2)
 
         # Commit if auto-commit enabled
@@ -115,13 +113,9 @@ class DataVersionControl:
         status_output = self._run_command("dvc status")
 
         # Parse status
-        status = {
-            "modified": [],
-            "not_in_cache": [],
-            "deleted": []
-        }
+        status = {"modified": [], "not_in_cache": [], "deleted": []}
 
-        for line in status_output.split(
+        for line in status_output.split("\n"):
             if "modified:" in line:
                 status["modified"].append(line.split(":")[-1].strip())
             elif "not in cache:" in line:
@@ -131,8 +125,9 @@ class DataVersionControl:
 
         return status
 
-    def _generate_metadata(
-
+    def _generate_metadata(
+        self, data_path: Path, description: Optional[str] = None
+    ) -> Dict[str, Any]:
         """Generate metadata for data file"""
         stat = data_path.stat()
 
@@ -143,13 +138,17 @@ class DataVersionControl:
             "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
             "hash": self._calculate_hash(data_path),
             "description": description or "",
-            "type": "directory" if data_path.is_dir() else "file"
+            "type": "directory" if data_path.is_dir() else "file",
         }
 
         # Add data-specific metadata
-        if data_path.suffix in [
+        if data_path.suffix in [".csv", ".parquet"]:
             try:
-                df =
+                df = (
+                    pd.read_csv(data_path)
+                    if data_path.suffix == ".csv"
+                    else pd.read_parquet(data_path)
+                )
                 metadata["rows"] = len(df)
                 metadata["columns"] = len(df.columns)
                 metadata["column_names"] = df.columns.tolist()
@@ -196,32 +195,36 @@ class DVCPipeline:
                 "params": stage.get("params", []),
                 "outs": stage.get("outs", []),
                 "metrics": stage.get("metrics", []),
-                "plots": stage.get("plots", [])
+                "plots": stage.get("plots", []),
             }
 
         # Save pipeline
-        with open(self.pipeline_file,
+        with open(self.pipeline_file, "w") as f:
             yaml.dump(pipeline, f, default_flow_style=False)
 
         logger.info(f"Created DVC pipeline with {len(stages)} stages")
 
-    def add_stage(
-
-
-
-
+    def add_stage(
+        self,
+        name: str,
+        cmd: str,
+        deps: Optional[List[str]] = None,
+        params: Optional[List[str]] = None,
+        outs: Optional[List[str]] = None,
+        metrics: Optional[List[str]] = None,
+    ):
         """Add stage to pipeline"""
         stage_config = {
             "cmd": cmd,
             "deps": deps or [],
             "params": params or [],
             "outs": outs or [],
-            "metrics": metrics or []
+            "metrics": metrics or [],
         }
 
         # Load existing pipeline
         if self.pipeline_file.exists():
-            with open(self.pipeline_file,
+            with open(self.pipeline_file, "r") as f:
                 pipeline = yaml.safe_load(f) or {"stages": {}}
         else:
             pipeline = {"stages": {}}
@@ -230,7 +233,7 @@ class DVCPipeline:
         pipeline["stages"][name] = stage_config
 
         # Save pipeline
-        with open(self.pipeline_file,
+        with open(self.pipeline_file, "w") as f:
             yaml.dump(pipeline, f, default_flow_style=False)
 
         logger.info(f"Added stage '{name}' to pipeline")
@@ -251,9 +254,9 @@ class DVCPipeline:
 
         # Parse metrics (simplified)
         metrics = {}
-        for line in metrics_output.split(
-            if
-                key, value = line.split(
+        for line in metrics_output.split("\n"):
+            if ":" in line:
+                key, value = line.split(":", 1)
                 try:
                     metrics[key.strip()] = float(value.strip())
                 except:
@@ -269,14 +272,14 @@ class DVCPipeline:
                 "cmd": "python src/prepare_data.py",
                 "deps": ["data/raw"],
                 "outs": ["data/processed"],
-                "params": ["prepare.test_split", "prepare.seed"]
+                "params": ["prepare.test_split", "prepare.seed"],
             },
             {
                 "name": "feature_engineering",
                 "cmd": "python src/featurize.py",
                 "deps": ["data/processed"],
                 "outs": ["data/features"],
-                "params": ["featurize.max_features", "featurize.ngrams"]
+                "params": ["featurize.max_features", "featurize.ngrams"],
             },
             {
                 "name": "train",
@@ -284,36 +287,27 @@ class DVCPipeline:
                 "deps": ["data/features"],
                 "outs": ["models/model.pkl"],
                 "params": ["train.epochs", "train.learning_rate"],
-                "metrics": [{"metrics.json": {"cache": False}}]
+                "metrics": [{"metrics.json": {"cache": False}}],
             },
             {
                 "name": "evaluate",
                 "cmd": "python src/evaluate.py",
                 "deps": ["models/model.pkl", "data/features"],
                 "metrics": [{"eval/metrics.json": {"cache": False}}],
-                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}]
-            }
+                "plots": [{"eval/plots/roc.json": {"x": "fpr", "y": "tpr"}}],
+            },
         ]
 
         self.create_pipeline(stages)
 
         # Create default params file
         params = {
-            "prepare": {
-
-
-            },
-            "featurize": {
-                "max_features": 100,
-                "ngrams": 2
-            },
-            "train": {
-                "epochs": 10,
-                "learning_rate": 0.001
-            }
+            "prepare": {"test_split": 0.2, "seed": 42},
+            "featurize": {"max_features": 100, "ngrams": 2},
+            "train": {"epochs": 10, "learning_rate": 0.001},
         }
 
-        with open(self.params_file,
+        with open(self.params_file, "w") as f:
             yaml.dump(params, f, default_flow_style=False)
 
         logger.info("Created ML pipeline with DVC")
@@ -329,17 +323,16 @@ class DataRegistry:
     def _load_registry(self) -> Dict[str, Any]:
         """Load data registry"""
         if self.registry_path.exists():
-            with open(self.registry_path,
+            with open(self.registry_path, "r") as f:
                 return json.load(f)
         return {"datasets": {}}
 
     def _save_registry(self):
         """Save data registry"""
-        with open(self.registry_path,
+        with open(self.registry_path, "w") as f:
             json.dump(self.registry, f, indent=2)
 
-    def register_dataset(self, name: str, path: str,
-                         version: str, metadata: Dict[str, Any]):
+    def register_dataset(self, name: str, path: str, version: str, metadata: Dict[str, Any]):
         """Register new dataset version"""
         if name not in self.registry["datasets"]:
             self.registry["datasets"][name] = {"versions": {}}
@@ -347,7 +340,7 @@ class DataRegistry:
         self.registry["datasets"][name]["versions"][version] = {
             "path": path,
             "metadata": metadata,
-            "registered": datetime.now().isoformat()
+            "registered": datetime.now().isoformat(),
         }
 
         self.registry["datasets"][name]["latest"] = version
@@ -487,7 +480,7 @@ evaluate:
         ".dvc/.gitignore": dvc_gitignore,
         ".dvcignore": dvcignore,
         "dvc.yaml": dvc_yaml,
-        "params.yaml": params_yaml
+        "params.yaml": params_yaml,
     }
 
 
@@ -508,11 +501,11 @@ if __name__ == "__main__":
         name="politician_trades",
         path="data/politician_trades.csv",
         version="v1.0",
-        metadata={"source": "congress", "records": 10000}
+        metadata={"source": "congress", "records": 10000},
     )
 
     # Create ML pipeline
     pipeline = DVCPipeline(config)
    pipeline.create_ml_pipeline()
 
-    logger.info("DVC setup complete")
+    logger.info("DVC setup complete")
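The `if __name__ == "__main__":` block in this file hints at the intended API for these classes. A hedged usage sketch built only from names visible in the diff; constructor signatures beyond `DVCPipeline(config)` are assumptions:

    from pathlib import Path

    from mcli.ml.mlops.data_versioning import (
        DataRegistry,
        DataVersionControl,
        DVCConfig,
        DVCPipeline,
    )

    config = DVCConfig(project_root=Path("."), remote_storage="s3://my-bucket/dvc-storage")

    # Track a raw data file and write its .meta.json sidecar
    dvc = DataVersionControl(config)  # constructor argument is an assumption
    dvc.add_data("data/politician_trades.csv", description="raw congressional trades")

    # Record the dataset version in the JSON registry
    registry = DataRegistry(Path("data_registry.json"))  # path argument is an assumption
    registry.register_dataset(
        name="politician_trades",
        path="data/politician_trades.csv",
        version="v1.0",
        metadata={"source": "congress", "records": 10000},
    )

    # Write dvc.yaml plus the default params.yaml (prepare/featurize/train/evaluate stages)
    pipeline = DVCPipeline(config)
    pipeline.create_ml_pipeline()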
mcli/ml/mlops/experiment_tracker.py
CHANGED

@@ -1,19 +1,20 @@
 """MLflow experiment tracking and model registry"""
 
+import json
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import mlflow
 import mlflow.pytorch
 import mlflow.sklearn
-from mlflow.tracking import MlflowClient
-from mlflow.models.signature import ModelSignature, infer_signature
-import torch
 import numpy as np
 import pandas as pd
-
-from
-from
-import json
-import logging
-from datetime import datetime
+import torch
+from mlflow.models.signature import ModelSignature, infer_signature
+from mlflow.tracking import MlflowClient
 
 logger = logging.getLogger(__name__)
 
@@ -21,6 +22,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class MLflowConfig:
     """Configuration for MLflow tracking"""
+
     tracking_uri: str = "sqlite:///mlruns.db"
     experiment_name: str = "politician-trading-predictions"
     artifact_location: Optional[str] = None
@@ -32,13 +34,14 @@ class MLflowConfig:
         self.tags = {
             "project": "politician-trading",
             "framework": "pytorch",
-            "type": "stock-recommendation"
+            "type": "stock-recommendation",
         }
 
 
 @dataclass
 class ExperimentRun:
     """Container for experiment run information"""
+
     run_id: str
     experiment_id: str
     run_name: str
@@ -73,7 +76,7 @@ class ExperimentTracker:
             experiment_id = mlflow.create_experiment(
                 self.config.experiment_name,
                 artifact_location=self.config.artifact_location,
-                tags=self.config.tags
+                tags=self.config.tags,
             )
         else:
             experiment_id = experiment.experiment_id
@@ -105,7 +108,7 @@ class ExperimentTracker:
             metrics={},
             params={},
             artifacts=[],
-            start_time=datetime.now()
+            start_time=datetime.now(),
         )
 
         logger.info(f"Started MLflow run: {run_name} (ID: {run.info.run_id})")
@@ -156,11 +159,15 @@ class ExperimentTracker:
         self.current_run.artifacts.append(str(artifact_path))
         logger.debug(f"Logged artifact: {artifact_path}")
 
-    def log_model(
-
-
-
-
+    def log_model(
+        self,
+        model: Any,
+        model_name: str,
+        input_example: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+        signature: Optional[ModelSignature] = None,
+        conda_env: Optional[Dict] = None,
+        pip_requirements: Optional[List[str]] = None,
+    ):
         """Log model to current run"""
         if not self.current_run:
             raise ValueError("No active MLflow run. Call start_run() first.")
@@ -196,7 +203,7 @@ class ExperimentTracker:
                 signature=signature,
                 input_example=input_example,
                 conda_env=conda_env,
-                pip_requirements=pip_requirements
+                pip_requirements=pip_requirements,
             )
             framework = "pytorch"
         else:
@@ -207,7 +214,7 @@ class ExperimentTracker:
                 signature=signature,
                 input_example=input_example,
                 conda_env=conda_env,
-                pip_requirements=pip_requirements
+                pip_requirements=pip_requirements,
             )
             framework = "sklearn"
 
@@ -245,8 +252,10 @@ class ExperimentTracker:
         mlflow.end_run(status=status)
 
         duration = (self.current_run.end_time - self.current_run.start_time).total_seconds()
-        logger.info(
-
+        logger.info(
+            f"Ended MLflow run {self.current_run.run_name} "
+            f"(Duration: {duration:.2f}s, Status: {status})"
+        )
 
         current_run = self.current_run
         self.current_run = None
@@ -256,17 +265,17 @@ class ExperimentTracker:
         """Get run by ID"""
         return self.client.get_run(run_id)
 
-    def search_runs(
-
+    def search_runs(
+        self, filter_string: str = "", max_results: int = 100
+    ) -> List[mlflow.entities.Run]:
         """Search for runs in experiment"""
         return self.client.search_runs(
             experiment_ids=[self.experiment_id],
             filter_string=filter_string,
-            max_results=max_results
+            max_results=max_results,
         )
 
-    def compare_runs(self, run_ids: List[str],
-                     metrics: Optional[List[str]] = None) -> pd.DataFrame:
+    def compare_runs(self, run_ids: List[str], metrics: Optional[List[str]] = None) -> pd.DataFrame:
         """Compare multiple runs"""
         runs_data = []
 
@@ -307,15 +316,14 @@ class ModelRegistry:
         if config.registry_uri:
             mlflow.set_registry_uri(config.registry_uri)
 
-    def register_model(
-
+    def register_model(
+        self, model_uri: str, model_name: str, tags: Optional[Dict[str, str]] = None
+    ) -> str:
         """Register model in MLflow registry"""
         try:
             # Create registered model if it doesn't exist
             self.client.create_registered_model(
-                model_name,
-                tags=tags or {},
-                description=f"Model for {model_name}"
+                model_name, tags=tags or {}, description=f"Model for {model_name}"
             )
         except Exception as e:
             logger.debug(f"Model {model_name} already exists: {e}")
@@ -325,27 +333,28 @@ class ModelRegistry:
             name=model_name,
             source=model_uri,
             run_id=model_uri.split("/")[1] if "runs:/" in model_uri else None,
-            tags=tags or {}
+            tags=tags or {},
         )
 
         logger.info(f"Registered model {model_name} version {model_version.version}")
         return f"models:/{model_name}/{model_version.version}"
 
-    def transition_model_stage(
-
+    def transition_model_stage(
+        self, model_name: str, version: int, stage: str, archive_existing: bool = True
+    ):
         """Transition model version to new stage"""
        self.client.transition_model_version_stage(
             name=model_name,
             version=version,
             stage=stage,
-            archive_existing_versions=archive_existing
+            archive_existing_versions=archive_existing,
         )
 
         logger.info(f"Transitioned {model_name} v{version} to {stage}")
 
-    def load_model(
-
-
+    def load_model(
+        self, model_name: str, version: Optional[int] = None, stage: Optional[str] = None
+    ) -> Any:
         """Load model from registry"""
         if version:
             model_uri = f"models:/{model_name}/{version}"
@@ -362,8 +371,7 @@ class ModelRegistry:
         """Get specific model version details"""
         return self.client.get_model_version(model_name, version)
 
-    def get_latest_versions(self, model_name: str,
-                            stages: Optional[List[str]] = None):
+    def get_latest_versions(self, model_name: str, stages: Optional[List[str]] = None):
         """Get latest model versions for given stages"""
         return self.client.get_latest_versions(model_name, stages=stages)
 
@@ -374,4 +382,4 @@ class ModelRegistry:
 
     def search_models(self, filter_string: str = "") -> List:
         """Search registered models"""
-        return self.client.search_registered_models(filter_string=filter_string)
+        return self.client.search_registered_models(filter_string=filter_string)
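The reformatted signatures above outline the tracker and registry API. A hedged end-to-end sketch using those names; the `start_run` call and constructor arguments are assumptions inferred from the run bookkeeping in the diff, and the model here is a trivial stand-in for a fitted PyTorch or sklearn estimator:

    from sklearn.linear_model import LogisticRegression

    from mcli.ml.mlops.experiment_tracker import (
        ExperimentTracker,
        MLflowConfig,
        ModelRegistry,
    )

    model = LogisticRegression().fit([[0.0], [1.0]], [0, 1])  # placeholder model

    config = MLflowConfig(tracking_uri="sqlite:///mlruns.db")

    tracker = ExperimentTracker(config)  # constructor argument is an assumption
    tracker.start_run("baseline")        # inferred from run_name/start_time handling
    tracker.log_model(model, model_name="stock-recommender")
    tracker.end_run()

    registry = ModelRegistry(config)
    uri = registry.register_model("runs:/<run_id>/stock-recommender", "stock-recommender")
    registry.transition_model_stage("stock-recommender", version=1, stage="Staging")
    staged = registry.load_model("stock-recommender", stage="Staging")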