pulseflow-mlops 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulseflow_mlops-0.1.0/LICENSE +21 -0
- pulseflow_mlops-0.1.0/PKG-INFO +56 -0
- pulseflow_mlops-0.1.0/README.md +41 -0
- pulseflow_mlops-0.1.0/airflow/dags/mlops_workflow_dag.py +56 -0
- pulseflow_mlops-0.1.0/deployment/app/__init__.py +0 -0
- pulseflow_mlops-0.1.0/deployment/app/main.py +170 -0
- pulseflow_mlops-0.1.0/deployment/app/model_loader.py +79 -0
- pulseflow_mlops-0.1.0/etl/__init__.py +0 -0
- pulseflow_mlops-0.1.0/etl/data_ingestion.py +36 -0
- pulseflow_mlops-0.1.0/etl/data_preprocessing.py +31 -0
- pulseflow_mlops-0.1.0/models/generate_model.py +71 -0
- pulseflow_mlops-0.1.0/pulseflow_mlops.egg-info/PKG-INFO +56 -0
- pulseflow_mlops-0.1.0/pulseflow_mlops.egg-info/SOURCES.txt +18 -0
- pulseflow_mlops-0.1.0/pulseflow_mlops.egg-info/dependency_links.txt +1 -0
- pulseflow_mlops-0.1.0/pulseflow_mlops.egg-info/top_level.txt +5 -0
- pulseflow_mlops-0.1.0/pyproject.toml +25 -0
- pulseflow_mlops-0.1.0/setup.cfg +4 -0
- pulseflow_mlops-0.1.0/training/__init__.py +0 -0
- pulseflow_mlops-0.1.0/training/evaluate_model.py +65 -0
- pulseflow_mlops-0.1.0/training/train_model.py +92 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Anil Prasad
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pulseflow-mlops
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling
|
|
5
|
+
Author-email: Anil Prasad <anil@ambharii.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/anilatambharii/PulseFlow
|
|
8
|
+
Project-URL: Documentation, https://github.com/anilatambharii/PulseFlow/blob/main/README.md
|
|
9
|
+
Keywords: mlops,machine-learning,fastapi,airflow,mlflow,etl
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
|
|
16
|
+
# Enterprise MLOps Pipeline
|
|
17
|
+
|
|
18
|
+
This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
|
|
19
|
+
|
|
20
|
+
## Architecture Overview
|
|
21
|
+
|
|
22
|
+
Components included:
|
|
23
|
+
- ETL Pipeline: Data ingestion and preprocessing.
|
|
24
|
+
- Training Pipeline: Model training with MLflow tracking.
|
|
25
|
+
- Deployment Service: FastAPI microservice for real-time inference.
|
|
26
|
+
- Airflow Orchestration: Workflow automation for end-to-end pipelines.
|
|
27
|
+
- Dockerized Stack: Easily deployable with Docker Compose.
|
|
28
|
+
|
|
29
|
+
## Run Locally
|
|
30
|
+
|
|
31
|
+
### Prerequisites
|
|
32
|
+
- Python 3.10+
|
|
33
|
+
- Docker & Docker Compose
|
|
34
|
+
|
|
35
|
+
### 1. Install dependencies
|
|
36
|
+
python -m venv .venv
|
|
37
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
38
|
+
pip install -r requirements.txt
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
### 2. Run the pipeline manually
|
|
42
|
+
|
|
43
|
+
python etl/data_ingestion.py
|
|
44
|
+
python etl/data_preprocessing.py
|
|
45
|
+
python training/train_model.py
|
|
46
|
+
uvicorn deployment.app.main:app --reload
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
### 3. Start MLflow and Airflow (optional)
|
|
50
|
+
|
|
51
|
+
mlflow ui &
|
|
52
|
+
airflow db init && airflow webserver -p 8080 & airflow scheduler &
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
### 4. Run full stack with Docker
|
|
56
|
+
docker-compose up --build
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Enterprise MLOps Pipeline
|
|
2
|
+
|
|
3
|
+
This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
|
|
4
|
+
|
|
5
|
+
## Architecture Overview
|
|
6
|
+
|
|
7
|
+
Components included:
|
|
8
|
+
- ETL Pipeline: Data ingestion and preprocessing.
|
|
9
|
+
- Training Pipeline: Model training with MLflow tracking.
|
|
10
|
+
- Deployment Service: FastAPI microservice for real-time inference.
|
|
11
|
+
- Airflow Orchestration: Workflow automation for end-to-end pipelines.
|
|
12
|
+
- Dockerized Stack: Easily deployable with Docker Compose.
|
|
13
|
+
|
|
14
|
+
## Run Locally
|
|
15
|
+
|
|
16
|
+
### Prerequisites
|
|
17
|
+
- Python 3.10+
|
|
18
|
+
- Docker & Docker Compose
|
|
19
|
+
|
|
20
|
+
### 1. Install dependencies
|
|
21
|
+
python -m venv .venv
|
|
22
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
23
|
+
pip install -r requirements.txt
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
### 2. Run the pipeline manually
|
|
27
|
+
|
|
28
|
+
python etl/data_ingestion.py
|
|
29
|
+
python etl/data_preprocessing.py
|
|
30
|
+
python training/train_model.py
|
|
31
|
+
uvicorn deployment.app.main:app --reload
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
### 3. Start MLflow and Airflow (optional)
|
|
35
|
+
|
|
36
|
+
mlflow ui &
|
|
37
|
+
airflow db init && airflow webserver -p 8080 & airflow scheduler &
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
### 4. Run full stack with Docker
|
|
41
|
+
docker-compose up --build
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import subprocess


def run_etl():
    """Run the two ETL stages (ingestion, then preprocessing) as subprocesses."""
    # check=True turns a non-zero exit code into CalledProcessError, which
    # fails the Airflow task and triggers the configured retries.
    subprocess.run(['python', 'etl/data_ingestion.py'], check=True)
    subprocess.run(['python', 'etl/data_preprocessing.py'], check=True)


def run_training():
    """Train the model by invoking the training script as a subprocess."""
    subprocess.run(['python', 'training/train_model.py'], check=True)


def run_deployment():
    """Start the FastAPI inference server via uvicorn.

    NOTE(review): subprocess.run blocks until uvicorn exits, so this task
    occupies a worker slot for as long as the server runs and only
    "succeeds" when the server stops — confirm this is intended rather
    than deploying out-of-band.
    """
    subprocess.run(['uvicorn', 'deployment.app.main:app', '--host', '0.0.0.0', '--port', '8000'], check=True)


# Defaults applied to every task in the DAG (retried once, 2 min apart).
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=2),
}

# Daily end-to-end pipeline: ETL -> training -> deployment.
# NOTE(review): the scripts are invoked with relative paths, so the tasks
# presumably assume the scheduler/worker CWD is the repository root — verify.
dag = DAG(
    'mlops_pipeline_dag',
    default_args=default_args,
    description='End-to-end Enterprise MLOps workflow',
    schedule_interval='@daily',
    start_date=datetime(2025, 1, 1),
    catchup=False,
)

etl_task = PythonOperator(
    task_id='run_etl',
    python_callable=run_etl,
    dag=dag,
)

train_task = PythonOperator(
    task_id='run_training',
    python_callable=run_training,
    dag=dag,
)

deploy_task = PythonOperator(
    task_id='run_deployment',
    python_callable=run_deployment,
    dag=dag,
)

# Linear dependency chain: each stage starts only after the previous succeeds.
etl_task >> train_task >> deploy_task
|
|
File without changes
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pandas as pd
from typing import List, Dict
import sys
import os

# Add parent directory to path for imports
# NOTE(review): this sys.path hack makes `app.model_loader` importable when
# the module is run directly; prefer package-relative imports if possible.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.model_loader import ModelLoader

# Initialize FastAPI app
app = FastAPI(
    title="Enterprise MLOps Prediction API",
    description="Production-grade ML model serving with FastAPI",
    version="1.0.0"
)

# Initialize model loader
# NOTE(review): the loader presumably loads the model eagerly on
# construction, so a missing models/saved_model.pkl (a CWD-relative path)
# would make this module fail at import time — confirm intended.
model_loader = ModelLoader(model_path="models/saved_model.pkl")


class PredictionInput(BaseModel):
    """Schema for single prediction request"""
    # Mapping of feature name -> numeric value for a single sample.
    features: Dict[str, float]

    class Config:
        # NOTE(review): `schema_extra` is the Pydantic v1 config key;
        # Pydantic v2 expects `json_schema_extra` — verify the pinned version.
        schema_extra = {
            "example": {
                "features": {
                    "feature1": 50.0,
                    "feature2": 75.0
                }
            }
        }


class BatchPredictionInput(BaseModel):
    """Schema for batch prediction request"""
    # One feature dict per sample, all scored in a single call.
    data: List[Dict[str, float]]

    class Config:
        schema_extra = {
            "example": {
                "data": [
                    {"feature1": 50.0, "feature2": 75.0},
                    {"feature1": 30.0, "feature2": 45.0}
                ]
            }
        }


class PredictionResponse(BaseModel):
    """Schema for prediction response"""
    # NOTE(review): fields prefixed `model_` trigger protected-namespace
    # warnings under Pydantic v2 — confirm against the pinned version.
    prediction: float
    model_version: str


class BatchPredictionResponse(BaseModel):
    """Schema for batch prediction response"""
    predictions: List[float]
    model_version: str
    count: int


@app.get("/")
def root():
    """Health check endpoint"""
    return {
        "status": "online",
        "service": "Enterprise MLOps Prediction API",
        "version": "1.0.0"
    }


@app.get("/health")
def health_check():
    """Detailed health check"""
    # Reports whether the global loader currently holds a model object.
    model_status = "loaded" if model_loader.model is not None else "not_loaded"
    return {
        "status": "healthy",
        "model_status": model_status,
        "model_path": model_loader.model_path
    }


@app.post("/predict", response_model=PredictionResponse)
def predict(input_data: PredictionInput):
    """
    Single prediction endpoint

    Args:
        input_data: Dictionary of feature names and values

    Returns:
        Prediction result and model version
    """
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([input_data.features])

        # Make prediction
        prediction = model_loader.predict(df)

        return PredictionResponse(
            prediction=float(prediction[0]),
            model_version=model_loader.get_model_version()
        )

    except Exception as e:
        # Any failure (bad features, unloaded model, ...) surfaces as a 500.
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")


@app.post("/predict/batch", response_model=BatchPredictionResponse)
def predict_batch(input_data: BatchPredictionInput):
    """
    Batch prediction endpoint

    Args:
        input_data: List of feature dictionaries

    Returns:
        List of predictions and model version
    """
    try:
        # Convert input to DataFrame
        df = pd.DataFrame(input_data.data)

        # Make predictions
        predictions = model_loader.predict(df)

        return BatchPredictionResponse(
            predictions=[float(p) for p in predictions],
            model_version=model_loader.get_model_version(),
            count=len(predictions)
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")


@app.get("/model/info")
def model_info():
    """Get model metadata and information"""
    return {
        "model_path": model_loader.model_path,
        "model_version": model_loader.get_model_version(),
        "model_type": str(type(model_loader.model).__name__) if model_loader.model else None,
        "status": "loaded" if model_loader.model else "not_loaded"
    }


@app.post("/model/reload")
def reload_model():
    """Reload the model from disk"""
    # Allows hot-swapping a retrained artifact without restarting the server.
    try:
        model_loader.load_model()
        return {
            "status": "success",
            "message": "Model reloaded successfully",
            "model_version": model_loader.get_model_version()
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Model reload error: {str(e)}")


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import joblib
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ModelLoader:
    """Loads a joblib-serialized model from disk and serves predictions,
    exposing lightweight metadata (version, load timestamp) about the
    currently loaded artifact.
    """

    def __init__(self, model_path: str = "models/saved_model.pkl"):
        """Create the loader and eagerly load the model from disk.

        Args:
            model_path: Path to the saved model file.

        Raises:
            FileNotFoundError: if no file exists at *model_path*.
        """
        self.model_path = model_path
        self.model = None
        self.loaded_at = None
        # Eager load: fail fast at construction if the artifact is missing.
        self.load_model()

    def load_model(self):
        """Deserialize the model artifact from disk and record the load time."""
        if not os.path.exists(self.model_path):
            raise FileNotFoundError(f"Model file not found at {self.model_path}")

        print(f"Loading model from {self.model_path}...")
        self.model = joblib.load(self.model_path)
        self.loaded_at = datetime.now()
        print(f"Model loaded successfully at {self.loaded_at}")

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """Score *features* with the loaded model.

        Args:
            features: DataFrame whose columns match the model's inputs.

        Returns:
            Array of predictions, one per input row.

        Raises:
            ValueError: if no model is currently loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Call load_model() first.")

        return self.model.predict(features)

    def get_model_version(self) -> str:
        """Derive a version string from the artifact's mtime.

        Returns:
            A ``YYYYMMDD_HHMMSS`` timestamp, or ``"unknown"`` if the file
            is gone.
        """
        if os.path.exists(self.model_path):
            stamp = datetime.fromtimestamp(os.path.getmtime(self.model_path))
            return stamp.strftime("%Y%m%d_%H%M%S")
        return "unknown"

    def get_model_info(self) -> dict:
        """Summarize the loader state as a plain dict of metadata."""
        kind = str(type(self.model).__name__) if self.model else None
        when = self.loaded_at.isoformat() if self.loaded_at else None
        return {
            "model_path": self.model_path,
            "model_type": kind,
            "loaded_at": when,
            "version": self.get_model_version(),
        }
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
def load_data(source_path: str, output_path: str):
|
|
5
|
+
if not os.path.exists(source_path):
|
|
6
|
+
raise FileNotFoundError(f"Source data not found at {source_path}")
|
|
7
|
+
|
|
8
|
+
print(f"Loading data from {source_path}...")
|
|
9
|
+
df = pd.read_csv(source_path)
|
|
10
|
+
print(f"Loaded {len(df)} records.")
|
|
11
|
+
|
|
12
|
+
# Basic cleaning: drop duplicates and NaNs
|
|
13
|
+
df = df.drop_duplicates().dropna()
|
|
14
|
+
|
|
15
|
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
16
|
+
df.to_parquet(output_path, index=False)
|
|
17
|
+
|
|
18
|
+
print(f"Data saved to {output_path}")
|
|
19
|
+
return df
|
|
20
|
+
|
|
21
|
+
if __name__ == "__main__":
|
|
22
|
+
INPUT_FILE = 'data/sample.csv'
|
|
23
|
+
OUTPUT_FILE = 'data/intermediate.parquet'
|
|
24
|
+
os.makedirs('data', exist_ok=True)
|
|
25
|
+
|
|
26
|
+
# Generate synthetic data if sample.csv doesn't exist
|
|
27
|
+
if not os.path.exists(INPUT_FILE):
|
|
28
|
+
print("sample.csv not found — generating mock dataset...")
|
|
29
|
+
sample_df = pd.DataFrame({
|
|
30
|
+
'feature1': range(100),
|
|
31
|
+
'feature2': [x * 1.5 for x in range(100)],
|
|
32
|
+
'target': [x * 3 + 5 for x in range(100)]
|
|
33
|
+
})
|
|
34
|
+
sample_df.to_csv(INPUT_FILE, index=False)
|
|
35
|
+
|
|
36
|
+
load_data(INPUT_FILE, OUTPUT_FILE)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(input_path: str, output_path: str):
    """Standard-scale the numeric feature columns of a Parquet dataset.

    The 'target' column, if present, is deliberately excluded from scaling.

    Args:
        input_path: Path to the intermediate Parquet file.
        output_path: Destination path for the processed Parquet file.

    Returns:
        The processed DataFrame.

    Raises:
        FileNotFoundError: if *input_path* does not exist.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input parquet file not found at {input_path}")

    print(f"Preprocessing data from {input_path}...")
    df = pd.read_parquet(input_path)

    # Identify numeric columns
    num_cols = df.select_dtypes(include='number').columns.tolist()

    # Never scale the target — the model should learn it in original units.
    if 'target' in num_cols:
        num_cols.remove('target')

    # Only fit the scaler when there is something to scale; StandardScaler
    # raises on an empty feature matrix.
    if num_cols:
        print(f"Scaling columns: {num_cols}")
        scaler = StandardScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])

    # Guard against a bare filename: os.makedirs('') raises.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"Preprocessed data saved to {output_path}")
    return df

if __name__ == "__main__":
    INPUT_FILE = 'data/intermediate.parquet'
    OUTPUT_FILE = 'data/processed.parquet'

    preprocess_data(INPUT_FILE, OUTPUT_FILE)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sklearn.ensemble import RandomForestRegressor
|
|
5
|
+
from sklearn.model_selection import train_test_split
|
|
6
|
+
import joblib
|
|
7
|
+
|
|
8
|
+
def generate_sample_data():
    """Build a reproducible synthetic regression dataset.

    Draws 1000 samples for two Gaussian features and derives the target as
    a noisy linear combination of them. The RNG is seeded, so every call
    returns the identical frame.
    """
    np.random.seed(42)
    n_samples = 1000

    # Draw order matters for reproducibility: feature1, feature2, then noise.
    feature1 = np.random.randn(n_samples) * 10 + 50
    feature2 = np.random.randn(n_samples) * 15 + 75
    noise = np.random.randn(n_samples) * 5

    # Target is a linear blend of both features plus Gaussian noise.
    target = feature1 * 2.5 + feature2 * 1.8 + noise

    return pd.DataFrame({
        'feature1': feature1,
        'feature2': feature2,
        'target': target,
    })
|
|
27
|
+
|
|
28
|
+
def train_and_save_model(output_path='models/saved_model.pkl'):
    """Train a RandomForestRegressor on synthetic data and persist it.

    Args:
        output_path: Destination path for the joblib-serialized model.

    Returns:
        The fitted RandomForestRegressor.
    """

    print("Generating sample dataset...")
    df = generate_sample_data()

    X = df[['feature1', 'feature2']]
    y = df['target']

    print(f"Dataset shape: {X.shape}")

    # Hold out 20% for an honest generalization estimate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Training Random Forest model...")
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Score both splits so over/under-fitting is visible at a glance.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print(f"Train R² Score: {train_score:.4f}")
    print(f"Test R² Score: {test_score:.4f}")

    # Save model; only create a parent directory when the path has one
    # (os.makedirs('') raises FileNotFoundError for a bare filename).
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, output_path)

    print(f"\nModel saved to: {output_path}")
    print(f"Model file size: {os.path.getsize(output_path) / 1024:.2f} KB")

    return model

if __name__ == "__main__":
    train_and_save_model()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pulseflow-mlops
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling
|
|
5
|
+
Author-email: Anil Prasad <anil@ambharii.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/anilatambharii/PulseFlow
|
|
8
|
+
Project-URL: Documentation, https://github.com/anilatambharii/PulseFlow/blob/main/README.md
|
|
9
|
+
Keywords: mlops,machine-learning,fastapi,airflow,mlflow,etl
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
|
|
16
|
+
# Enterprise MLOps Pipeline
|
|
17
|
+
|
|
18
|
+
This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
|
|
19
|
+
|
|
20
|
+
## Architecture Overview
|
|
21
|
+
|
|
22
|
+
Components included:
|
|
23
|
+
- ETL Pipeline: Data ingestion and preprocessing.
|
|
24
|
+
- Training Pipeline: Model training with MLflow tracking.
|
|
25
|
+
- Deployment Service: FastAPI microservice for real-time inference.
|
|
26
|
+
- Airflow Orchestration: Workflow automation for end-to-end pipelines.
|
|
27
|
+
- Dockerized Stack: Easily deployable with Docker Compose.
|
|
28
|
+
|
|
29
|
+
## Run Locally
|
|
30
|
+
|
|
31
|
+
### Prerequisites
|
|
32
|
+
- Python 3.10+
|
|
33
|
+
- Docker & Docker Compose
|
|
34
|
+
|
|
35
|
+
### 1. Install dependencies
|
|
36
|
+
python -m venv .venv
|
|
37
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
38
|
+
pip install -r requirements.txt
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
### 2. Run the pipeline manually
|
|
42
|
+
|
|
43
|
+
python etl/data_ingestion.py
|
|
44
|
+
python etl/data_preprocessing.py
|
|
45
|
+
python training/train_model.py
|
|
46
|
+
uvicorn deployment.app.main:app --reload
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
### 3. Start MLflow and Airflow (optional)
|
|
50
|
+
|
|
51
|
+
mlflow ui &
|
|
52
|
+
airflow db init && airflow webserver -p 8080 & airflow scheduler &
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
### 4. Run full stack with Docker
|
|
56
|
+
docker-compose up --build
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
airflow/dags/mlops_workflow_dag.py
|
|
5
|
+
deployment/app/__init__.py
|
|
6
|
+
deployment/app/main.py
|
|
7
|
+
deployment/app/model_loader.py
|
|
8
|
+
etl/__init__.py
|
|
9
|
+
etl/data_ingestion.py
|
|
10
|
+
etl/data_preprocessing.py
|
|
11
|
+
models/generate_model.py
|
|
12
|
+
pulseflow_mlops.egg-info/PKG-INFO
|
|
13
|
+
pulseflow_mlops.egg-info/SOURCES.txt
|
|
14
|
+
pulseflow_mlops.egg-info/dependency_links.txt
|
|
15
|
+
pulseflow_mlops.egg-info/top_level.txt
|
|
16
|
+
training/__init__.py
|
|
17
|
+
training/evaluate_model.py
|
|
18
|
+
training/train_model.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools==73", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pulseflow-mlops"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{name = "Anil Prasad", email = "anil@ambharii.com"}]
|
|
13
|
+
keywords = ["mlops", "machine-learning", "fastapi", "airflow", "mlflow", "etl"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/anilatambharii/PulseFlow"
|
|
21
|
+
Documentation = "https://github.com/anilatambharii/PulseFlow/blob/main/README.md"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
where = ["."]
|
|
25
|
+
include = ["etl*", "training*", "deployment*", "models*", "airflow*"]
|
|
File without changes
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import os
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import json

def evaluate_model(model_path: str, data_path: str, output_metrics_path: str = 'models/metrics.json'):
    """Evaluate a trained regression model on a Parquet dataset and save metrics.

    Args:
        model_path: Path to the joblib-serialized model.
        data_path: Path to a Parquet file that contains a 'target' column.
        output_metrics_path: Where the JSON metrics report is written.

    Returns:
        Dict with 'mse', 'mae', 'rmse' and 'r2_score' as floats.

    Raises:
        FileNotFoundError: if the model or data file is missing.
        ValueError: if the dataset lacks a 'target' column.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at {model_path}")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data not found at {data_path}")

    print(f"Loading model from {model_path}...")
    model = joblib.load(model_path)

    print(f"Loading evaluation data from {data_path}...")
    df = pd.read_parquet(data_path)

    if 'target' not in df.columns:
        raise ValueError("Target column 'target' not found in dataset")

    # Everything except the target is treated as a feature.
    X = df.drop('target', axis=1)
    y = df['target']

    print(f"Evaluating model on {len(X)} samples...")
    y_pred = model.predict(X)

    # Calculate metrics
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    rmse = mse ** 0.5

    # Cast to plain floats so the dict is JSON-serializable regardless of
    # the numpy scalar types the metric functions return.
    metrics = {
        'mse': float(mse),
        'mae': float(mae),
        'rmse': float(rmse),
        'r2_score': float(r2)
    }

    print("\nEvaluation Results:")
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R2 Score: {r2:.4f}")

    # Save metrics to JSON; only create a parent directory when the path
    # has one (os.makedirs('') raises for a bare filename).
    parent_dir = os.path.dirname(output_metrics_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_metrics_path, 'w') as f:
        json.dump(metrics, f, indent=4)

    print(f"\nMetrics saved to {output_metrics_path}")

    return metrics

if __name__ == "__main__":
    MODEL_PATH = 'models/saved_model.pkl'
    DATA_PATH = 'data/processed.parquet'
    METRICS_OUTPUT_PATH = 'models/metrics.json'

    evaluate_model(MODEL_PATH, DATA_PATH, METRICS_OUTPUT_PATH)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import os
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# MLflow tracking configuration.
# NOTE: these run at import time, so merely importing this module contacts
# (or registers) the experiment on the configured tracking server.
MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI', 'http://localhost:5000')
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("enterprise_mlops_training")

def train_model(data_path: str, model_output_path: str):
    """Train a Random Forest model with MLflow tracking.

    Args:
        data_path: Path to the processed Parquet dataset (must contain a
            'target' column).
        model_output_path: Local path for the joblib-serialized model.

    Returns:
        Tuple of (fitted model, mse, mae, r2) from the held-out test split.

    Raises:
        FileNotFoundError: if *data_path* does not exist.
        ValueError: if the dataset lacks a 'target' column.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Processed data not found at {data_path}")

    print(f"Loading processed data from {data_path}...")
    df = pd.read_parquet(data_path)

    # Separate features and target
    if 'target' not in df.columns:
        raise ValueError("Target column 'target' not found in dataset")

    X = df.drop('target', axis=1)
    y = df['target']

    print(f"Dataset shape: {X.shape}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

    # Start MLflow run
    with mlflow.start_run(run_name="random_forest_training"):
        # Model parameters
        params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'random_state': 42
        }

        print("Training Random Forest model...")
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        # Predictions on the held-out split only.
        y_pred = model.predict(X_test)

        # Metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("Model Performance:")
        print(f"  MSE: {mse:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  R2 Score: {r2:.4f}")

        # Log parameters and metrics to MLflow
        mlflow.log_params(params)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2_score", r2)

        # Log (and register) the model in the MLflow model registry.
        mlflow.sklearn.log_model(
            model,
            artifact_path="model",
            registered_model_name="enterprise_mlops_rf_model"
        )

        # Save model locally; only create a parent directory when the path
        # has one (os.makedirs('') raises for a bare filename).
        parent_dir = os.path.dirname(model_output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(model, model_output_path)
        print(f"Model saved to {model_output_path}")

    return model, mse, mae, r2

if __name__ == "__main__":
    DATA_PATH = 'data/processed.parquet'
    MODEL_OUTPUT_PATH = 'models/saved_model.pkl'

    train_model(DATA_PATH, MODEL_OUTPUT_PATH)
|