pulseflow-mlops 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Anil Prasad
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.1
2
+ Name: pulseflow-mlops
3
+ Version: 0.1.0
4
+ Summary: Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling
5
+ Author-email: Anil Prasad <anil@ambharii.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/anilatambharii/PulseFlow
8
+ Project-URL: Documentation, https://github.com/anilatambharii/PulseFlow/blob/main/README.md
9
+ Keywords: mlops,machine-learning,fastapi,airflow,mlflow,etl
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+
16
+ # Enterprise MLOps Pipeline
17
+
18
+ This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
19
+
20
+ ## Architecture Overview
21
+
22
+ Components included:
23
+ - ETL Pipeline: Data ingestion and preprocessing.
24
+ - Training Pipeline: Model training with MLflow tracking.
25
+ - Deployment Service: FastAPI microservice for real-time inference.
26
+ - Airflow Orchestration: Workflow automation for end-to-end pipelines.
27
+ - Dockerized Stack: Easily deployable with Docker Compose.
28
+
29
+ ## Run Locally
30
+
31
+ ### Prerequisites
32
+ - Python 3.10+
33
+ - Docker & Docker Compose
34
+
35
+ ### 1. Install dependencies
36
+ python -m venv .venv
37
+ source .venv/bin/activate # or .venv\Scripts\activate on Windows
38
+ pip install -r requirements.txt
39
+
40
+
41
+ ### 2. Run the pipeline manually
42
+
43
+ python etl/data_ingestion.py
44
+ python etl/data_preprocessing.py
45
+ python training/train_model.py
46
+ uvicorn deployment.app.main:app --reload
47
+
48
+
49
+ ### 3. Start MLflow and Airflow (optional)
50
+
51
+ mlflow ui &
52
+ airflow db init && airflow webserver -p 8080 & airflow scheduler &
53
+
54
+
55
+ ### 4. Run full stack with Docker
56
+ docker-compose up --build
@@ -0,0 +1,41 @@
1
+ # Enterprise MLOps Pipeline
2
+
3
+ This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
4
+
5
+ ## Architecture Overview
6
+
7
+ Components included:
8
+ - ETL Pipeline: Data ingestion and preprocessing.
9
+ - Training Pipeline: Model training with MLflow tracking.
10
+ - Deployment Service: FastAPI microservice for real-time inference.
11
+ - Airflow Orchestration: Workflow automation for end-to-end pipelines.
12
+ - Dockerized Stack: Easily deployable with Docker Compose.
13
+
14
+ ## Run Locally
15
+
16
+ ### Prerequisites
17
+ - Python 3.10+
18
+ - Docker & Docker Compose
19
+
20
+ ### 1. Install dependencies
21
+ python -m venv .venv
22
+ source .venv/bin/activate # or .venv\Scripts\activate on Windows
23
+ pip install -r requirements.txt
24
+
25
+
26
+ ### 2. Run the pipeline manually
27
+
28
+ python etl/data_ingestion.py
29
+ python etl/data_preprocessing.py
30
+ python training/train_model.py
31
+ uvicorn deployment.app.main:app --reload
32
+
33
+
34
+ ### 3. Start MLflow and Airflow (optional)
35
+
36
+ mlflow ui &
37
+ airflow db init && airflow webserver -p 8080 & airflow scheduler &
38
+
39
+
40
+ ### 4. Run full stack with Docker
41
+ docker-compose up --build
@@ -0,0 +1,56 @@
1
+ from airflow import DAG
2
+ from airflow.operators.python import PythonOperator
3
+ from datetime import datetime, timedelta
4
+ import subprocess
5
+
6
+
7
def run_etl():
    """Execute the ETL stage: data ingestion followed by preprocessing."""
    # check=True propagates a non-zero exit code as CalledProcessError,
    # which marks the Airflow task as failed.
    for script in ('etl/data_ingestion.py', 'etl/data_preprocessing.py'):
        subprocess.run(['python', script], check=True)
10
+
11
+
12
def run_training():
    """Execute the model-training stage as a child process."""
    cmd = ['python', 'training/train_model.py']
    # Fail the task if training exits non-zero.
    subprocess.run(cmd, check=True)
14
+
15
+
16
def run_deployment():
    """Launch the FastAPI inference service under uvicorn.

    NOTE(review): uvicorn blocks until the server exits, so this task never
    completes and will hold an Airflow worker slot for as long as the service
    runs — confirm this is the intended deployment mechanism.
    """
    cmd = [
        'uvicorn', 'deployment.app.main:app',
        '--host', '0.0.0.0',
        '--port', '8000',
    ]
    subprocess.run(cmd, check=True)
18
+
19
+
20
# Shared operator defaults applied to every task in this DAG.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=2),
)

# Daily end-to-end pipeline: ETL -> training -> deployment.
dag = DAG(
    'mlops_pipeline_dag',
    default_args=default_args,
    description='End-to-end Enterprise MLOps workflow',
    schedule_interval='@daily',
    start_date=datetime(2025, 1, 1),
    catchup=False,
)

etl_task = PythonOperator(
    task_id='run_etl', python_callable=run_etl, dag=dag,
)
train_task = PythonOperator(
    task_id='run_training', python_callable=run_training, dag=dag,
)
deploy_task = PythonOperator(
    task_id='run_deployment', python_callable=run_deployment, dag=dag,
)

# Linear dependency chain: each stage starts only after the previous succeeds.
etl_task >> train_task >> deploy_task
File without changes
@@ -0,0 +1,170 @@
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import pandas as pd
from typing import List, Dict
import sys
import os

# Make the package root importable when this module is executed directly
# (e.g. `python deployment/app/main.py`), so `app.model_loader` resolves.
_PACKAGE_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(_PACKAGE_ROOT)

from app.model_loader import ModelLoader

# Application object served by uvicorn.
app = FastAPI(
    title="Enterprise MLOps Prediction API",
    description="Production-grade ML model serving with FastAPI",
    version="1.0.0"
)

# Single shared loader: the model artifact is read from disk once at startup.
model_loader = ModelLoader(model_path="models/saved_model.pkl")
22
+
23
+
24
class PredictionInput(BaseModel):
    """Schema for single prediction request"""

    # Mapping of feature name -> numeric value for one observation.
    features: Dict[str, float]

    class Config:
        # Example request payload surfaced in the OpenAPI docs.
        # NOTE(review): `schema_extra` is the pydantic v1 spelling; pydantic v2
        # renamed it to `json_schema_extra` — confirm the pinned pydantic version.
        schema_extra = {
            "example": {
                "features": {
                    "feature1": 50.0,
                    "feature2": 75.0
                }
            }
        }
37
+
38
+
39
class BatchPredictionInput(BaseModel):
    """Schema for batch prediction request"""

    # One feature dict per record to score; all records share the same keys.
    data: List[Dict[str, float]]

    class Config:
        # Example request payload surfaced in the OpenAPI docs.
        # NOTE(review): `schema_extra` is the pydantic v1 spelling; pydantic v2
        # renamed it to `json_schema_extra` — confirm the pinned pydantic version.
        schema_extra = {
            "example": {
                "data": [
                    {"feature1": 50.0, "feature2": 75.0},
                    {"feature1": 30.0, "feature2": 45.0}
                ]
            }
        }
52
+
53
+
54
class PredictionResponse(BaseModel):
    """Schema for prediction response"""

    # Scalar model output for the single submitted record.
    prediction: float
    # Version string derived from the model artifact (see ModelLoader).
    model_version: str
58
+
59
+
60
class BatchPredictionResponse(BaseModel):
    """Schema for batch prediction response"""

    # One prediction per input record, in submission order.
    predictions: List[float]
    # Version string derived from the model artifact (see ModelLoader).
    model_version: str
    # Number of predictions returned (== len(predictions)).
    count: int
65
+
66
+
67
@app.get("/")
def root():
    """Liveness endpoint: confirms the API process is reachable."""
    payload = {
        "status": "online",
        "service": "Enterprise MLOps Prediction API",
        "version": "1.0.0",
    }
    return payload
75
+
76
+
77
@app.get("/health")
def health_check():
    """Readiness endpoint: reports whether a model is currently in memory."""
    if model_loader.model is not None:
        model_status = "loaded"
    else:
        model_status = "not_loaded"
    return {
        "status": "healthy",
        "model_status": model_status,
        "model_path": model_loader.model_path,
    }
86
+
87
+
88
@app.post("/predict", response_model=PredictionResponse)
def predict(input_data: PredictionInput):
    """Score a single record.

    Args:
        input_data: Feature-name -> value mapping for one observation.

    Returns:
        The model's prediction and the serving model version.

    Raises:
        HTTPException: 500 when the model rejects the input or inference fails.
    """
    try:
        # A one-row frame whose columns are the submitted feature names.
        frame = pd.DataFrame([input_data.features])
        result = model_loader.predict(frame)
        response = PredictionResponse(
            prediction=float(result[0]),
            model_version=model_loader.get_model_version(),
        )
    except Exception as e:
        # Any inference failure is surfaced as a 500 with the cause attached.
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
    return response
113
+
114
+
115
@app.post("/predict/batch", response_model=BatchPredictionResponse)
def predict_batch(input_data: BatchPredictionInput):
    """
    Batch prediction endpoint

    Args:
        input_data: List of feature dictionaries

    Returns:
        List of predictions and model version

    Raises:
        HTTPException: 400 for an empty batch, 500 when inference fails.
    """
    # Fix: an empty `data` list builds a column-less DataFrame and the model
    # raises an opaque error that previously surfaced as a 500 — reject it
    # explicitly as a client error instead.
    if not input_data.data:
        raise HTTPException(status_code=400, detail="Batch prediction requires at least one record")

    try:
        # Convert input to DataFrame (one row per record)
        df = pd.DataFrame(input_data.data)

        # Make predictions
        predictions = model_loader.predict(df)

        return BatchPredictionResponse(
            predictions=[float(p) for p in predictions],
            model_version=model_loader.get_model_version(),
            count=len(predictions)
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")
141
+
142
+
143
@app.get("/model/info")
def model_info():
    """Expose model metadata for monitoring and debugging."""
    # Truthiness (not `is not None`) mirrors the loader's own loaded check.
    loaded = bool(model_loader.model)
    return {
        "model_path": model_loader.model_path,
        "model_version": model_loader.get_model_version(),
        "model_type": str(type(model_loader.model).__name__) if loaded else None,
        "status": "loaded" if loaded else "not_loaded",
    }
152
+
153
+
154
@app.post("/model/reload")
def reload_model():
    """Re-read the model artifact from disk without restarting the service.

    Raises:
        HTTPException: 500 when the reload fails (e.g. artifact missing).
    """
    try:
        model_loader.load_model()
        # Report the version of the artifact that was just loaded.
        return {
            "status": "success",
            "message": "Model reloaded successfully",
            "model_version": model_loader.get_model_version(),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Model reload error: {str(e)}")
166
+
167
+
168
# Development entry point; in production run via `uvicorn deployment.app.main:app`.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,79 @@
1
+ import os
2
+ import joblib
3
+ from datetime import datetime
4
+ from typing import Any
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+
9
class ModelLoader:
    """Loads a pickled model artifact from disk and serves predictions.

    Attributes:
        model_path: Filesystem location of the serialized model.
        model: The deserialized estimator, or None before loading.
        loaded_at: Timestamp of the most recent successful load, or None.
    """

    def __init__(self, model_path: str = "models/saved_model.pkl"):
        """Create the loader and eagerly load the model.

        Args:
            model_path: Path to the saved model file.

        Raises:
            FileNotFoundError: If no file exists at model_path.
        """
        self.model_path = model_path
        self.model = None
        self.loaded_at = None
        # Eager load so a missing artifact fails fast at service startup.
        self.load_model()

    def load_model(self):
        """(Re)load the model artifact from disk."""
        if not os.path.exists(self.model_path):
            raise FileNotFoundError(f"Model file not found at {self.model_path}")

        print(f"Loading model from {self.model_path}...")
        self.model = joblib.load(self.model_path)
        self.loaded_at = datetime.now()
        print(f"Model loaded successfully at {self.loaded_at}")

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """Run inference on a feature frame.

        Args:
            features: DataFrame whose columns match the training features.

        Returns:
            Array of model predictions.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Call load_model() first.")
        return self.model.predict(features)

    def get_model_version(self) -> str:
        """Derive a version string from the artifact's modification time.

        Returns:
            "YYYYMMDD_HHMMSS" of the file's last modification, or "unknown"
            when the file is missing.
        """
        if not os.path.exists(self.model_path):
            return "unknown"
        stamp = os.path.getmtime(self.model_path)
        return datetime.fromtimestamp(stamp).strftime("%Y%m%d_%H%M%S")

    def get_model_info(self) -> dict:
        """Summarize loader state for diagnostics endpoints.

        Returns:
            Dictionary with path, type name, load timestamp, and version.
        """
        return {
            "model_path": self.model_path,
            "model_type": str(type(self.model).__name__) if self.model else None,
            "loaded_at": self.loaded_at.isoformat() if self.loaded_at else None,
            "version": self.get_model_version(),
        }
File without changes
@@ -0,0 +1,36 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load_data(source_path: str, output_path: str):
    """Ingest a CSV file, apply basic cleaning, and persist it as parquet.

    Args:
        source_path: Path to the input CSV file.
        output_path: Destination parquet file.

    Returns:
        The cleaned DataFrame.

    Raises:
        FileNotFoundError: If source_path does not exist.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source data not found at {source_path}")

    print(f"Loading data from {source_path}...")
    df = pd.read_csv(source_path)
    print(f"Loaded {len(df)} records.")

    # Basic cleaning: drop duplicates and NaNs
    df = df.drop_duplicates().dropna()

    # Fix: os.makedirs('') raises FileNotFoundError when output_path has no
    # directory component (a bare filename) — create the dir only if present.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(output_path, index=False)

    print(f"Data saved to {output_path}")
    return df
20
+
21
if __name__ == "__main__":
    INPUT_FILE = 'data/sample.csv'
    OUTPUT_FILE = 'data/intermediate.parquet'
    os.makedirs('data', exist_ok=True)

    # Bootstrap a deterministic mock dataset when no sample input exists.
    if not os.path.exists(INPUT_FILE):
        print("sample.csv not found — generating mock dataset...")
        xs = range(100)
        sample_df = pd.DataFrame({
            'feature1': list(xs),
            'feature2': [1.5 * x for x in xs],
            'target': [3 * x + 5 for x in xs],
        })
        sample_df.to_csv(INPUT_FILE, index=False)

    load_data(INPUT_FILE, OUTPUT_FILE)
@@ -0,0 +1,31 @@
1
+ import os
2
+ import pandas as pd
3
+ from sklearn.preprocessing import StandardScaler
4
+
5
def preprocess_data(input_path: str, output_path: str):
    """Standard-scale the numeric feature columns of a parquet dataset.

    The 'target' column, if present, is left unscaled.

    Args:
        input_path: Path to the ingested parquet file.
        output_path: Destination parquet file for the scaled data.

    Returns:
        The preprocessed DataFrame.

    Raises:
        FileNotFoundError: If input_path does not exist.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input parquet file not found at {input_path}")

    print(f"Preprocessing data from {input_path}...")
    df = pd.read_parquet(input_path)

    # Identify numeric columns; the label must not be scaled.
    num_cols = df.select_dtypes(include='number').columns.tolist()

    if 'target' in num_cols:
        num_cols.remove('target')

    print(f"Scaling columns: {num_cols}")
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    # Fix: os.makedirs('') raises FileNotFoundError when output_path has no
    # directory component (a bare filename) — create the dir only if present.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"Preprocessed data saved to {output_path}")
    return df
26
+
27
if __name__ == "__main__":
    # Consume the ingestion stage's output and emit the training-ready file.
    source = 'data/intermediate.parquet'
    destination = 'data/processed.parquet'
    preprocess_data(source, destination)
@@ -0,0 +1,71 @@
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.ensemble import RandomForestRegressor
5
+ from sklearn.model_selection import train_test_split
6
+ import joblib
7
+
8
def generate_sample_data():
    """Build a deterministic synthetic regression dataset.

    Returns:
        DataFrame with columns 'feature1', 'feature2', and a 'target' that
        is a noisy linear combination of the two features (1000 rows).
    """
    # Fixed seed keeps the dataset reproducible; the three randn() calls
    # below must stay in this order so the draws match run to run.
    np.random.seed(42)
    n_samples = 1000

    f1 = np.random.randn(n_samples) * 10 + 50
    f2 = np.random.randn(n_samples) * 15 + 75
    noise = np.random.randn(n_samples) * 5

    # target = 2.5*f1 + 1.8*f2 + Gaussian noise
    target = f1 * 2.5 + f2 * 1.8 + noise

    return pd.DataFrame({'feature1': f1, 'feature2': f2, 'target': target})
27
+
28
def train_and_save_model(output_path='models/saved_model.pkl'):
    """Train a RandomForest on synthetic data and persist it with joblib.

    Args:
        output_path: Destination file for the serialized model.

    Returns:
        The fitted RandomForestRegressor.
    """
    print("Generating sample dataset...")
    df = generate_sample_data()

    X = df[['feature1', 'feature2']]
    y = df['target']

    print(f"Dataset shape: {X.shape}")

    # Hold out 20% for an honest generalization estimate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Training Random Forest model...")
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Score both splits to surface over/under-fitting.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print(f"Train R² Score: {train_score:.4f}")
    print(f"Test R² Score: {test_score:.4f}")

    # Fix: os.makedirs('') raises FileNotFoundError when output_path has no
    # directory component (a bare filename) — create the dir only if present.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(model, output_path)

    print(f"\nModel saved to: {output_path}")
    print(f"Model file size: {os.path.getsize(output_path) / 1024:.2f} KB")

    return model


if __name__ == "__main__":
    train_and_save_model()
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.1
2
+ Name: pulseflow-mlops
3
+ Version: 0.1.0
4
+ Summary: Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling
5
+ Author-email: Anil Prasad <anil@ambharii.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/anilatambharii/PulseFlow
8
+ Project-URL: Documentation, https://github.com/anilatambharii/PulseFlow/blob/main/README.md
9
+ Keywords: mlops,machine-learning,fastapi,airflow,mlflow,etl
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+
16
+ # Enterprise MLOps Pipeline
17
+
18
+ This repository provides a production-ready MLOps template for building and deploying machine learning pipelines in enterprise environments.
19
+
20
+ ## Architecture Overview
21
+
22
+ Components included:
23
+ - ETL Pipeline: Data ingestion and preprocessing.
24
+ - Training Pipeline: Model training with MLflow tracking.
25
+ - Deployment Service: FastAPI microservice for real-time inference.
26
+ - Airflow Orchestration: Workflow automation for end-to-end pipelines.
27
+ - Dockerized Stack: Easily deployable with Docker Compose.
28
+
29
+ ## Run Locally
30
+
31
+ ### Prerequisites
32
+ - Python 3.10+
33
+ - Docker & Docker Compose
34
+
35
+ ### 1. Install dependencies
36
+ python -m venv .venv
37
+ source .venv/bin/activate # or .venv\Scripts\activate on Windows
38
+ pip install -r requirements.txt
39
+
40
+
41
+ ### 2. Run the pipeline manually
42
+
43
+ python etl/data_ingestion.py
44
+ python etl/data_preprocessing.py
45
+ python training/train_model.py
46
+ uvicorn deployment.app.main:app --reload
47
+
48
+
49
+ ### 3. Start MLflow and Airflow (optional)
50
+
51
+ mlflow ui &
52
+ airflow db init && airflow webserver -p 8080 & airflow scheduler &
53
+
54
+
55
+ ### 4. Run full stack with Docker
56
+ docker-compose up --build
@@ -0,0 +1,18 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ airflow/dags/mlops_workflow_dag.py
5
+ deployment/app/__init__.py
6
+ deployment/app/main.py
7
+ deployment/app/model_loader.py
8
+ etl/__init__.py
9
+ etl/data_ingestion.py
10
+ etl/data_preprocessing.py
11
+ models/generate_model.py
12
+ pulseflow_mlops.egg-info/PKG-INFO
13
+ pulseflow_mlops.egg-info/SOURCES.txt
14
+ pulseflow_mlops.egg-info/dependency_links.txt
15
+ pulseflow_mlops.egg-info/top_level.txt
16
+ training/__init__.py
17
+ training/evaluate_model.py
18
+ training/train_model.py
@@ -0,0 +1,5 @@
1
+ airflow
2
+ deployment
3
+ etl
4
+ models
5
+ training
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools==73", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pulseflow-mlops"
7
+ version = "0.1.0"
8
+ description = "Production-grade open source MLOps pipeline for enterprise data engineering and predictive modeling"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [{name = "Anil Prasad", email = "anil@ambharii.com"}]
13
+ keywords = ["mlops", "machine-learning", "fastapi", "airflow", "mlflow", "etl"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3.10",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/anilatambharii/PulseFlow"
21
+ Documentation = "https://github.com/anilatambharii/PulseFlow/blob/main/README.md"
22
+
23
+ [tool.setuptools.packages.find]
24
+ where = ["."]
25
+ include = ["etl*", "training*", "deployment*", "models*", "airflow*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,65 @@
1
+ import os
2
+ import pandas as pd
3
+ import joblib
4
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
5
+ import json
6
+
7
def evaluate_model(model_path: str, data_path: str, output_metrics_path: str = 'models/metrics.json'):
    """Evaluate a trained regression model and save metrics as JSON.

    Args:
        model_path: Path to the joblib-serialized model.
        data_path: Path to a parquet file containing features and a 'target' column.
        output_metrics_path: Destination JSON file for the computed metrics.

    Returns:
        Dict with 'mse', 'mae', 'rmse', and 'r2_score'.

    Raises:
        FileNotFoundError: If the model or data file does not exist.
        ValueError: If the dataset lacks a 'target' column.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at {model_path}")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data not found at {data_path}")

    print(f"Loading model from {model_path}...")
    model = joblib.load(model_path)

    print(f"Loading evaluation data from {data_path}...")
    df = pd.read_parquet(data_path)

    if 'target' not in df.columns:
        raise ValueError("Target column 'target' not found in dataset")

    X = df.drop('target', axis=1)
    y = df['target']

    print(f"Evaluating model on {len(X)} samples...")
    y_pred = model.predict(X)

    # Calculate regression metrics
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    rmse = mse ** 0.5

    metrics = {
        'mse': float(mse),
        'mae': float(mae),
        'rmse': float(rmse),
        'r2_score': float(r2)
    }

    print("\nEvaluation Results:")
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R2 Score: {r2:.4f}")

    # Fix: os.makedirs('') raises FileNotFoundError when output_metrics_path
    # has no directory component — create the dir only if present.
    out_dir = os.path.dirname(output_metrics_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_metrics_path, 'w') as f:
        json.dump(metrics, f, indent=4)

    print(f"\nMetrics saved to {output_metrics_path}")

    return metrics


if __name__ == "__main__":
    MODEL_PATH = 'models/saved_model.pkl'
    DATA_PATH = 'data/processed.parquet'
    METRICS_OUTPUT_PATH = 'models/metrics.json'

    evaluate_model(MODEL_PATH, DATA_PATH, METRICS_OUTPUT_PATH)
@@ -0,0 +1,92 @@
1
+ import os
2
+ import pandas as pd
3
+ import mlflow
4
+ import mlflow.sklearn
5
+ from sklearn.ensemble import RandomForestRegressor
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
8
+ import joblib
9
+
10
# MLflow tracking configuration
# Tracking server URI is overridable via the MLFLOW_TRACKING_URI env var.
# NOTE(review): these calls run at import time, so merely importing this
# module configures MLflow (set_experiment may contact the tracking server) —
# confirm this side effect is intended before reusing the module as a library.
MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI', 'http://localhost:5000')
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("enterprise_mlops_training")
+ mlflow.set_experiment("enterprise_mlops_training")
14
+
15
def train_model(data_path: str, model_output_path: str):
    """Train a Random Forest regressor with MLflow tracking.

    Args:
        data_path: Path to the preprocessed parquet dataset (must contain 'target').
        model_output_path: Local destination for the joblib-serialized model.

    Returns:
        Tuple of (fitted model, mse, mae, r2).

    Raises:
        FileNotFoundError: If data_path does not exist.
        ValueError: If the dataset lacks a 'target' column.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Processed data not found at {data_path}")

    print(f"Loading processed data from {data_path}...")
    df = pd.read_parquet(data_path)

    # Separate features and target
    if 'target' not in df.columns:
        raise ValueError("Target column 'target' not found in dataset")

    X = df.drop('target', axis=1)
    y = df['target']

    print(f"Dataset shape: {X.shape}")

    # Hold out 20% for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

    # All params, metrics, and the model artifact are logged under this run.
    with mlflow.start_run(run_name="random_forest_training"):
        # Model hyperparameters (also logged to MLflow below).
        params = {
            'n_estimators': 100,
            'max_depth': 10,
            'min_samples_split': 5,
            'random_state': 42
        }

        print("Training Random Forest model...")
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        # Predictions on the held-out split
        y_pred = model.predict(X_test)

        # Regression metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Plain string: the original used an f-string with no placeholders.
        print("Model Performance:")
        print(f"  MSE: {mse:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  R2 Score: {r2:.4f}")

        # Log parameters and metrics to MLflow
        mlflow.log_params(params)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2_score", r2)

        # Log and register the model with the MLflow model registry.
        mlflow.sklearn.log_model(
            model,
            artifact_path="model",
            registered_model_name="enterprise_mlops_rf_model"
        )

        # Fix: os.makedirs('') raises FileNotFoundError when model_output_path
        # has no directory component — create the dir only if present.
        out_dir = os.path.dirname(model_output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        joblib.dump(model, model_output_path)
        print(f"Model saved to {model_output_path}")

    return model, mse, mae, r2


if __name__ == "__main__":
    DATA_PATH = 'data/processed.parquet'
    MODEL_OUTPUT_PATH = 'models/saved_model.pkl'

    train_model(DATA_PATH, MODEL_OUTPUT_PATH)