modelscout-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. modelscout_ai-0.1.0/PKG-INFO +290 -0
  2. modelscout_ai-0.1.0/README.md +274 -0
  3. modelscout_ai-0.1.0/agent/__init__.py +0 -0
  4. modelscout_ai-0.1.0/agent/cleaner.py +138 -0
  5. modelscout_ai-0.1.0/agent/config.py +100 -0
  6. modelscout_ai-0.1.0/agent/cv_cleaner.py +296 -0
  7. modelscout_ai-0.1.0/agent/cv_detector.py +319 -0
  8. modelscout_ai-0.1.0/agent/cv_evaluator.py +230 -0
  9. modelscout_ai-0.1.0/agent/cv_reporter.py +264 -0
  10. modelscout_ai-0.1.0/agent/cv_trainer.py +351 -0
  11. modelscout_ai-0.1.0/agent/data_analyzer.py +110 -0
  12. modelscout_ai-0.1.0/agent/evaluator.py +314 -0
  13. modelscout_ai-0.1.0/agent/model_selector.py +168 -0
  14. modelscout_ai-0.1.0/agent/nlp_cleaner.py +255 -0
  15. modelscout_ai-0.1.0/agent/nlp_detector.py +225 -0
  16. modelscout_ai-0.1.0/agent/nlp_evaluator.py +284 -0
  17. modelscout_ai-0.1.0/agent/nlp_reporter.py +330 -0
  18. modelscout_ai-0.1.0/agent/nlp_trainer.py +391 -0
  19. modelscout_ai-0.1.0/agent/orchestrator.py +145 -0
  20. modelscout_ai-0.1.0/agent/reporter.py +180 -0
  21. modelscout_ai-0.1.0/agent/router.py +267 -0
  22. modelscout_ai-0.1.0/agent/trainer.py +310 -0
  23. modelscout_ai-0.1.0/agent/ts_cleaner.py +306 -0
  24. modelscout_ai-0.1.0/agent/ts_detector.py +270 -0
  25. modelscout_ai-0.1.0/agent/ts_evaluator.py +330 -0
  26. modelscout_ai-0.1.0/agent/ts_reporter.py +283 -0
  27. modelscout_ai-0.1.0/agent/ts_trainer.py +552 -0
  28. modelscout_ai-0.1.0/modelscout_ai.egg-info/PKG-INFO +290 -0
  29. modelscout_ai-0.1.0/modelscout_ai.egg-info/SOURCES.txt +31 -0
  30. modelscout_ai-0.1.0/modelscout_ai.egg-info/dependency_links.txt +1 -0
  31. modelscout_ai-0.1.0/modelscout_ai.egg-info/top_level.txt +1 -0
  32. modelscout_ai-0.1.0/pyproject.toml +29 -0
  33. modelscout_ai-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,290 @@
1
+ Metadata-Version: 2.4
2
+ Name: modelscout-ai
3
+ Version: 0.1.0
4
+ Summary: Autonomous ML agent that finds the best model for any dataset automatically
5
+ Author-email: Iram Fatima <iramfatima749@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Iramfatima12/modelscout
8
+ Keywords: machine learning,automl,model selection,autonomous,deep learning
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Requires-Python: >=3.8
15
+ Description-Content-Type: text/markdown
16
+
17
+ # ModelScout 🤖
18
+
19
+ **Intelligent ML Model Recommendation System**
20
+
21
+ An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
22
+
23
+ ## 🎯 Features
24
+
25
+ - **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
26
+ - **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
27
+ - **Comprehensive Analysis**: Provides detailed dataset analysis and insights
28
+ - **Multiple Formats**: Generates reports in text, JSON, and table formats
29
+ - **REST API**: Flask-based REST API for easy integration
30
+ - **Support for All ML Tasks**: Classification, Regression, Time-series, and more
31
+
32
+ ## 📋 Project Structure
33
+
34
+ ```
35
+ ModelScout/
36
+ ├── agent/ # Core ML engine
37
+ │ ├── data_analyzer.py # Dataset analysis module
38
+ │ ├── model_selector.py # Model recommendation using Auto-sklearn
39
+ │ ├── reporter.py # Report generation
40
+ │ ├── orchestrator.py # Main pipeline orchestrator
41
+ │ └── __init__.py
42
+ ├── api/ # REST API
43
+ │ ├── main.py # Flask API endpoints
44
+ │ └── __init__.py
45
+ ├── data/ # Sample datasets
46
+ ├── models/ # Trained models storage
47
+ ├── outputs/ # Generated reports
48
+ ├── requirements.txt # Python dependencies
49
+ ├── demo.py # Demo script with examples
50
+ └── README.md
51
+ ```
52
+
53
+ ## 🚀 Quick Start
54
+
55
+ ### 1. Installation
56
+
57
+ ```bash
58
+ # Clone or navigate to the project directory
59
+ cd ModelScout
60
+
61
+ # Create virtual environment
62
+ python -m venv venv
63
+ source venv/bin/activate # On Windows: venv\Scripts\activate
64
+
65
+ # Install dependencies
66
+ pip install -r requirements.txt
67
+ ```
68
+
69
+ ### 2. Basic Usage
70
+
71
+ ```python
72
+ from agent.orchestrator import ModelScout
73
+
74
+ # Initialize
75
+ scout = ModelScout(auto_train_time=300)
76
+
77
+ # Run complete pipeline
78
+ result = scout.run_full_pipeline(
79
+ data_path='your_data.csv',
80
+ target='target_column',
81
+ report_path='outputs/report.txt'
82
+ )
83
+
84
+ # Access results
85
+ print(result['recommendations']['best_model_name'])
86
+ print(result['recommendations']['test_score'])
87
+ print(result['report'])
88
+ ```
89
+
90
+ ### 3. Step-by-Step Usage
91
+
92
+ ```python
93
+ from agent.orchestrator import ModelScout
94
+ import pandas as pd
95
+
96
+ scout = ModelScout()
97
+
98
+ # Load data
99
+ df = scout.load_data('data.csv')
100
+
101
+ # Analyze data
102
+ analysis = scout.analyze_data(df, target='label')
103
+ print(f"Problem Type: {analysis['target_analysis']['type']}")
104
+
105
+ # Get recommendations
106
+ recommendations = scout.recommend_models(df, 'label')
107
+ print(f"Best Model: {recommendations['best_model_name']}")
108
+ print(f"Test Score: {recommendations['test_score']}")
109
+
110
+ # Generate report
111
+ report = scout.generate_report(output_format='text', output_path='report.txt')
112
+ ```
113
+
114
+ ## 🔧 API Endpoints
115
+
116
+ ### Health Check
117
+ ```
118
+ GET /health
119
+ ```
120
+
121
+ ### Analyze Dataset
122
+ ```
123
+ POST /api/analyze
124
+ Content-Type: application/json
125
+
126
+ {
127
+ "file_path": "path/to/data.csv",
128
+ "target": "target_column"
129
+ }
130
+ ```
131
+
132
+ ### Get Recommendations
133
+ ```
134
+ POST /api/recommend
135
+ Content-Type: application/json
136
+
137
+ {
138
+ "file_path": "path/to/data.csv",
139
+ "target": "target_column",
140
+ "time_limit": 300
141
+ }
142
+ ```
143
+
144
+ ### Generate Report
145
+ ```
146
+ POST /api/report
147
+ Content-Type: application/json
148
+
149
+ {
150
+ "file_path": "path/to/data.csv",
151
+ "target": "target_column",
152
+ "format": "text"
153
+ }
154
+ ```
155
+
156
+ ### Full Pipeline
157
+ ```
158
+ POST /api/pipeline
159
+ Content-Type: application/json
160
+
161
+ {
162
+ "file_path": "path/to/data.csv",
163
+ "target": "target_column",
164
+ "time_limit": 300
165
+ }
166
+ ```
167
+
168
+ ## 🎮 Run Demo
169
+
170
+ ```bash
171
+ python demo.py
172
+ ```
173
+
174
+ The demo script:
175
+ 1. Creates sample datasets (Iris, Breast Cancer, Regression)
176
+ 2. Runs ModelScout on each dataset
177
+ 3. Generates comparison reports
178
+ 4. Demonstrates both classification and regression
179
+
180
+ ## 📊 What ModelScout Analyzes
181
+
182
+ ### Data Characteristics
183
+ - Dataset size and shape
184
+ - Missing values and data quality
185
+ - Feature types and counts
186
+ - Memory usage
187
+
188
+ ### Target Variable
189
+ - Problem type (Classification/Regression)
190
+ - Class distribution (for classification)
191
+ - Value range (for regression)
192
+ - Class imbalance ratio
193
+
194
+ ### Feature Statistics
195
+ - Numeric: mean, std, min, max, missing count
196
+ - Categorical: unique values, missing count
197
+
198
+ ## 🤖 How It Works
199
+
200
+ 1. **Data Loading**: Supports CSV, Excel, JSON formats
201
+ 2. **Analysis**: Comprehensive dataset profiling
202
+ 3. **Problem Detection**: Auto-detects ML task type
203
+ 4. **Model Search**: Auto-sklearn searches optimal models
204
+ 5. **Evaluation**: Train/test split and performance metrics
205
+ 6. **Reporting**: Generates detailed recommendations
206
+
207
+ ## 📦 Dependencies
208
+
209
+ - **pandas**: Data manipulation
210
+ - **scikit-learn**: ML algorithms
211
+ - **auto-sklearn**: Automated ML model selection
212
+ - **numpy**: Numerical computing
213
+ - **matplotlib/seaborn**: Visualization
214
+ - **flask**: REST API
215
+ - **xgboost, lightgbm, catboost**: Advanced models
216
+ - **imbalanced-learn**: Class imbalance handling
217
+
218
+ ## 🔍 Example Output
219
+
220
+ ```
221
+ ======================================================================
222
+ ___ ___ _ _ ____ ___ _ _ ___
223
+ | \/ | | | | | / ___ \/ _ \| | | |_ |
224
+ | . . | ___ __| | ___| | / / \/ /_\ \ | | | / /
225
+ | |\/| |/ _ \ / _` |/ _ \ | \ \ | _ | | | |/ /
226
+ | | | | (_) || (_| | __/ | \ \__| | | | |_| / /
227
+ |_| |_|\___/ \__,_|\___|_| \___/_| |_|\___/___/
228
+
229
+ ======================================================================
230
+
231
+ DATA OVERVIEW
232
+ ======================================================================
233
+ Dataset Shape: (150, 5) (rows, columns)
234
+ Memory Usage: 0.00 MB
235
+ Missing Values: 0 (0.00%)
236
+ Numeric Features: 4
237
+ Categorical Features: 0
238
+
239
+ TARGET VARIABLE ANALYSIS
240
+ ----------------------------------------------------------------------
241
+ Problem Type: CLASSIFICATION
242
+ Unique Values: 3
243
+ Missing Values: 0
244
+ Class Imbalance Ratio: 1.00:1
245
+ Class Distribution:
246
+ 0: 50 (33.3%)
247
+ 1: 50 (33.3%)
248
+ 2: 50 (33.3%)
249
+
250
+ MODEL RECOMMENDATIONS
251
+ ======================================================================
252
+ Best Model: RandomForestClassifier
253
+ Problem Type: CLASSIFICATION
254
+ Train Score: 1.0000
255
+ Test Score: 0.9333
256
+ Data Shape Used: (150, 4)
257
+ Number of Classes: 3
258
+
259
+ ======================================================================
260
+ ```
261
+
262
+ ## 🛠️ Configuration
263
+
264
+ You can customize behavior by modifying parameters:
265
+
266
+ ```python
267
+ scout = ModelScout(
268
+ auto_train_time=600 # Increase for more thorough search (seconds)
269
+ )
270
+ ```
271
+
272
+ ## ๐Ÿ“ License
273
+
274
+ This project is released under the MIT License and is intended for educational and portfolio purposes.
275
+
276
+ ## ๐Ÿค Contributing
277
+
278
+ Feel free to extend ModelScout with:
279
+ - Additional models
280
+ - More data preprocessing options
281
+ - Visualization enhancements
282
+ - Performance optimizations
283
+
284
+ ## 📞 Support
285
+
286
+ For issues or questions, refer to the demo.py script for usage examples.
287
+
288
+ ---
289
+
290
+ **Happy Model Scouting! 🎯**
@@ -0,0 +1,274 @@
1
+ # ModelScout 🤖
2
+
3
+ **Intelligent ML Model Recommendation System**
4
+
5
+ An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
6
+
7
+ ## 🎯 Features
8
+
9
+ - **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
10
+ - **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
11
+ - **Comprehensive Analysis**: Provides detailed dataset analysis and insights
12
+ - **Multiple Formats**: Generates reports in text, JSON, and table formats
13
+ - **REST API**: Flask-based REST API for easy integration
14
+ - **Support for All ML Tasks**: Classification, Regression, Time-series, and more
15
+
16
+ ## 📋 Project Structure
17
+
18
+ ```
19
+ ModelScout/
20
+ ├── agent/ # Core ML engine
21
+ │ ├── data_analyzer.py # Dataset analysis module
22
+ │ ├── model_selector.py # Model recommendation using Auto-sklearn
23
+ │ ├── reporter.py # Report generation
24
+ │ ├── orchestrator.py # Main pipeline orchestrator
25
+ │ └── __init__.py
26
+ ├── api/ # REST API
27
+ │ ├── main.py # Flask API endpoints
28
+ │ └── __init__.py
29
+ ├── data/ # Sample datasets
30
+ ├── models/ # Trained models storage
31
+ ├── outputs/ # Generated reports
32
+ ├── requirements.txt # Python dependencies
33
+ ├── demo.py # Demo script with examples
34
+ └── README.md
35
+ ```
36
+
37
+ ## 🚀 Quick Start
38
+
39
+ ### 1. Installation
40
+
41
+ ```bash
42
+ # Clone or navigate to the project directory
43
+ cd ModelScout
44
+
45
+ # Create virtual environment
46
+ python -m venv venv
47
+ source venv/bin/activate # On Windows: venv\Scripts\activate
48
+
49
+ # Install dependencies
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ ### 2. Basic Usage
54
+
55
+ ```python
56
+ from agent.orchestrator import ModelScout
57
+
58
+ # Initialize
59
+ scout = ModelScout(auto_train_time=300)
60
+
61
+ # Run complete pipeline
62
+ result = scout.run_full_pipeline(
63
+ data_path='your_data.csv',
64
+ target='target_column',
65
+ report_path='outputs/report.txt'
66
+ )
67
+
68
+ # Access results
69
+ print(result['recommendations']['best_model_name'])
70
+ print(result['recommendations']['test_score'])
71
+ print(result['report'])
72
+ ```
73
+
74
+ ### 3. Step-by-Step Usage
75
+
76
+ ```python
77
+ from agent.orchestrator import ModelScout
78
+ import pandas as pd
79
+
80
+ scout = ModelScout()
81
+
82
+ # Load data
83
+ df = scout.load_data('data.csv')
84
+
85
+ # Analyze data
86
+ analysis = scout.analyze_data(df, target='label')
87
+ print(f"Problem Type: {analysis['target_analysis']['type']}")
88
+
89
+ # Get recommendations
90
+ recommendations = scout.recommend_models(df, 'label')
91
+ print(f"Best Model: {recommendations['best_model_name']}")
92
+ print(f"Test Score: {recommendations['test_score']}")
93
+
94
+ # Generate report
95
+ report = scout.generate_report(output_format='text', output_path='report.txt')
96
+ ```
97
+
98
+ ## 🔧 API Endpoints
99
+
100
+ ### Health Check
101
+ ```
102
+ GET /health
103
+ ```
104
+
105
+ ### Analyze Dataset
106
+ ```
107
+ POST /api/analyze
108
+ Content-Type: application/json
109
+
110
+ {
111
+ "file_path": "path/to/data.csv",
112
+ "target": "target_column"
113
+ }
114
+ ```
115
+
116
+ ### Get Recommendations
117
+ ```
118
+ POST /api/recommend
119
+ Content-Type: application/json
120
+
121
+ {
122
+ "file_path": "path/to/data.csv",
123
+ "target": "target_column",
124
+ "time_limit": 300
125
+ }
126
+ ```
127
+
128
+ ### Generate Report
129
+ ```
130
+ POST /api/report
131
+ Content-Type: application/json
132
+
133
+ {
134
+ "file_path": "path/to/data.csv",
135
+ "target": "target_column",
136
+ "format": "text"
137
+ }
138
+ ```
139
+
140
+ ### Full Pipeline
141
+ ```
142
+ POST /api/pipeline
143
+ Content-Type: application/json
144
+
145
+ {
146
+ "file_path": "path/to/data.csv",
147
+ "target": "target_column",
148
+ "time_limit": 300
149
+ }
150
+ ```
151
+
152
+ ## 🎮 Run Demo
153
+
154
+ ```bash
155
+ python demo.py
156
+ ```
157
+
158
+ The demo script:
159
+ 1. Creates sample datasets (Iris, Breast Cancer, Regression)
160
+ 2. Runs ModelScout on each dataset
161
+ 3. Generates comparison reports
162
+ 4. Demonstrates both classification and regression
163
+
164
+ ## 📊 What ModelScout Analyzes
165
+
166
+ ### Data Characteristics
167
+ - Dataset size and shape
168
+ - Missing values and data quality
169
+ - Feature types and counts
170
+ - Memory usage
171
+
172
+ ### Target Variable
173
+ - Problem type (Classification/Regression)
174
+ - Class distribution (for classification)
175
+ - Value range (for regression)
176
+ - Class imbalance ratio
177
+
178
+ ### Feature Statistics
179
+ - Numeric: mean, std, min, max, missing count
180
+ - Categorical: unique values, missing count
181
+
182
+ ## 🤖 How It Works
183
+
184
+ 1. **Data Loading**: Supports CSV, Excel, JSON formats
185
+ 2. **Analysis**: Comprehensive dataset profiling
186
+ 3. **Problem Detection**: Auto-detects ML task type
187
+ 4. **Model Search**: Auto-sklearn searches optimal models
188
+ 5. **Evaluation**: Train/test split and performance metrics
189
+ 6. **Reporting**: Generates detailed recommendations
190
+
191
+ ## 📦 Dependencies
192
+
193
+ - **pandas**: Data manipulation
194
+ - **scikit-learn**: ML algorithms
195
+ - **auto-sklearn**: Automated ML model selection
196
+ - **numpy**: Numerical computing
197
+ - **matplotlib/seaborn**: Visualization
198
+ - **flask**: REST API
199
+ - **xgboost, lightgbm, catboost**: Advanced models
200
+ - **imbalanced-learn**: Class imbalance handling
201
+
202
+ ## 🔍 Example Output
203
+
204
+ ```
205
+ ======================================================================
206
+ ___ ___ _ _ ____ ___ _ _ ___
207
+ | \/ | | | | | / ___ \/ _ \| | | |_ |
208
+ | . . | ___ __| | ___| | / / \/ /_\ \ | | | / /
209
+ | |\/| |/ _ \ / _` |/ _ \ | \ \ | _ | | | |/ /
210
+ | | | | (_) || (_| | __/ | \ \__| | | | |_| / /
211
+ |_| |_|\___/ \__,_|\___|_| \___/_| |_|\___/___/
212
+
213
+ ======================================================================
214
+
215
+ DATA OVERVIEW
216
+ ======================================================================
217
+ Dataset Shape: (150, 5) (rows, columns)
218
+ Memory Usage: 0.00 MB
219
+ Missing Values: 0 (0.00%)
220
+ Numeric Features: 4
221
+ Categorical Features: 0
222
+
223
+ TARGET VARIABLE ANALYSIS
224
+ ----------------------------------------------------------------------
225
+ Problem Type: CLASSIFICATION
226
+ Unique Values: 3
227
+ Missing Values: 0
228
+ Class Imbalance Ratio: 1.00:1
229
+ Class Distribution:
230
+ 0: 50 (33.3%)
231
+ 1: 50 (33.3%)
232
+ 2: 50 (33.3%)
233
+
234
+ MODEL RECOMMENDATIONS
235
+ ======================================================================
236
+ Best Model: RandomForestClassifier
237
+ Problem Type: CLASSIFICATION
238
+ Train Score: 1.0000
239
+ Test Score: 0.9333
240
+ Data Shape Used: (150, 4)
241
+ Number of Classes: 3
242
+
243
+ ======================================================================
244
+ ```
245
+
246
+ ## 🛠️ Configuration
247
+
248
+ You can customize behavior by modifying parameters:
249
+
250
+ ```python
251
+ scout = ModelScout(
252
+ auto_train_time=600 # Increase for more thorough search (seconds)
253
+ )
254
+ ```
255
+
256
+ ## ๐Ÿ“ License
257
+
258
+ This project is released under the MIT License and is intended for educational and portfolio purposes.
259
+
260
+ ## ๐Ÿค Contributing
261
+
262
+ Feel free to extend ModelScout with:
263
+ - Additional models
264
+ - More data preprocessing options
265
+ - Visualization enhancements
266
+ - Performance optimizations
267
+
268
+ ## 📞 Support
269
+
270
+ For issues or questions, refer to the demo.py script for usage examples.
271
+
272
+ ---
273
+
274
+ **Happy Model Scouting! 🎯**
File without changes
@@ -0,0 +1,138 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
4
+ from sklearn.impute import SimpleImputer
5
+
6
+
7
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without duplicated rows and report how many were dropped.

    The frame passed in is not modified: the result comes from
    ``DataFrame.drop_duplicates``, and no index reset is performed here.
    """
    deduplicated = df.drop_duplicates()
    removed = len(df) - len(deduplicated)

    # One status line either way, so the console log always shows the step ran.
    message = (
        f" โœ… Removed {removed} duplicate rows"
        if removed > 0
        else " โœ… No duplicates found"
    )
    print(message)

    return deduplicated
19
+
20
+
21
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing cells column by column and return the frame.

    Columns whose dtype is ``int64`` or ``float64`` are filled with the
    column mean; every other column is filled with its most frequent value.
    A column with no observed values at all has neither a usable mean nor a
    mode, so it is reported and left unchanged instead of raising.

    Note: columns are reassigned on the frame that was passed in, and that
    same object is returned.
    """
    print("\n Handling missing values...")

    total_rows = len(df)

    for col in df.columns:
        missing_count = df[col].isnull().sum()
        if missing_count == 0:
            continue

        if missing_count == total_rows:
            # mean() would be NaN and mode() would be empty here; indexing
            # the empty mode with [0] is what previously crashed the run.
            print(
                f" {col} -> skipped: all {missing_count} values are missing, "
                "no fill value can be derived"
            )
            continue

        if df[col].dtype in ["int64", "float64"]:
            # Fill numbers with column average
            fill_value = df[col].mean()
            df[col] = df[col].fillna(fill_value)
            print(f"{col} โ†’ filled {missing_count} empty cells with mean ({fill_value:.2f})")
        else:
            # Fill categories with most common value
            fill_value = df[col].mode()[0]
            df[col] = df[col].fillna(fill_value)
            print(f" โœ… {col} โ†’ filled {missing_count} empty cells with '{fill_value}'")

    return df
41
+
42
+
43
def _is_text_column(column: pd.Series) -> bool:
    """Return True when ``column`` holds text that needs label encoding."""
    # Checking only for "object" misses columns stored with pandas' dedicated
    # string dtype, which would then be passed on unencoded.
    return column.dtype == "object" or pd.api.types.is_string_dtype(column.dtype)


def encode_categorical_columns(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> tuple:
    """Label-encode text columns and return ``(df, encoders)``.

    Each text column among ``feature_columns`` is replaced by integer codes
    from its own ``LabelEncoder`` (values are cast to ``str`` first). The
    target column gets the same treatment when it is text. Non-text columns
    are left untouched.

    Columns are reassigned on the frame that was passed in.

    Returns:
        A tuple of the frame and a dict mapping each encoded column name to
        the fitted ``LabelEncoder`` used for it.
    """
    print("\n Encoding categorical columns...")

    encoders = {}

    # Encode feature columns
    for col in feature_columns:
        if _is_text_column(df[col]):
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le
            print(f" โœ… {col} โ†’ converted to numbers")

    # Encode target column if it is text. If it was also listed as a feature
    # it is already numeric by now and is not encoded a second time.
    if _is_text_column(df[target_column]):
        le = LabelEncoder()
        df[target_column] = le.fit_transform(df[target_column].astype(str))
        encoders[target_column] = le
        print(f" โœ… Target '{target_column}' โ†’ converted to numbers")

    return df, encoders
72
+
73
+
74
def scale_numeric_columns(
    df: pd.DataFrame,
    feature_columns: list
) -> tuple:
    """Standardize the ``int64``/``float64`` feature columns of ``df``.

    The selected columns are overwritten with the output of a freshly fitted
    ``StandardScaler``. Columns of any other dtype, and columns not listed
    in ``feature_columns``, are not touched.

    Returns:
        A tuple ``(df, scaler)`` where ``scaler`` is the fitted
        ``StandardScaler``, or ``None`` when no column qualified.
    """
    print("\n Scaling numeric columns...")

    # Only scale numeric feature columns
    numeric_cols = []
    for col in feature_columns:
        if df[col].dtype in ["int64", "float64"]:
            numeric_cols.append(col)

    if not numeric_cols:
        print(" โœ… No numeric columns to scale")
        return df, None

    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print(f" โœ… Scaled {len(numeric_cols)} numeric columns")
    return df, scaler
96
+
97
+
98
def run_cleaner(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> dict:
    """Run the four cleaning stages in order and package the results.

    Stages: drop duplicate rows, fill missing values, label-encode text
    columns, then scale numeric feature columns. Progress is printed to
    stdout before each stage.

    Returns:
        A dict with keys ``"X"`` (the feature columns), ``"y"`` (the target
        column), ``"encoders"``, ``"scaler"`` and ``"cleaned_df"`` (the full
        frame after all stages).
    """
    print("\n๐Ÿงน ModelScout: Starting data cleaning...\n")

    # Step 1
    print("Step 1: Removing duplicates...")
    cleaned = remove_duplicates(df)

    # Step 2
    print("\nStep 2: Handling missing values...")
    cleaned = handle_missing_values(cleaned)

    # Step 3
    print("\nStep 3: Encoding categorical columns...")
    cleaned, encoders = encode_categorical_columns(
        cleaned, target_column, feature_columns
    )

    # Step 4
    print("\nStep 4: Scaling numeric columns...")
    cleaned, scaler = scale_numeric_columns(cleaned, feature_columns)

    # Separate features and target
    features = cleaned[feature_columns]
    target = cleaned[target_column]
    n_rows, n_features = features.shape

    print("\nโœ… Data cleaning complete!")
    print(f" Final shape: {n_rows} rows, {n_features} features\n")

    result = {
        "X": features,
        "y": target,
        "encoders": encoders,
        "scaler": scaler,
        "cleaned_df": cleaned,
    }
    return result