modelscout-ai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelscout_ai-0.1.0/PKG-INFO +290 -0
- modelscout_ai-0.1.0/README.md +274 -0
- modelscout_ai-0.1.0/agent/__init__.py +0 -0
- modelscout_ai-0.1.0/agent/cleaner.py +138 -0
- modelscout_ai-0.1.0/agent/config.py +100 -0
- modelscout_ai-0.1.0/agent/cv_cleaner.py +296 -0
- modelscout_ai-0.1.0/agent/cv_detector.py +319 -0
- modelscout_ai-0.1.0/agent/cv_evaluator.py +230 -0
- modelscout_ai-0.1.0/agent/cv_reporter.py +264 -0
- modelscout_ai-0.1.0/agent/cv_trainer.py +351 -0
- modelscout_ai-0.1.0/agent/data_analyzer.py +110 -0
- modelscout_ai-0.1.0/agent/evaluator.py +314 -0
- modelscout_ai-0.1.0/agent/model_selector.py +168 -0
- modelscout_ai-0.1.0/agent/nlp_cleaner.py +255 -0
- modelscout_ai-0.1.0/agent/nlp_detector.py +225 -0
- modelscout_ai-0.1.0/agent/nlp_evaluator.py +284 -0
- modelscout_ai-0.1.0/agent/nlp_reporter.py +330 -0
- modelscout_ai-0.1.0/agent/nlp_trainer.py +391 -0
- modelscout_ai-0.1.0/agent/orchestrator.py +145 -0
- modelscout_ai-0.1.0/agent/reporter.py +180 -0
- modelscout_ai-0.1.0/agent/router.py +267 -0
- modelscout_ai-0.1.0/agent/trainer.py +310 -0
- modelscout_ai-0.1.0/agent/ts_cleaner.py +306 -0
- modelscout_ai-0.1.0/agent/ts_detector.py +270 -0
- modelscout_ai-0.1.0/agent/ts_evaluator.py +330 -0
- modelscout_ai-0.1.0/agent/ts_reporter.py +283 -0
- modelscout_ai-0.1.0/agent/ts_trainer.py +552 -0
- modelscout_ai-0.1.0/modelscout_ai.egg-info/PKG-INFO +290 -0
- modelscout_ai-0.1.0/modelscout_ai.egg-info/SOURCES.txt +31 -0
- modelscout_ai-0.1.0/modelscout_ai.egg-info/dependency_links.txt +1 -0
- modelscout_ai-0.1.0/modelscout_ai.egg-info/top_level.txt +1 -0
- modelscout_ai-0.1.0/pyproject.toml +29 -0
- modelscout_ai-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modelscout-ai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Autonomous ML agent that finds the best model for any dataset automatically
|
|
5
|
+
Author-email: Iram Fatima <iramfatima749@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Iramfatima12/modelscout
|
|
8
|
+
Keywords: machine learning,automl,model selection,autonomous,deep learning
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# ModelScout 🤖
|
|
18
|
+
|
|
19
|
+
**Intelligent ML Model Recommendation System**
|
|
20
|
+
|
|
21
|
+
An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
|
|
22
|
+
|
|
23
|
+
## 🎯 Features
|
|
24
|
+
|
|
25
|
+
- **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
|
|
26
|
+
- **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
|
|
27
|
+
- **Comprehensive Analysis**: Provides detailed dataset analysis and insights
|
|
28
|
+
- **Multiple Formats**: Generates reports in text, JSON, and table formats
|
|
29
|
+
- **REST API**: Flask-based REST API for easy integration
|
|
30
|
+
- **Support for All ML Tasks**: Classification, Regression, Time-series, and more
|
|
31
|
+
|
|
32
|
+
## 📁 Project Structure
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
ModelScout/
|
|
36
|
+
├── agent/                  # Core ML engine
|
|
37
|
+
│   ├── data_analyzer.py    # Dataset analysis module
|
|
38
|
+
│   ├── model_selector.py   # Model recommendation using Auto-sklearn
|
|
39
|
+
│   ├── reporter.py         # Report generation
|
|
40
|
+
│   ├── orchestrator.py     # Main pipeline orchestrator
|
|
41
|
+
│   └── __init__.py
|
|
42
|
+
├── api/                    # REST API
|
|
43
|
+
│   ├── main.py             # Flask API endpoints
|
|
44
|
+
│   └── __init__.py
|
|
45
|
+
├── data/                   # Sample datasets
|
|
46
|
+
├── models/                 # Trained models storage
|
|
47
|
+
├── outputs/                # Generated reports
|
|
48
|
+
├── requirements.txt        # Python dependencies
|
|
49
|
+
├── demo.py                 # Demo script with examples
|
|
50
|
+
└── README.md
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## 🚀 Quick Start
|
|
54
|
+
|
|
55
|
+
### 1. Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
# Clone or navigate to the project directory
|
|
59
|
+
cd ModelScout
|
|
60
|
+
|
|
61
|
+
# Create virtual environment
|
|
62
|
+
python -m venv venv
|
|
63
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
64
|
+
|
|
65
|
+
# Install dependencies
|
|
66
|
+
pip install -r requirements.txt
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 2. Basic Usage
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from agent.orchestrator import ModelScout
|
|
73
|
+
|
|
74
|
+
# Initialize
|
|
75
|
+
scout = ModelScout(auto_train_time=300)
|
|
76
|
+
|
|
77
|
+
# Run complete pipeline
|
|
78
|
+
result = scout.run_full_pipeline(
|
|
79
|
+
data_path='your_data.csv',
|
|
80
|
+
target='target_column',
|
|
81
|
+
report_path='outputs/report.txt'
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Access results
|
|
85
|
+
print(result['recommendations']['best_model_name'])
|
|
86
|
+
print(result['recommendations']['test_score'])
|
|
87
|
+
print(result['report'])
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 3. Step-by-Step Usage
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from agent.orchestrator import ModelScout
|
|
94
|
+
import pandas as pd
|
|
95
|
+
|
|
96
|
+
scout = ModelScout()
|
|
97
|
+
|
|
98
|
+
# Load data
|
|
99
|
+
df = scout.load_data('data.csv')
|
|
100
|
+
|
|
101
|
+
# Analyze data
|
|
102
|
+
analysis = scout.analyze_data(df, target='label')
|
|
103
|
+
print(f"Problem Type: {analysis['target_analysis']['type']}")
|
|
104
|
+
|
|
105
|
+
# Get recommendations
|
|
106
|
+
recommendations = scout.recommend_models(df, 'label')
|
|
107
|
+
print(f"Best Model: {recommendations['best_model_name']}")
|
|
108
|
+
print(f"Test Score: {recommendations['test_score']}")
|
|
109
|
+
|
|
110
|
+
# Generate report
|
|
111
|
+
report = scout.generate_report(output_format='text', output_path='report.txt')
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 🔧 API Endpoints
|
|
115
|
+
|
|
116
|
+
### Health Check
|
|
117
|
+
```
|
|
118
|
+
GET /health
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Analyze Dataset
|
|
122
|
+
```
|
|
123
|
+
POST /api/analyze
|
|
124
|
+
Content-Type: application/json
|
|
125
|
+
|
|
126
|
+
{
|
|
127
|
+
"file_path": "path/to/data.csv",
|
|
128
|
+
"target": "target_column"
|
|
129
|
+
}
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Get Recommendations
|
|
133
|
+
```
|
|
134
|
+
POST /api/recommend
|
|
135
|
+
Content-Type: application/json
|
|
136
|
+
|
|
137
|
+
{
|
|
138
|
+
"file_path": "path/to/data.csv",
|
|
139
|
+
"target": "target_column",
|
|
140
|
+
"time_limit": 300
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Generate Report
|
|
145
|
+
```
|
|
146
|
+
POST /api/report
|
|
147
|
+
Content-Type: application/json
|
|
148
|
+
|
|
149
|
+
{
|
|
150
|
+
"file_path": "path/to/data.csv",
|
|
151
|
+
"target": "target_column",
|
|
152
|
+
"format": "text"
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Full Pipeline
|
|
157
|
+
```
|
|
158
|
+
POST /api/pipeline
|
|
159
|
+
Content-Type: application/json
|
|
160
|
+
|
|
161
|
+
{
|
|
162
|
+
"file_path": "path/to/data.csv",
|
|
163
|
+
"target": "target_column",
|
|
164
|
+
"time_limit": 300
|
|
165
|
+
}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## 🎮 Run Demo
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python demo.py
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
The demo script:
|
|
175
|
+
1. Creates sample datasets (Iris, Breast Cancer, Regression)
|
|
176
|
+
2. Runs ModelScout on each dataset
|
|
177
|
+
3. Generates comparison reports
|
|
178
|
+
4. Demonstrates both classification and regression
|
|
179
|
+
|
|
180
|
+
## 📊 What ModelScout Analyzes
|
|
181
|
+
|
|
182
|
+
### Data Characteristics
|
|
183
|
+
- Dataset size and shape
|
|
184
|
+
- Missing values and data quality
|
|
185
|
+
- Feature types and counts
|
|
186
|
+
- Memory usage
|
|
187
|
+
|
|
188
|
+
### Target Variable
|
|
189
|
+
- Problem type (Classification/Regression)
|
|
190
|
+
- Class distribution (for classification)
|
|
191
|
+
- Value range (for regression)
|
|
192
|
+
- Class imbalance ratio
|
|
193
|
+
|
|
194
|
+
### Feature Statistics
|
|
195
|
+
- Numeric: mean, std, min, max, missing count
|
|
196
|
+
- Categorical: unique values, missing count
|
|
197
|
+
|
|
198
|
+
## 🤔 How It Works
|
|
199
|
+
|
|
200
|
+
1. **Data Loading**: Supports CSV, Excel, JSON formats
|
|
201
|
+
2. **Analysis**: Comprehensive dataset profiling
|
|
202
|
+
3. **Problem Detection**: Auto-detects ML task type
|
|
203
|
+
4. **Model Search**: Auto-sklearn searches optimal models
|
|
204
|
+
5. **Evaluation**: Train/test split and performance metrics
|
|
205
|
+
6. **Reporting**: Generates detailed recommendations
|
|
206
|
+
|
|
207
|
+
## 📦 Dependencies
|
|
208
|
+
|
|
209
|
+
- **pandas**: Data manipulation
|
|
210
|
+
- **scikit-learn**: ML algorithms
|
|
211
|
+
- **auto-sklearn**: Automated ML model selection
|
|
212
|
+
- **numpy**: Numerical computing
|
|
213
|
+
- **matplotlib/seaborn**: Visualization
|
|
214
|
+
- **flask**: REST API
|
|
215
|
+
- **xgboost, lightgbm, catboost**: Advanced models
|
|
216
|
+
- **imbalanced-learn**: Class imbalance handling
|
|
217
|
+
|
|
218
|
+
## 📈 Example Output
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
======================================================================
|
|
222
|
+
___ ___ _ _ ____ ___ _ _ ___
|
|
223
|
+
| \/ | | | | | / ___ \/ _ \| | | |_ |
|
|
224
|
+
| . . | ___ __| | ___| | / / \/ /_\ \ | | | / /
|
|
225
|
+
| |\/| |/ _ \ / _` |/ _ \ | \ \ | _ | | | |/ /
|
|
226
|
+
| | | | (_) || (_| | __/ | \ \__| | | | |_| / /
|
|
227
|
+
|_| |_|\___/ \__,_|\___|_| \___/_| |_|\___/___/
|
|
228
|
+
|
|
229
|
+
======================================================================
|
|
230
|
+
|
|
231
|
+
DATA OVERVIEW
|
|
232
|
+
======================================================================
|
|
233
|
+
Dataset Shape: (150, 5) (rows, columns)
|
|
234
|
+
Memory Usage: 0.00 MB
|
|
235
|
+
Missing Values: 0 (0.00%)
|
|
236
|
+
Numeric Features: 4
|
|
237
|
+
Categorical Features: 0
|
|
238
|
+
|
|
239
|
+
TARGET VARIABLE ANALYSIS
|
|
240
|
+
----------------------------------------------------------------------
|
|
241
|
+
Problem Type: CLASSIFICATION
|
|
242
|
+
Unique Values: 3
|
|
243
|
+
Missing Values: 0
|
|
244
|
+
Class Imbalance Ratio: 1.00:1
|
|
245
|
+
Class Distribution:
|
|
246
|
+
0: 50 (33.3%)
|
|
247
|
+
1: 50 (33.3%)
|
|
248
|
+
2: 50 (33.3%)
|
|
249
|
+
|
|
250
|
+
MODEL RECOMMENDATIONS
|
|
251
|
+
======================================================================
|
|
252
|
+
Best Model: RandomForestClassifier
|
|
253
|
+
Problem Type: CLASSIFICATION
|
|
254
|
+
Train Score: 1.0000
|
|
255
|
+
Test Score: 0.9333
|
|
256
|
+
Data Shape Used: (150, 4)
|
|
257
|
+
Number of Classes: 3
|
|
258
|
+
|
|
259
|
+
======================================================================
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## 🛠️ Configuration
|
|
263
|
+
|
|
264
|
+
You can customize behavior by modifying parameters:
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
scout = ModelScout(
|
|
268
|
+
auto_train_time=600 # Increase for more thorough search (seconds)
|
|
269
|
+
)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## 📝 License
|
|
273
|
+
|
|
274
|
+
This project is released under the MIT License, as declared in the package metadata. It was created for educational and portfolio purposes.
|
|
275
|
+
|
|
276
|
+
## 🤝 Contributing
|
|
277
|
+
|
|
278
|
+
Feel free to extend ModelScout with:
|
|
279
|
+
- Additional models
|
|
280
|
+
- More data preprocessing options
|
|
281
|
+
- Visualization enhancements
|
|
282
|
+
- Performance optimizations
|
|
283
|
+
|
|
284
|
+
## 📞 Support
|
|
285
|
+
|
|
286
|
+
For issues or questions, refer to the demo.py script for usage examples.
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
**Happy Model Scouting! 🎯**
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
# ModelScout 🤖
|
|
2
|
+
|
|
3
|
+
**Intelligent ML Model Recommendation System**
|
|
4
|
+
|
|
5
|
+
An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
|
|
6
|
+
|
|
7
|
+
## 🎯 Features
|
|
8
|
+
|
|
9
|
+
- **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
|
|
10
|
+
- **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
|
|
11
|
+
- **Comprehensive Analysis**: Provides detailed dataset analysis and insights
|
|
12
|
+
- **Multiple Formats**: Generates reports in text, JSON, and table formats
|
|
13
|
+
- **REST API**: Flask-based REST API for easy integration
|
|
14
|
+
- **Support for All ML Tasks**: Classification, Regression, Time-series, and more
|
|
15
|
+
|
|
16
|
+
## 📁 Project Structure
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
ModelScout/
|
|
20
|
+
├── agent/                  # Core ML engine
|
|
21
|
+
│   ├── data_analyzer.py    # Dataset analysis module
|
|
22
|
+
│   ├── model_selector.py   # Model recommendation using Auto-sklearn
|
|
23
|
+
│   ├── reporter.py         # Report generation
|
|
24
|
+
│   ├── orchestrator.py     # Main pipeline orchestrator
|
|
25
|
+
│   └── __init__.py
|
|
26
|
+
├── api/                    # REST API
|
|
27
|
+
│   ├── main.py             # Flask API endpoints
|
|
28
|
+
│   └── __init__.py
|
|
29
|
+
├── data/                   # Sample datasets
|
|
30
|
+
├── models/                 # Trained models storage
|
|
31
|
+
├── outputs/                # Generated reports
|
|
32
|
+
├── requirements.txt        # Python dependencies
|
|
33
|
+
├── demo.py                 # Demo script with examples
|
|
34
|
+
└── README.md
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## 🚀 Quick Start
|
|
38
|
+
|
|
39
|
+
### 1. Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Clone or navigate to the project directory
|
|
43
|
+
cd ModelScout
|
|
44
|
+
|
|
45
|
+
# Create virtual environment
|
|
46
|
+
python -m venv venv
|
|
47
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
48
|
+
|
|
49
|
+
# Install dependencies
|
|
50
|
+
pip install -r requirements.txt
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### 2. Basic Usage
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from agent.orchestrator import ModelScout
|
|
57
|
+
|
|
58
|
+
# Initialize
|
|
59
|
+
scout = ModelScout(auto_train_time=300)
|
|
60
|
+
|
|
61
|
+
# Run complete pipeline
|
|
62
|
+
result = scout.run_full_pipeline(
|
|
63
|
+
data_path='your_data.csv',
|
|
64
|
+
target='target_column',
|
|
65
|
+
report_path='outputs/report.txt'
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Access results
|
|
69
|
+
print(result['recommendations']['best_model_name'])
|
|
70
|
+
print(result['recommendations']['test_score'])
|
|
71
|
+
print(result['report'])
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 3. Step-by-Step Usage
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from agent.orchestrator import ModelScout
|
|
78
|
+
import pandas as pd
|
|
79
|
+
|
|
80
|
+
scout = ModelScout()
|
|
81
|
+
|
|
82
|
+
# Load data
|
|
83
|
+
df = scout.load_data('data.csv')
|
|
84
|
+
|
|
85
|
+
# Analyze data
|
|
86
|
+
analysis = scout.analyze_data(df, target='label')
|
|
87
|
+
print(f"Problem Type: {analysis['target_analysis']['type']}")
|
|
88
|
+
|
|
89
|
+
# Get recommendations
|
|
90
|
+
recommendations = scout.recommend_models(df, 'label')
|
|
91
|
+
print(f"Best Model: {recommendations['best_model_name']}")
|
|
92
|
+
print(f"Test Score: {recommendations['test_score']}")
|
|
93
|
+
|
|
94
|
+
# Generate report
|
|
95
|
+
report = scout.generate_report(output_format='text', output_path='report.txt')
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## 🔧 API Endpoints
|
|
99
|
+
|
|
100
|
+
### Health Check
|
|
101
|
+
```
|
|
102
|
+
GET /health
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Analyze Dataset
|
|
106
|
+
```
|
|
107
|
+
POST /api/analyze
|
|
108
|
+
Content-Type: application/json
|
|
109
|
+
|
|
110
|
+
{
|
|
111
|
+
"file_path": "path/to/data.csv",
|
|
112
|
+
"target": "target_column"
|
|
113
|
+
}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Get Recommendations
|
|
117
|
+
```
|
|
118
|
+
POST /api/recommend
|
|
119
|
+
Content-Type: application/json
|
|
120
|
+
|
|
121
|
+
{
|
|
122
|
+
"file_path": "path/to/data.csv",
|
|
123
|
+
"target": "target_column",
|
|
124
|
+
"time_limit": 300
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Generate Report
|
|
129
|
+
```
|
|
130
|
+
POST /api/report
|
|
131
|
+
Content-Type: application/json
|
|
132
|
+
|
|
133
|
+
{
|
|
134
|
+
"file_path": "path/to/data.csv",
|
|
135
|
+
"target": "target_column",
|
|
136
|
+
"format": "text"
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Full Pipeline
|
|
141
|
+
```
|
|
142
|
+
POST /api/pipeline
|
|
143
|
+
Content-Type: application/json
|
|
144
|
+
|
|
145
|
+
{
|
|
146
|
+
"file_path": "path/to/data.csv",
|
|
147
|
+
"target": "target_column",
|
|
148
|
+
"time_limit": 300
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## 🎮 Run Demo
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
python demo.py
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
The demo script:
|
|
159
|
+
1. Creates sample datasets (Iris, Breast Cancer, Regression)
|
|
160
|
+
2. Runs ModelScout on each dataset
|
|
161
|
+
3. Generates comparison reports
|
|
162
|
+
4. Demonstrates both classification and regression
|
|
163
|
+
|
|
164
|
+
## 📊 What ModelScout Analyzes
|
|
165
|
+
|
|
166
|
+
### Data Characteristics
|
|
167
|
+
- Dataset size and shape
|
|
168
|
+
- Missing values and data quality
|
|
169
|
+
- Feature types and counts
|
|
170
|
+
- Memory usage
|
|
171
|
+
|
|
172
|
+
### Target Variable
|
|
173
|
+
- Problem type (Classification/Regression)
|
|
174
|
+
- Class distribution (for classification)
|
|
175
|
+
- Value range (for regression)
|
|
176
|
+
- Class imbalance ratio
|
|
177
|
+
|
|
178
|
+
### Feature Statistics
|
|
179
|
+
- Numeric: mean, std, min, max, missing count
|
|
180
|
+
- Categorical: unique values, missing count
|
|
181
|
+
|
|
182
|
+
## 🤔 How It Works
|
|
183
|
+
|
|
184
|
+
1. **Data Loading**: Supports CSV, Excel, JSON formats
|
|
185
|
+
2. **Analysis**: Comprehensive dataset profiling
|
|
186
|
+
3. **Problem Detection**: Auto-detects ML task type
|
|
187
|
+
4. **Model Search**: Auto-sklearn searches optimal models
|
|
188
|
+
5. **Evaluation**: Train/test split and performance metrics
|
|
189
|
+
6. **Reporting**: Generates detailed recommendations
|
|
190
|
+
|
|
191
|
+
## 📦 Dependencies
|
|
192
|
+
|
|
193
|
+
- **pandas**: Data manipulation
|
|
194
|
+
- **scikit-learn**: ML algorithms
|
|
195
|
+
- **auto-sklearn**: Automated ML model selection
|
|
196
|
+
- **numpy**: Numerical computing
|
|
197
|
+
- **matplotlib/seaborn**: Visualization
|
|
198
|
+
- **flask**: REST API
|
|
199
|
+
- **xgboost, lightgbm, catboost**: Advanced models
|
|
200
|
+
- **imbalanced-learn**: Class imbalance handling
|
|
201
|
+
|
|
202
|
+
## 📈 Example Output
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
======================================================================
|
|
206
|
+
___ ___ _ _ ____ ___ _ _ ___
|
|
207
|
+
| \/ | | | | | / ___ \/ _ \| | | |_ |
|
|
208
|
+
| . . | ___ __| | ___| | / / \/ /_\ \ | | | / /
|
|
209
|
+
| |\/| |/ _ \ / _` |/ _ \ | \ \ | _ | | | |/ /
|
|
210
|
+
| | | | (_) || (_| | __/ | \ \__| | | | |_| / /
|
|
211
|
+
|_| |_|\___/ \__,_|\___|_| \___/_| |_|\___/___/
|
|
212
|
+
|
|
213
|
+
======================================================================
|
|
214
|
+
|
|
215
|
+
DATA OVERVIEW
|
|
216
|
+
======================================================================
|
|
217
|
+
Dataset Shape: (150, 5) (rows, columns)
|
|
218
|
+
Memory Usage: 0.00 MB
|
|
219
|
+
Missing Values: 0 (0.00%)
|
|
220
|
+
Numeric Features: 4
|
|
221
|
+
Categorical Features: 0
|
|
222
|
+
|
|
223
|
+
TARGET VARIABLE ANALYSIS
|
|
224
|
+
----------------------------------------------------------------------
|
|
225
|
+
Problem Type: CLASSIFICATION
|
|
226
|
+
Unique Values: 3
|
|
227
|
+
Missing Values: 0
|
|
228
|
+
Class Imbalance Ratio: 1.00:1
|
|
229
|
+
Class Distribution:
|
|
230
|
+
0: 50 (33.3%)
|
|
231
|
+
1: 50 (33.3%)
|
|
232
|
+
2: 50 (33.3%)
|
|
233
|
+
|
|
234
|
+
MODEL RECOMMENDATIONS
|
|
235
|
+
======================================================================
|
|
236
|
+
Best Model: RandomForestClassifier
|
|
237
|
+
Problem Type: CLASSIFICATION
|
|
238
|
+
Train Score: 1.0000
|
|
239
|
+
Test Score: 0.9333
|
|
240
|
+
Data Shape Used: (150, 4)
|
|
241
|
+
Number of Classes: 3
|
|
242
|
+
|
|
243
|
+
======================================================================
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## 🛠️ Configuration
|
|
247
|
+
|
|
248
|
+
You can customize behavior by modifying parameters:
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
scout = ModelScout(
|
|
252
|
+
auto_train_time=600 # Increase for more thorough search (seconds)
|
|
253
|
+
)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## 📝 License
|
|
257
|
+
|
|
258
|
+
This project is released under the MIT License, as declared in the package metadata. It was created for educational and portfolio purposes.
|
|
259
|
+
|
|
260
|
+
## 🤝 Contributing
|
|
261
|
+
|
|
262
|
+
Feel free to extend ModelScout with:
|
|
263
|
+
- Additional models
|
|
264
|
+
- More data preprocessing options
|
|
265
|
+
- Visualization enhancements
|
|
266
|
+
- Performance optimizations
|
|
267
|
+
|
|
268
|
+
## 📞 Support
|
|
269
|
+
|
|
270
|
+
For issues or questions, refer to the demo.py script for usage examples.
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
**Happy Model Scouting! 🎯**
|
|
File without changes
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
|
4
|
+
from sklearn.impute import SimpleImputer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without exact duplicate rows and report how many were dropped.

    Relies on ``DataFrame.drop_duplicates()`` with its default arguments, so
    the result is a new frame and the input is left untouched. The index is
    not reset here.

    Args:
        df: Frame to de-duplicate.

    Returns:
        A new DataFrame containing only the non-duplicated rows.
    """
    deduped = df.drop_duplicates()
    removed = len(df) - len(deduped)

    if removed > 0:
        print(f" ✅ Removed {removed} duplicate rows")
    else:
        print(" ✅ No duplicates found")

    return deduped
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing cells in every column of ``df`` and return the frame.

    Floating-point columns (any width, e.g. float32 as well as float64) are
    filled with the column mean; every other column is filled with its most
    frequent value. A column that has no observed values at all cannot be
    imputed, so it is left unchanged and a warning line is printed instead of
    raising.

    The columns of ``df`` are reassigned in place, so the caller's frame is
    modified; the same object is returned for convenience.

    Args:
        df: Frame whose missing values should be imputed.

    Returns:
        The same DataFrame object with missing values filled where possible.
    """
    print("\n Handling missing values...")

    for col in df.columns:
        missing_count = df[col].isnull().sum()
        if missing_count == 0:
            continue

        if pd.api.types.is_float_dtype(df[col]):
            # Integer NumPy columns cannot hold NaN, so a float check covers
            # every numeric column that can actually reach this branch.
            fill_value = df[col].mean()
            if pd.isna(fill_value):
                # Entirely empty column: the mean is NaN, nothing to fill with.
                print(f" ⚠️ {col} → no observed values, left {missing_count} empty cells unfilled")
                continue
            df[col] = df[col].fillna(fill_value)
            print(f"{col} → filled {missing_count} empty cells with mean ({fill_value:.2f})")
        else:
            modes = df[col].mode()
            if modes.empty:
                # mode() drops missing values, so an all-missing column yields
                # an empty result and indexing [0] would raise.
                print(f" ⚠️ {col} → no observed values, left {missing_count} empty cells unfilled")
                continue
            fill_value = modes[0]
            df[col] = df[col].fillna(fill_value)
            print(f" ✅ {col} → filled {missing_count} empty cells with '{fill_value}'")

    return df
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _needs_label_encoding(series: pd.Series) -> bool:
    """Return True when ``series`` holds text-like or categorical values."""
    dtype = series.dtype
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def encode_categorical_columns(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> tuple:
    """Label-encode text-like feature columns and, if needed, the target.

    A column is encoded when its dtype is ``object``, a pandas string dtype,
    or ``category``. Checking only for ``object`` would skip string columns
    on pandas versions that infer a dedicated string dtype. Values are cast
    to ``str`` before fitting, and one ``LabelEncoder`` is fitted per encoded
    column.

    The columns of ``df`` are reassigned in place, so the caller's frame is
    modified.

    Args:
        df: Frame containing the feature and target columns.
        target_column: Name of the target column.
        feature_columns: Names of the feature columns to inspect.

    Returns:
        A ``(df, encoders)`` tuple where ``encoders`` maps each encoded
        column name to its fitted ``LabelEncoder``.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    print("\n Encoding categorical columns...")

    encoders = {}

    # Encode feature columns
    for col in feature_columns:
        if _needs_label_encoding(df[col]):
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le
            print(f" ✅ {col} → converted to numbers")

    # Encode the target column if it is text-like
    if _needs_label_encoding(df[target_column]):
        le = LabelEncoder()
        df[target_column] = le.fit_transform(df[target_column].astype(str))
        encoders[target_column] = le
        print(f" ✅ Target '{target_column}' → converted to numbers")

    return df, encoders
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def scale_numeric_columns(
    df: pd.DataFrame,
    feature_columns: list
) -> tuple:
    """Standardize the numeric feature columns of ``df``.

    Every feature column with a numeric dtype (any integer or float width,
    not just int64/float64) is passed through a single ``StandardScaler``.
    Boolean columns are left untouched. The scaled values are written back
    into ``df``, so the caller's frame is modified.

    Args:
        df: Frame containing the feature columns.
        feature_columns: Names of the feature columns to consider.

    Returns:
        A ``(df, scaler)`` tuple. ``scaler`` is the fitted ``StandardScaler``,
        or ``None`` when no feature column is numeric.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    print("\n Scaling numeric columns...")

    # Only scale numeric feature columns; bool counts as numeric in pandas
    # but standardizing flags is not meaningful, so it is excluded.
    numeric_cols = [
        col for col in feature_columns
        if pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    ]

    if not numeric_cols:
        print(" ✅ No numeric columns to scale")
        return df, None

    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    print(f" ✅ Scaled {len(numeric_cols)} numeric columns")

    return df, scaler
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def run_cleaner(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> dict:
    """Run the full cleaning pipeline and split the result into X and y.

    Steps, in order: drop duplicate rows, fill missing values, label-encode
    text columns, then standardize numeric feature columns. Progress is
    printed for each step.

    NOTE(review): the missing-value step iterates over every column of the
    frame, so missing target values are imputed as well — confirm this is
    intended.

    Args:
        df: Raw input frame.
        target_column: Name of the column to predict.
        feature_columns: Names of the columns to use as features.

    Returns:
        A dict with keys ``"X"`` (feature frame), ``"y"`` (target series),
        ``"encoders"`` (fitted label encoders by column name), ``"scaler"``
        (fitted scaler or ``None``) and ``"cleaned_df"`` (the full cleaned
        frame).

    Raises:
        KeyError: If ``target_column`` or any feature column is not in ``df``.
    """
    # Fail fast with a readable message instead of a bare KeyError raised
    # half-way through the pipeline.
    missing = [
        col for col in [target_column, *feature_columns]
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    print("\n🧹 ModelScout: Starting data cleaning...\n")

    # Step 1
    print("Step 1: Removing duplicates...")
    df = remove_duplicates(df)

    # Step 2
    print("\nStep 2: Handling missing values...")
    df = handle_missing_values(df)

    # Step 3
    print("\nStep 3: Encoding categorical columns...")
    df, encoders = encode_categorical_columns(
        df, target_column, feature_columns
    )

    # Step 4
    print("\nStep 4: Scaling numeric columns...")
    df, scaler = scale_numeric_columns(df, feature_columns)

    # Separate features and target
    X = df[feature_columns]
    y = df[target_column]

    print("\n✅ Data cleaning complete!")
    print(f" Final shape: {X.shape[0]} rows, {X.shape[1]} features\n")

    return {
        "X": X,
        "y": y,
        "encoders": encoders,
        "scaler": scaler,
        "cleaned_df": df,
    }
|