PyPI - modelscout-ai - Versions diffs - 0.1.0__tar.gz - Mend

modelscout-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

modelscout_ai-0.1.0/PKG-INFO +290 -0
modelscout_ai-0.1.0/README.md +274 -0
modelscout_ai-0.1.0/agent/__init__.py +0 -0
modelscout_ai-0.1.0/agent/cleaner.py +138 -0
modelscout_ai-0.1.0/agent/config.py +100 -0
modelscout_ai-0.1.0/agent/cv_cleaner.py +296 -0
modelscout_ai-0.1.0/agent/cv_detector.py +319 -0
modelscout_ai-0.1.0/agent/cv_evaluator.py +230 -0
modelscout_ai-0.1.0/agent/cv_reporter.py +264 -0
modelscout_ai-0.1.0/agent/cv_trainer.py +351 -0
modelscout_ai-0.1.0/agent/data_analyzer.py +110 -0
modelscout_ai-0.1.0/agent/evaluator.py +314 -0
modelscout_ai-0.1.0/agent/model_selector.py +168 -0
modelscout_ai-0.1.0/agent/nlp_cleaner.py +255 -0
modelscout_ai-0.1.0/agent/nlp_detector.py +225 -0
modelscout_ai-0.1.0/agent/nlp_evaluator.py +284 -0
modelscout_ai-0.1.0/agent/nlp_reporter.py +330 -0
modelscout_ai-0.1.0/agent/nlp_trainer.py +391 -0
modelscout_ai-0.1.0/agent/orchestrator.py +145 -0
modelscout_ai-0.1.0/agent/reporter.py +180 -0
modelscout_ai-0.1.0/agent/router.py +267 -0
modelscout_ai-0.1.0/agent/trainer.py +310 -0
modelscout_ai-0.1.0/agent/ts_cleaner.py +306 -0
modelscout_ai-0.1.0/agent/ts_detector.py +270 -0
modelscout_ai-0.1.0/agent/ts_evaluator.py +330 -0
modelscout_ai-0.1.0/agent/ts_reporter.py +283 -0
modelscout_ai-0.1.0/agent/ts_trainer.py +552 -0
modelscout_ai-0.1.0/modelscout_ai.egg-info/PKG-INFO +290 -0
modelscout_ai-0.1.0/modelscout_ai.egg-info/SOURCES.txt +31 -0
modelscout_ai-0.1.0/modelscout_ai.egg-info/dependency_links.txt +1 -0
modelscout_ai-0.1.0/modelscout_ai.egg-info/top_level.txt +1 -0
modelscout_ai-0.1.0/pyproject.toml +29 -0
modelscout_ai-0.1.0/setup.cfg +4 -0

modelscout_ai-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,290 @@
+Metadata-Version: 2.4
+Name: modelscout-ai
+Version: 0.1.0
+Summary: Autonomous ML agent that finds the best model for any dataset automatically
+Author-email: Iram Fatima <iramfatima749@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/Iramfatima12/modelscout
+Keywords: machine learning,automl,model selection,autonomous,deep learning
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+# ModelScout 🤖
+**Intelligent ML Model Recommendation System**
+An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
+## 🎯 Features
+- **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
+- **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
+- **Comprehensive Analysis**: Provides detailed dataset analysis and insights
+- **Multiple Formats**: Generates reports in text, JSON, and table formats
+- **REST API**: Flask-based REST API for easy integration
+- **Support for All ML Tasks**: Classification, Regression, Time-series, and more
+## 📋 Project Structure
+```
+ModelScout/
+├── agent/                    # Core ML engine
+│   ├── data_analyzer.py     # Dataset analysis module
+│   ├── model_selector.py    # Model recommendation using Auto-sklearn
+│   ├── reporter.py          # Report generation
+│   ├── orchestrator.py      # Main pipeline orchestrator
+│   └── __init__.py
+├── api/                      # REST API
+│   ├── main.py              # Flask API endpoints
+│   └── __init__.py
+├── data/                     # Sample datasets
+├── models/                   # Trained models storage
+├── outputs/                  # Generated reports
+├── requirements.txt         # Python dependencies
+├── demo.py                  # Demo script with examples
+└── README.md
+```
+## 🚀 Quick Start
+### 1. Installation
+```bash
+# Clone or navigate to the project directory
+cd ModelScout
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+# Install dependencies
+pip install -r requirements.txt
+```
+### 2. Basic Usage
+```python
+from agent.orchestrator import ModelScout
+# Initialize
+scout = ModelScout(auto_train_time=300)
+# Run complete pipeline
+result = scout.run_full_pipeline(
+    data_path='your_data.csv',
+    target='target_column',
+    report_path='outputs/report.txt'
+)
+# Access results
+print(result['recommendations']['best_model_name'])
+print(result['recommendations']['test_score'])
+print(result['report'])
+```
+### 3. Step-by-Step Usage
+```python
+from agent.orchestrator import ModelScout
+import pandas as pd
+scout = ModelScout()
+# Load data
+df = scout.load_data('data.csv')
+# Analyze data
+analysis = scout.analyze_data(df, target='label')
+print(f"Problem Type: {analysis['target_analysis']['type']}")
+# Get recommendations
+recommendations = scout.recommend_models(df, 'label')
+print(f"Best Model: {recommendations['best_model_name']}")
+print(f"Test Score: {recommendations['test_score']}")
+# Generate report
+report = scout.generate_report(output_format='text', output_path='report.txt')
+```
+## 🔧 API Endpoints
+### Health Check
+```
+GET /health
+```
+### Analyze Dataset
+```
+POST /api/analyze
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column"
+}
+```
+### Get Recommendations
+```
+POST /api/recommend
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "time_limit": 300
+}
+```
+### Generate Report
+```
+POST /api/report
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "format": "text"
+}
+```
+### Full Pipeline
+```
+POST /api/pipeline
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "time_limit": 300
+}
+```
+## 🎮 Run Demo
+```bash
+python demo.py
+```
+The demo script:
+1. Creates sample datasets (Iris, Breast Cancer, Regression)
+2. Runs ModelScout on each dataset
+3. Generates comparison reports
+4. Demonstrates both classification and regression
+## 📊 What ModelScout Analyzes
+### Data Characteristics
+- Dataset size and shape
+- Missing values and data quality
+- Feature types and counts
+- Memory usage
+### Target Variable
+- Problem type (Classification/Regression)
+- Class distribution (for classification)
+- Value range (for regression)
+- Class imbalance ratio
+### Feature Statistics
+- Numeric: mean, std, min, max, missing count
+- Categorical: unique values, missing count
+## 🤖 How It Works
+1. **Data Loading**: Supports CSV, Excel, JSON formats
+2. **Analysis**: Comprehensive dataset profiling
+3. **Problem Detection**: Auto-detects ML task type
+4. **Model Search**: Auto-sklearn searches optimal models
+5. **Evaluation**: Train/test split and performance metrics
+6. **Reporting**: Generates detailed recommendations
+## 📦 Dependencies
+- **pandas**: Data manipulation
+- **scikit-learn**: ML algorithms
+- **auto-sklearn**: Automated ML model selection
+- **numpy**: Numerical computing
+- **matplotlib/seaborn**: Visualization
+- **flask**: REST API
+- **xgboost, lightgbm, catboost**: Advanced models
+- **imbalanced-learn**: Class imbalance handling
+## 🔍 Example Output
+```
+======================================================================
+  ___  ___           _      _    ____  ___  _   _ ___
+ |  \/  |          | |    | |  / ___ \/ _ \| | | |_  |
+ | .  . | ___    __| | ___| | / /   \/ /_\ \ | | | / /
+ | |\/| |/ _ \  / _` |/ _ \ | \ \   |  _  | | | |/ /
+ | |  | | (_) || (_| |  __/ |  \ \__| | | | |_| / /
+ |_|  |_|\___/  \__,_|\___|_|   \___/_| |_|\___/___/
+======================================================================
+DATA OVERVIEW
+======================================================================
+Dataset Shape: (150, 5) (rows, columns)
+Memory Usage: 0.00 MB
+Missing Values: 0 (0.00%)
+Numeric Features: 4
+Categorical Features: 0
+TARGET VARIABLE ANALYSIS
+----------------------------------------------------------------------
+Problem Type: CLASSIFICATION
+Unique Values: 3
+Missing Values: 0
+Class Imbalance Ratio: 1.00:1
+Class Distribution:
+  0: 50 (33.3%)
+  1: 50 (33.3%)
+  2: 50 (33.3%)
+MODEL RECOMMENDATIONS
+======================================================================
+Best Model: RandomForestClassifier
+Problem Type: CLASSIFICATION
+Train Score: 1.0000
+Test Score: 0.9333
+Data Shape Used: (150, 4)
+Number of Classes: 3
+======================================================================
+```
+## 🛠️ Configuration
+You can customize behavior by modifying parameters:
+```python
+scout = ModelScout(
+    auto_train_time=600  # Increase for more thorough search (seconds)
+)
+```
+## 📝 License
+This project is released under the MIT License and is intended for educational and portfolio purposes.
+## 🤝 Contributing
+Feel free to extend ModelScout with:
+- Additional models
+- More data preprocessing options
+- Visualization enhancements
+- Performance optimizations
+## 📞 Support
+For issues or questions, refer to the demo.py script for usage examples.
+---
+**Happy Model Scouting! 🎯**

modelscout_ai-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,274 @@
+# ModelScout 🤖
+**Intelligent ML Model Recommendation System**
+An automated machine learning tool that analyzes your dataset and recommends the best-fitting ML models. ModelScout uses Auto-sklearn to intelligently search through a vast hyperparameter space and identifies optimal models for your specific data.
+## 🎯 Features
+- **Automated Problem Detection**: Automatically detects classification, regression, or clustering tasks
+- **Smart Model Selection**: Uses Auto-sklearn to find the best models for your data
+- **Comprehensive Analysis**: Provides detailed dataset analysis and insights
+- **Multiple Formats**: Generates reports in text, JSON, and table formats
+- **REST API**: Flask-based REST API for easy integration
+- **Support for All ML Tasks**: Classification, Regression, Time-series, and more
+## 📋 Project Structure
+```
+ModelScout/
+├── agent/                    # Core ML engine
+│   ├── data_analyzer.py     # Dataset analysis module
+│   ├── model_selector.py    # Model recommendation using Auto-sklearn
+│   ├── reporter.py          # Report generation
+│   ├── orchestrator.py      # Main pipeline orchestrator
+│   └── __init__.py
+├── api/                      # REST API
+│   ├── main.py              # Flask API endpoints
+│   └── __init__.py
+├── data/                     # Sample datasets
+├── models/                   # Trained models storage
+├── outputs/                  # Generated reports
+├── requirements.txt         # Python dependencies
+├── demo.py                  # Demo script with examples
+└── README.md
+```
+## 🚀 Quick Start
+### 1. Installation
+```bash
+# Clone or navigate to the project directory
+cd ModelScout
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+# Install dependencies
+pip install -r requirements.txt
+```
+### 2. Basic Usage
+```python
+from agent.orchestrator import ModelScout
+# Initialize
+scout = ModelScout(auto_train_time=300)
+# Run complete pipeline
+result = scout.run_full_pipeline(
+    data_path='your_data.csv',
+    target='target_column',
+    report_path='outputs/report.txt'
+)
+# Access results
+print(result['recommendations']['best_model_name'])
+print(result['recommendations']['test_score'])
+print(result['report'])
+```
+### 3. Step-by-Step Usage
+```python
+from agent.orchestrator import ModelScout
+import pandas as pd
+scout = ModelScout()
+# Load data
+df = scout.load_data('data.csv')
+# Analyze data
+analysis = scout.analyze_data(df, target='label')
+print(f"Problem Type: {analysis['target_analysis']['type']}")
+# Get recommendations
+recommendations = scout.recommend_models(df, 'label')
+print(f"Best Model: {recommendations['best_model_name']}")
+print(f"Test Score: {recommendations['test_score']}")
+# Generate report
+report = scout.generate_report(output_format='text', output_path='report.txt')
+```
+## 🔧 API Endpoints
+### Health Check
+```
+GET /health
+```
+### Analyze Dataset
+```
+POST /api/analyze
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column"
+}
+```
+### Get Recommendations
+```
+POST /api/recommend
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "time_limit": 300
+}
+```
+### Generate Report
+```
+POST /api/report
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "format": "text"
+}
+```
+### Full Pipeline
+```
+POST /api/pipeline
+Content-Type: application/json
+{
+    "file_path": "path/to/data.csv",
+    "target": "target_column",
+    "time_limit": 300
+}
+```
+## 🎮 Run Demo
+```bash
+python demo.py
+```
+The demo script:
+1. Creates sample datasets (Iris, Breast Cancer, Regression)
+2. Runs ModelScout on each dataset
+3. Generates comparison reports
+4. Demonstrates both classification and regression
+## 📊 What ModelScout Analyzes
+### Data Characteristics
+- Dataset size and shape
+- Missing values and data quality
+- Feature types and counts
+- Memory usage
+### Target Variable
+- Problem type (Classification/Regression)
+- Class distribution (for classification)
+- Value range (for regression)
+- Class imbalance ratio
+### Feature Statistics
+- Numeric: mean, std, min, max, missing count
+- Categorical: unique values, missing count
+## 🤖 How It Works
+1. **Data Loading**: Supports CSV, Excel, JSON formats
+2. **Analysis**: Comprehensive dataset profiling
+3. **Problem Detection**: Auto-detects ML task type
+4. **Model Search**: Auto-sklearn searches optimal models
+5. **Evaluation**: Train/test split and performance metrics
+6. **Reporting**: Generates detailed recommendations
+## 📦 Dependencies
+- **pandas**: Data manipulation
+- **scikit-learn**: ML algorithms
+- **auto-sklearn**: Automated ML model selection
+- **numpy**: Numerical computing
+- **matplotlib/seaborn**: Visualization
+- **flask**: REST API
+- **xgboost, lightgbm, catboost**: Advanced models
+- **imbalanced-learn**: Class imbalance handling
+## 🔍 Example Output
+```
+======================================================================
+  ___  ___           _      _    ____  ___  _   _ ___
+ |  \/  |          | |    | |  / ___ \/ _ \| | | |_  |
+ | .  . | ___    __| | ___| | / /   \/ /_\ \ | | | / /
+ | |\/| |/ _ \  / _` |/ _ \ | \ \   |  _  | | | |/ /
+ | |  | | (_) || (_| |  __/ |  \ \__| | | | |_| / /
+ |_|  |_|\___/  \__,_|\___|_|   \___/_| |_|\___/___/
+======================================================================
+DATA OVERVIEW
+======================================================================
+Dataset Shape: (150, 5) (rows, columns)
+Memory Usage: 0.00 MB
+Missing Values: 0 (0.00%)
+Numeric Features: 4
+Categorical Features: 0
+TARGET VARIABLE ANALYSIS
+----------------------------------------------------------------------
+Problem Type: CLASSIFICATION
+Unique Values: 3
+Missing Values: 0
+Class Imbalance Ratio: 1.00:1
+Class Distribution:
+  0: 50 (33.3%)
+  1: 50 (33.3%)
+  2: 50 (33.3%)
+MODEL RECOMMENDATIONS
+======================================================================
+Best Model: RandomForestClassifier
+Problem Type: CLASSIFICATION
+Train Score: 1.0000
+Test Score: 0.9333
+Data Shape Used: (150, 4)
+Number of Classes: 3
+======================================================================
+```
+## 🛠️ Configuration
+You can customize behavior by modifying parameters:
+```python
+scout = ModelScout(
+    auto_train_time=600  # Increase for more thorough search (seconds)
+)
+```
+## 📝 License
+This project is released under the MIT License and is intended for educational and portfolio purposes.
+## 🤝 Contributing
+Feel free to extend ModelScout with:
+- Additional models
+- More data preprocessing options
+- Visualization enhancements
+- Performance optimizations
+## 📞 Support
+For issues or questions, refer to the demo.py script for usage examples.
+---
+**Happy Model Scouting! 🎯**

modelscout_ai-0.1.0/agent/__init__.py ADDED Viewed

File without changes

modelscout_ai-0.1.0/agent/cleaner.py ADDED Viewed

@@ -0,0 +1,138 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.impute import SimpleImputer
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without fully duplicated rows and report how many were dropped.

    Args:
        df: Frame to de-duplicate. It is not modified; ``drop_duplicates``
            hands back a new frame.

    Returns:
        The de-duplicated frame. Surviving rows keep their original index
        labels (the index is not reset).
    """
    deduplicated = df.drop_duplicates()
    removed = len(df) - len(deduplicated)
    if removed:
        print(f"   ✅ Removed {removed} duplicate rows")
    else:
        # Plain literal: there is nothing to interpolate in this message.
        print("   ✅ No duplicates found")
    return deduplicated
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing cells column by column and return the filled frame.

    Floating-point columns (any width, not only ``float64``) are filled with
    the column mean; every other column is filled with its most frequent
    value. Columns that contain no usable value at all are left unchanged,
    because neither a mean nor a mode exists for them.

    Args:
        df: Frame to impute. The caller's frame is not modified; the work is
            done on a copy.

    Returns:
        A new frame with the same columns and index as ``df``.
    """
    print("\n   Handling missing values...")
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    for col in df.columns:
        column = df[col]
        missing_count = int(column.isnull().sum())
        if missing_count == 0:
            continue
        if missing_count == len(column):
            # mean() would be NaN and mode() would be empty: nothing to fill with.
            print(f"   ⚠️ {col} → all {missing_count} cells are empty, left unchanged")
            continue
        # Integer NumPy columns cannot hold NaN, so a numeric column that has
        # missing cells is a float column; checking the dtype family instead
        # of the literal name also covers float32/float16.
        if pd.api.types.is_float_dtype(column):
            fill_value = column.mean()
            df[col] = column.fillna(fill_value)
            print(f"{col} → filled {missing_count} empty cells with mean ({fill_value:.2f})")
        else:
            # mode() ignores missing cells and is non-empty here because at
            # least one real value exists; ties resolve to the first entry.
            fill_value = column.mode()[0]
            df[col] = column.fillna(fill_value)
            print(f"   ✅ {col} → filled {missing_count} empty cells with '{fill_value}'")
    return df
def encode_categorical_columns(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> tuple:
    """Label-encode text-like feature columns and, if needed, the target.

    A column is encoded when its dtype is ``object``, a pandas string dtype,
    or ``category``. Values are converted with ``astype(str)`` before being
    passed to ``LabelEncoder``, so each distinct string form gets an integer
    code.

    Args:
        df: Frame holding the features and the target. The caller's frame is
            not modified; the work is done on a copy.
        target_column: Name of the target column.
        feature_columns: Names of the feature columns to inspect.

    Returns:
        ``(encoded_df, encoders)`` where ``encoders`` maps each encoded
        column name to its fitted ``LabelEncoder``.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    def _is_text_like(series: pd.Series) -> bool:
        # ``dtype == "object"`` alone misses ``category`` and the dedicated
        # string dtypes, which would then reach the model unencoded.
        return (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )

    print("\n   Encoding categorical columns...")
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    encoders = {}
    # Encode feature columns
    for col in feature_columns:
        if _is_text_like(df[col]):
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            encoders[col] = le
            print(f"   ✅ {col} → converted to numbers")
    # Encode target column if it is text
    if _is_text_like(df[target_column]):
        le = LabelEncoder()
        df[target_column] = le.fit_transform(
            df[target_column].astype(str)
        )
        encoders[target_column] = le
        print(f"   ✅ Target '{target_column}' → converted to numbers")
    return df, encoders
def scale_numeric_columns(
    df: pd.DataFrame,
    feature_columns: list
) -> tuple:
    """Standardise the numeric feature columns with a ``StandardScaler``.

    A column counts as numeric when it has a NumPy integer, unsigned-integer
    or floating dtype of any width. Boolean and text columns are left alone.

    Args:
        df: Frame holding the feature columns. The caller's frame is not
            modified; the work is done on a copy.
        feature_columns: Names of the feature columns to consider.

    Returns:
        ``(scaled_df, scaler)``. ``scaler`` is the fitted ``StandardScaler``,
        or ``None`` when no feature column is numeric.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    def _is_numeric(dtype) -> bool:
        # Match on the dtype kind rather than the literal names
        # "int64"/"float64", so int32, float32 and unsigned columns are
        # scaled as well.
        return isinstance(dtype, np.dtype) and dtype.kind in "iuf"

    print("\n   Scaling numeric columns...")
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    # Only scale numeric feature columns
    numeric_cols = [
        col for col in feature_columns
        if _is_numeric(df[col].dtype)
    ]
    if numeric_cols:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        print(f"   ✅ Scaled {len(numeric_cols)} numeric columns")
    else:
        scaler = None
        print("   ✅ No numeric columns to scale")
    return df, scaler
def run_cleaner(
    df: pd.DataFrame,
    target_column: str,
    feature_columns: list
) -> dict:
    """Run the four cleaning stages in order and package the results.

    The stages are: duplicate removal, missing-value filling, categorical
    encoding, and numeric scaling. Each stage receives the frame returned by
    the previous one.

    Args:
        df: Raw input frame.
        target_column: Name of the target column.
        feature_columns: Names of the feature columns.

    Returns:
        A dict with keys ``"X"`` (feature frame), ``"y"`` (target series),
        ``"encoders"`` and ``"scaler"`` (as returned by the encoding and
        scaling stages) and ``"cleaned_df"`` (the full cleaned frame).
    """
    print("\n🧹 ModelScout: Starting data cleaning...\n")

    # Stage 1: drop duplicated rows.
    print("Step 1: Removing duplicates...")
    deduplicated = remove_duplicates(df)

    # Stage 2: fill the missing cells.
    print("\nStep 2: Handling missing values...")
    imputed = handle_missing_values(deduplicated)

    # Stage 3: turn text columns into integer codes.
    print("\nStep 3: Encoding categorical columns...")
    encoded, encoders = encode_categorical_columns(
        imputed, target_column, feature_columns
    )

    # Stage 4: standardise the numeric features.
    print("\nStep 4: Scaling numeric columns...")
    cleaned, scaler = scale_numeric_columns(encoded, feature_columns)

    # Split the cleaned frame into the model inputs and the target.
    features = cleaned[feature_columns]
    target = cleaned[target_column]
    row_count, feature_count = features.shape

    print("\n✅ Data cleaning complete!")
    print(f"   Final shape: {row_count} rows, {feature_count} features\n")

    result = {}
    result["X"] = features
    result["y"] = target
    result["encoders"] = encoders
    result["scaler"] = scaler
    result["cleaned_df"] = cleaned
    return result