PyPI - ml_approach_suggestion_agent - Versions diffs - 0.1.0__py3-none-any.whl - Mend

ml_approach_suggestion_agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ml_approach_suggestion_agent might be problematic. Click here for more details.

Files changed (9) hide show

ml_approach_suggestion_agent/__init__.py +0 -0
ml_approach_suggestion_agent/agent.py +127 -0
ml_approach_suggestion_agent/config.py +20 -0
ml_approach_suggestion_agent/constants.py +102 -0
ml_approach_suggestion_agent/models.py +7 -0
ml_approach_suggestion_agent-0.1.0.dist-info/METADATA +225 -0
ml_approach_suggestion_agent-0.1.0.dist-info/RECORD +9 -0
ml_approach_suggestion_agent-0.1.0.dist-info/WHEEL +5 -0
ml_approach_suggestion_agent-0.1.0.dist-info/top_level.txt +1 -0

ml_approach_suggestion_agent/__init__.py ADDED Viewed

File without changes

ml_approach_suggestion_agent/agent.py ADDED Viewed

@@ -0,0 +1,127 @@
+import json
+import logging
+from typing import Dict, Any, List, Optional, Tuple
+from datetime import datetime
+from sfn_blueprint import SFNAIHandler, self_correcting_sql, Context
+from .config import MethodologyConfig
+from .constants import format_approach_prompt
+from .models import MethodologyRecommendation
+class MLApproachDecisionAgent:
+    """
+    LLM-backed agent that recommends an ML methodology for a dataset and use case.
+    Prompts are built by ``format_approach_prompt`` and the reply is requested in the
+    ``MethodologyRecommendation`` format through ``SFNAIHandler.route_to``.
+    """
+    def __init__(self, config: Optional[MethodologyConfig] = None):
+        """
+        Args:
+            config (Optional[MethodologyConfig]): Provider, model and token settings.
+                A default ``MethodologyConfig()`` is created when omitted.
+        """
+        self.logger = logging.getLogger(__name__)
+        self.config = config or MethodologyConfig()
+        self.ai_handler = SFNAIHandler()
+    def suggest_approach(self, domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights, max_try=1) -> Tuple[MethodologyRecommendation, Dict[str, Any]]:
+        """
+        Suggests a machine learning approach based on the provided domain, use case, and column information.
+        Args:
+            domain_name (str): The name of the domain.
+            domain_description (str): The description of the domain.
+            use_case (str): The problem that needs to be solved.
+            column_descriptions: Column descriptions; formatted into the user prompt.
+            column_insights: Column insights; formatted into the user prompt.
+            target_column_name (str): The name of the target column.
+            target_column_insights: Insights about the target column; formatted into the user prompt.
+            max_try (int, optional): The maximum number of attempts to make the API call. Defaults to 1.
+        Returns:
+            Tuple[MethodologyRecommendation, Dict[str, Any]]: The handler's response and its cost summary.
+            When every attempt fails (or ``max_try`` is less than 1) the falsy pair ``({}, {})`` is
+            returned instead, so callers must check the first element before using it.
+        TODO:
+            - USER prompt should consider those approaches which will be supported.
+        """
+        system_prompt, user_prompt = format_approach_prompt(
+            domain_name=domain_name,
+            domain_description=domain_description,
+            use_case=use_case,
+            column_descriptions=column_descriptions,
+            column_insights=column_insights,
+            target_column_name=target_column_name,
+            target_column_insights=target_column_insights,
+        )
+        for attempt in range(1, max_try + 1):
+            try:
+                response, cost_summary = self.ai_handler.route_to(
+                    llm_provider=self.config.methodology_ai_provider,
+                    configuration={
+                        "messages": [
+                            {"role": "system", "content": system_prompt},
+                            {"role": "user", "content": user_prompt}
+                        ],
+                        "max_tokens": self.config.methodology_max_tokens,
+                        # NOTE: methodology_temperature is configured but not forwarded; the entry is left disabled.
+                        # "temperature": self.config.methodology_temperature,
+                        "text_format": MethodologyRecommendation
+                    },
+                    model=self.config.methodology_ai_model
+                )
+                return response, cost_summary
+            except Exception as e:
+                # Best-effort retry: record the failure and fall through to the next attempt, if any.
+                self.logger.error(
+                    "Error while executing API call to %s (attempt %d/%d): %s",
+                    self.config.methodology_ai_provider, attempt, max_try, e,
+                )
+        # Reached only when no attempt returned; make the exhausted-retries case visible in the log.
+        self.logger.error("No approach suggestion obtained after %d attempt(s).", max_try)
+        return {}, {}
+    def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Runs the approach suggestion for one task payload and optionally persists the result.
+        Args:
+            task_data (Dict[str, Any]): Must contain "domain_name", "domain_description", "use_case",
+                "column_descriptions", "column_insights", "target_column_name" and
+                "target_column_insights". If "workflow_storage_path" or "workflow_id" is present,
+                the result is also saved through ``WorkflowStorageManager``.
+        Returns:
+            Dict[str, Any]: On success, "success" is True and "result" holds "approach" and
+            "cost_summary". When no approach could be obtained, "success" is False and "error"
+            holds a message. "agent" is the class name in both cases.
+        Raises:
+            KeyError: If one of the required keys is missing from ``task_data``.
+        """
+        self.logger.info("Executing ML approach suggestion task.")
+        required_keys = (
+            "domain_name",
+            "domain_description",
+            "use_case",
+            "column_descriptions",
+            "column_insights",
+            "target_column_name",
+            "target_column_insights",
+        )
+        # Plain indexing on purpose: a missing required key raises KeyError, as before.
+        inputs = {key: task_data[key] for key in required_keys}
+        # Suggest an approach
+        result, cost_summary = self.suggest_approach(**inputs)
+        if not result:
+            return {
+                "success": False,
+                "error": "Failed to suggest approach.",
+                "agent": self.__class__.__name__
+            }
+        try:
+            # Check if we have workflow storage information
+            if 'workflow_storage_path' in task_data or 'workflow_id' in task_data:
+                from sfn_blueprint import WorkflowStorageManager
+                # Determine workflow storage path
+                workflow_storage_path = task_data.get('workflow_storage_path', 'outputs/workflows')
+                workflow_id = task_data.get('workflow_id', 'unknown')
+                # Initialize storage manager
+                storage_manager = WorkflowStorageManager(workflow_storage_path, workflow_id)
+                # NOTE(review): the blank step name and the "quality_reports" key look copied from another
+                # agent; they are kept unchanged because they define the persisted layout - confirm before renaming.
+                storage_manager.save_agent_result(
+                    agent_name=self.__class__.__name__,
+                    step_name=" ",
+                    data={"quality_reports": result.model_dump(), "cost_summary": cost_summary},
+                    metadata={"execution_time": datetime.now().isoformat()}
+                )
+                self.logger.info("Approach suggestion saved to workflow storage.")
+        except Exception as e:
+            # Persistence is best-effort: a storage failure must not discard a valid suggestion.
+            self.logger.warning("Failed to save results to workflow storage: %s", e)
+        return {
+            "success": True,
+            "result": {
+                "approach": result,
+                "cost_summary": cost_summary
+            },
+            "agent": self.__class__.__name__
+        }
+    def __call__(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Makes the agent callable; equivalent to ``execute_task(task_data)``."""
+        return self.execute_task(task_data)

ml_approach_suggestion_agent/config.py ADDED Viewed

@@ -0,0 +1,20 @@
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class MethodologyConfig(BaseSettings):
+    # Settings for the methodology suggestion LLM call.
+    # model_config: values are read from a UTF-8 `.env` file, names are matched
+    # case-insensitively, and unknown entries are ignored.
+    model_config = SettingsConfigDict(
+        env_file='.env',
+        env_file_encoding='utf-8',
+        case_sensitive=False,
+        extra='ignore'
+    )
+    # Provider name passed to the AI handler as `llm_provider` in agent.py.
+    methodology_ai_provider: str = Field(default="openai", description="AI provider to use")
+    # Model name passed to the AI handler as `model` in agent.py.
+    methodology_ai_model: str = Field(default="gpt-4o-mini", description="AI model to use")
+    # Validated to the range 0.0-0.5.
+    # NOTE(review): agent.py keeps its "temperature" entry commented out, so this value is
+    # currently not sent with the request - confirm whether that is intended.
+    methodology_temperature: float = Field(default=0.3, ge=0.0, le=0.5, description="AI model temperature")
+    # Validated to the range 100-8000; sent as "max_tokens" in the request configuration.
+    methodology_max_tokens: int = Field(default=4000, ge=100, le=8000, description="Maximum tokens for AI response")

ml_approach_suggestion_agent/constants.py ADDED Viewed

@@ -0,0 +1,102 @@
+# System prompt for the methodology selection call. It lays out a decision framework and
+# restricts the answer to the five methodology labels listed in step 6. The text is sent
+# to the model verbatim, so any edit changes model behavior.
+METHODOLOGY_SELECTION_SYSTEM_PROMPT = """
+You are an ML methodology advisor. Your task is to analyze the user's problem and recommend the single most appropriate approach.
+**Decision Framework:**
+1. **Understand the Business Goal:** Start with the `use_case_description` to grasp what the user is trying to achieve.
+2. **Examine the Target Variable:** Use `target_column_insights` to understand the nature of what needs to be predicted:
+   - Check `unique_count` to determine if it's binary (2 values), multiclass (>2 values), or continuous
+   - Check `data_type` to see if it's numerical (int/float) or categorical (str)
+   - Review `sample_values` to understand the actual values
+3. **Check for Temporal Dependencies:** Look for these indicators in `column_insights`:
+   - Presence of timestamp/date columns with high unique_count (near row_count)
+   - Column descriptions mentioning "time", "date", "timestamp", "sequential"
+   - Use case description mentioning "over time", "forecast", "predict future", "trend", "sequential patterns"
+   - If temporal column exists AND the prediction depends on historical patterns, consider time series methods
+4. **Critical Time Series Distinction:**
+   - **Time Series Forecasting**: Target is NUMERICAL and goal is predicting FUTURE VALUES (e.g., "predict next month's sales", "forecast temperature")
+   - **Time Series Classification**: Target is CATEGORICAL (even if binary 0/1) and data is SEQUENTIAL (e.g., "classify failure from sensor patterns", "detect activity type from accelerometer sequence")
+   - **Binary/Multiclass Classification**: Target is categorical BUT data points are INDEPENDENT (no temporal ordering matters)
+5. **Assess Data Structure:** Review `column_insights`:
+   - If no target specified or target_column_name is "Not specified" → likely `not_applicable`
+   - If use case is purely computational/rule-based → `not_applicable`
+6. **Select ONE Methodology:**
+   - `binary_classification`: Target has exactly 2 unique values (categorical/binary) AND samples are independent (no temporal dependency)
+   - `multiclass_classification`: Target has >2 unique categories AND samples are independent
+   - `time_series_forecasting`: Target is NUMERICAL AND prediction involves FUTURE time periods based on historical patterns
+   - `time_series_classification`: Target is CATEGORICAL (binary or multiclass) AND data has TEMPORAL ORDERING where sequential patterns are critical for prediction
+   - `not_applicable`: No clear ML objective, purely rule-based problem, or insufficient information
+7. **Key Rules to Avoid Mistakes:**
+   - Binary target (0/1) with timestamp does NOT automatically mean binary_classification - check if SEQUENCE matters
+   - If use case mentions "based on sensor readings over time", "sequential patterns", "time-series data" → it's time_series_classification
+   - If use case is just "predict X" with no temporal context and independent samples → it's binary/multiclass_classification
+   - Presence of timestamp column alone doesn't mean time series - check if the PREDICTION depends on temporal patterns
+   - If target is numerical but goal is "classify into categories" → still classification, not regression
+8. **Justify Your Choice:**
+   - State the business goal clearly
+   - Identify the target variable type (binary/multiclass/numerical)
+   - Explain whether temporal dependencies exist and matter for prediction
+   - Connect these factors to show why the chosen methodology fits
+"""
+# User prompt template. The {placeholders} are filled with str.format by
+# format_approach_prompt; note that the use case is supplied under the
+# placeholder name `use_case_description`.
+METHODOLOGY_SELECTION_USER_PROMPT = """
+**Business Context:**
+Domain: {domain_name}
+{domain_description}
+**Use Case:**
+{use_case_description}
+**Data Overview:**
+Columns:
+{column_descriptions}
+Dataset Characteristics:
+{column_insights}
+**Target Information:**
+Target Column: {target_column_name}
+Target Details: {target_column_insights}
+**Required Output:**
+1. Select the single best ML methodology
+2. Provide a brief justification explaining why this methodology fits the problem
+"""
+def format_approach_prompt(domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights):
+    """
+    Builds the system and user prompts for the methodology selection request.
+    Args:
+        domain_name (str): The name of the domain.
+        domain_description (str): The description of the domain.
+        use_case (str): The use case; inserted into the template's `use_case_description` slot.
+        column_descriptions: Column descriptions, inserted using their string form.
+        column_insights: Column insights, inserted using their string form.
+        target_column_name (str): The name of the target column.
+        target_column_insights: Insights about the target column, inserted using their string form.
+    Returns:
+        Tuple[str, str]: The system prompt (returned unchanged) and the formatted user prompt.
+    TODO:
+        - Change prompt write new prompt and involve the supported approaches in new prompt.
+    """
+    # Map every template placeholder to the argument that fills it.
+    placeholder_values = {
+        "domain_name": domain_name,
+        "domain_description": domain_description,
+        "use_case_description": use_case,
+        "column_descriptions": column_descriptions,
+        "column_insights": column_insights,
+        "target_column_name": target_column_name,
+        "target_column_insights": target_column_insights,
+    }
+    filled_user_prompt = METHODOLOGY_SELECTION_USER_PROMPT.format(**placeholder_values)
+    return METHODOLOGY_SELECTION_SYSTEM_PROMPT, filled_user_prompt

ml_approach_suggestion_agent/models.py ADDED Viewed

@@ -0,0 +1,7 @@
+from pydantic import BaseModel, Field
+from typing import Literal
+class MethodologyRecommendation(BaseModel):
+    # Structured reply format for the methodology selection call; agent.py passes this
+    # class as "text_format" in the request configuration.
+    # One of five fixed labels - the same labels enumerated in step 6 of the system prompt
+    # in constants.py.
+    selected_methodology: Literal["binary_classification", "multiclass_classification", "time_series_forecasting", "time_series_classification", "not_applicable" ] = Field(..., description="The most appropriate ML approach for this problem")
+    # Free-text rationale for the selected label.
+    justification: str = Field(..., description="Clear explanation connecting the business goal and target variable to the chosen methodology")

ml_approach_suggestion_agent-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,225 @@
+Metadata-Version: 2.4
+Name: ml_approach_suggestion_agent
+Version: 0.1.0
+Summary: AI-powered agent that recommends the most appropriate machine learning methodology for a dataset and use case
+License-Expression: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: pydantic-settings
+Requires-Dist: sfn-blueprint>=0.6.15
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-mock; extra == "dev"
+# ml_approach_suggestion_agent
+An AI-powered agent that analyzes a dataset and use case to recommend the most appropriate machine learning methodology.
+## Description
+This agent takes a detailed description of a business domain, a specific use case, and information about the dataset—including column descriptions, insights, and target variable details—to suggest the best ML approach. It uses a large language model to:
+1.  **Analyze** the relationship between the use case and the target variable.
+2.  **Evaluate** the characteristics of the data (especially the target column).
+3.  **Recommend** the most suitable methodology from a predefined list: `binary_classification`, `multiclass_classification`, `time_series_forecasting`, `time_series_classification`, or `not_applicable`.
+4.  **Provide** a clear justification for its recommendation.
+This helps data scientists and analysts quickly and confidently choose the right path for their modeling efforts, saving time and reducing the risk of starting with an incorrect approach.
+## Key Features
+-   **Intelligent Use Case Analysis**: Leverages an LLM to understand the core objective of the business problem.
+-   **Target-Aware Recommendation**: Places special emphasis on the nature of the target variable to guide its decision.
+-   **Context-Driven Suggestions**: Considers the entire data context, including domain and column descriptions, to make an informed choice.
+-   **Accelerates Model Planning**: Provides a validated starting point for ML projects, ensuring alignment between the problem and the proposed solution.
+## Installation
+### Prerequisites
+-   [**uv**](https://docs.astral.sh/uv/getting-started/installation/) – A fast Python package and environment manager.
+    -   For a quick setup on macOS/Linux, you can use:
+        ```bash
+        curl -LsSf https://astral.sh/uv/install.sh | sh
+        ```
+-   [**Git**](https://git-scm.com/)
+### Steps
+1.  **Clone the `ml_approach_suggestion_agent` repository:**
+    ```bash
+    git clone https://github.com/stepfnAI/ml_approach_suggestion_agent.git
+    cd ml_approach_suggestion_agent
+    git switch dev
+    ```
+2.  **Create a virtual environment and install dependencies:**
+    This command creates a `.venv` folder in the current directory and installs all required packages.
+    ```bash
+    uv sync --extra dev
+    source .venv/bin/activate
+    ```
+3.  **Clone and install the `sfn_blueprint` dependency:**
+    The agent requires the `sfn_blueprint` library. The following commands clone it into a sibling directory and install it in editable mode.
+    ```bash
+    cd ../
+    git clone https://github.com/stepfnAI/sfn_blueprint.git
+    cd sfn_blueprint
+    git switch dev
+    uv pip install -e .
+    cd ../ml_approach_suggestion_agent
+    ```
+## Configuration
+You can configure the agent by creating a `.env` file in the project root or by exporting environment variables in your shell. Settings loaded via `export` will override those in a `.env` file.
+### Available Settings
+| Environment Variable            | Description                                  | Default  |
+| ------------------------------- | -------------------------------------------- | -------- |
+| `OPENAI_API_KEY`                | **(Required)** Your OpenAI API key.          | *None*   |
+| `METHODOLOGY_AI_PROVIDER`       | AI provider for methodology suggestions.     | `openai` |
+| `METHODOLOGY_AI_MODEL`          | AI model for methodology suggestions.        | `gpt-4o-mini` |
+| `METHODOLOGY_TEMPERATURE`       | AI model temperature (e.g., `0.0` to `0.5`). | `0.3`    |
+| `METHODOLOGY_MAX_TOKENS`        | Maximum tokens for the AI response.          | `4000`   |
+---
+### Method 1: Using a `.env` File (Recommended)
+Create a `.env` file in the root directory to store API keys and project-wide defaults.
+#### Example `.env` file:
+```dotenv
+# .env
+# --- Required Settings ---
+OPENAI_API_KEY="sk-your-api-key-here"
+# --- Optional Overrides ---
+# Use a different model
+METHODOLOGY_AI_MODEL="gpt-4o-mini"
+# Use a lower temperature for more deterministic responses
+METHODOLOGY_TEMPERATURE=0.1
+```
+---
+### Method 2: Using `export` Commands
+Use `export` in your terminal for temporary settings or in CI/CD environments.
+#### Example `export` commands:
+```bash
+# Set the environment variables for the current terminal session
+export OPENAI_API_KEY="sk-your-api-key-here"
+export METHODOLOGY_AI_MODEL="gpt-4o-mini"
+```
+## Testing
+To run the test suite, use the following command from the root of the project directory:
+```bash
+pytest -s
+```
+## Usage
+### Running the Example Script
+To see a quick demonstration, run the provided example script. This will execute the agent with pre-defined data and print the recommended methodology.
+```bash
+python examples/basic_usage.py
+```
+### Using as a Library
+Integrate the `MLApproachDecisionAgent` directly into your Python applications to get methodology recommendations programmatically.
+```python
+import logging
+from ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+# 1. Define the domain, use case, and data context
+domain_name = "Mortgage Loan Servicing"
+domain_description = "Managing mortgage loans from post-origination to payoff, including payment collection, escrow management, and compliance for domestic and international loans."
+use_case = "To predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days using their demographic and financial data to enable proactive intervention."
+column_descriptions = {
+    "CreditScore": "Borrower's credit score from credit bureau sources",
+    "EmploymentStatus": "Current employment status (e.g., employed, self-employed, unemployed)",
+    # ... other column descriptions
+}
+column_insights = {
+  "table_info": { "row_count": 50000 },
+  "table_columns_info": {
+    "CreditScore": { "data_type": "Int64", "min_max_value": [350, 850] },
+    "EmploymentStatus": { "data_type": "string", "distinct_count": 5 },
+    # ... other column insights
+  }
+}
+target_column_name = "IsDelinquent"
+target_column_insights = {
+    "Target Column Description": "A binary categorical flag indicating if the borrower has missed one or more mortgage payments in the last 60 days.",
+    "Data Type": "Integer (or Boolean)",
+    "Value Distribution": {
+      "0 (Not Delinquent)": "92%",
+      "1 (Delinquent)": "8%"
+    }
+}
+# 2. Prepare the task data payload
+task_data = {
+    "domain_name": domain_name,
+    "domain_description": domain_description,
+    "use_case": use_case,
+    "column_descriptions": column_descriptions,
+    "column_insights": column_insights,
+    "target_column_name": target_column_name,
+    "target_column_insights": target_column_insights
+}
+# 3. Initialize and execute the agent
+agent = MLApproachDecisionAgent()
+result = agent(task_data)
+# 4. Print the suggested methodology
+if result["success"]:
+    print("Successfully suggested an approach:")
+    print(result["result"]["approach"].model_dump_json(indent=4))
+    print(f"Cost summary: {result['result']['cost_summary']}")
+else:
+    print("Failed to suggest an approach.")
+```
+### Example Output
+The agent returns a JSON object containing the recommended methodology and a detailed explanation for the choice.
+*(Note: The actual output may vary slightly based on the LLM's response.)*
+```json
+{
+    "selected_methodology": "binary_classification",
+    "justification": "The goal is to predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days. This is a binary outcome (delinquent or not delinquent), making binary classification the appropriate methodology. The target variable is categorical, and the available demographic and financial data can be used as features to train a classification model."
+}
+```

ml_approach_suggestion_agent-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+ml_approach_suggestion_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_approach_suggestion_agent/agent.py,sha256=4FgDbiFXY1BRQN3hToif2Y-uaceHPdPrwyG3GEVRWMU,5569
+ml_approach_suggestion_agent/config.py,sha256=YOAXgdzgY72yB0mNEt9J2w1bdavx-VSCANBJ_4YR4Bo,704
+ml_approach_suggestion_agent/constants.py,sha256=znbt7x6xAxJL4eAZXcr8njerD84c8QuKK2ZlsFhsloo,5011
+ml_approach_suggestion_agent/models.py,sha256=i4vxGvA9vB3JOAh8IySMfcyAEQ7CWQe9ArgG7Rqo_cU,501
+ml_approach_suggestion_agent-0.1.0.dist-info/METADATA,sha256=qha4KlPfYbnoPlkCSJM_1L0COCFeh0CxBC_sr2WLYU0,8451
+ml_approach_suggestion_agent-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ml_approach_suggestion_agent-0.1.0.dist-info/top_level.txt,sha256=3-KHls6umFXtNFJoP7OFCLvb4zd12AWH71PVKNd5Aok,29
+ml_approach_suggestion_agent-0.1.0.dist-info/RECORD,,

ml_approach_suggestion_agent-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

ml_approach_suggestion_agent-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ ml_approach_suggestion_agent