ml_approach_suggestion_agent 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ml_approach_suggestion_agent might be problematic. Click here for more details.

File without changes
@@ -0,0 +1,127 @@
1
+ import json
2
+ import logging
3
+ from typing import Dict, Any, List, Optional, Tuple
4
+ from datetime import datetime
5
+ from sfn_blueprint import SFNAIHandler, self_correcting_sql, Context
6
+
7
+
8
+ from .config import MethodologyConfig
9
+ from .constants import format_approach_prompt
10
+ from .models import MethodologyRecommendation
11
+
12
+
13
+
14
class MLApproachDecisionAgent:
    """Agent that asks an LLM to choose the ML methodology for a use case.

    The agent renders the domain, use case and column information into a
    system/user prompt pair via ``format_approach_prompt``, sends it through
    ``SFNAIHandler.route_to`` with ``MethodologyRecommendation`` as the
    requested ``text_format``, and optionally stores the outcome in workflow
    storage.
    """

    def __init__(self, config: Optional[MethodologyConfig] = None):
        """Create the agent.

        Args:
            config: Settings for the LLM call. A default
                ``MethodologyConfig()`` is built when omitted.
        """
        self.logger = logging.getLogger(__name__)
        self.config = config or MethodologyConfig()
        self.ai_handler = SFNAIHandler()

    def suggest_approach(self, domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights, max_try=1) -> Tuple[MethodologyRecommendation, Dict[str, Any]]:
        """Ask the configured LLM for the most suitable ML methodology.

        All descriptive arguments are interpolated into the user prompt as-is,
        so any object with a readable string form is accepted.

        Args:
            domain_name: Name of the business domain.
            domain_description: Description of the business domain.
            use_case: The problem that needs to be solved.
            column_descriptions: Descriptions of the dataset columns.
            column_insights: Profiling insights about the dataset columns.
            target_column_name: Name of the target column.
            target_column_insights: Profiling insights about the target column.
            max_try: Maximum number of attempts to make the API call.
                Defaults to 1.

        Returns:
            A ``(recommendation, cost_summary)`` tuple as returned by the AI
            handler. When every attempt fails (or ``max_try`` is not
            positive), the falsy sentinel ``({}, {})`` is returned instead;
            callers should test the first element for truthiness.

        TODO:
            - USER prompt should consider those approaches which will be supported.
        """
        system_prompt, user_prompt = format_approach_prompt(
            domain_name=domain_name,
            domain_description=domain_description,
            use_case=use_case,
            column_descriptions=column_descriptions,
            column_insights=column_insights,
            target_column_name=target_column_name,
            target_column_insights=target_column_insights,
        )
        provider = self.config.methodology_ai_provider

        for attempt in range(1, max_try + 1):
            try:
                # The configuration is rebuilt on every attempt so that a
                # retry never reuses a dict a previous call may have touched.
                # NOTE: config.methodology_temperature is intentionally not
                # forwarded here (it was disabled in the original code).
                response, cost_summary = self.ai_handler.route_to(
                    llm_provider=provider,
                    configuration={
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt},
                        ],
                        "max_tokens": self.config.methodology_max_tokens,
                        "text_format": MethodologyRecommendation,
                    },
                    model=self.config.methodology_ai_model,
                )
            except Exception as e:
                # Best-effort retry: log the failure and try again if
                # attempts remain.
                self.logger.error(
                    "Error while executing API call to %s (attempt %d/%d): %s",
                    provider,
                    attempt,
                    max_try,
                    e,
                )
            else:
                return response, cost_summary

        # Falsy sentinel kept for backward compatibility with callers that
        # check ``if not result``.
        return {}, {}

    def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Run the methodology suggestion for one task payload.

        Args:
            task_data: Must contain ``domain_name``, ``domain_description``,
                ``use_case``, ``column_descriptions``, ``column_insights``,
                ``target_column_name`` and ``target_column_insights``. If
                ``workflow_storage_path`` or ``workflow_id`` is present, the
                result is also saved to workflow storage.

        Returns:
            On success: ``{"success": True, "result": {"approach": ...,
            "cost_summary": ...}, "agent": <class name>}``.
            On failure: ``{"success": False, "error": ..., "agent": ...}``.

        Raises:
            KeyError: If a required key is missing from ``task_data``.
        """
        self.logger.info("Executing ML approach suggestion task.")
        agent_name = self.__class__.__name__

        result, cost_summary = self.suggest_approach(
            domain_name=task_data["domain_name"],
            domain_description=task_data["domain_description"],
            use_case=task_data["use_case"],
            column_descriptions=task_data["column_descriptions"],
            column_insights=task_data["column_insights"],
            target_column_name=task_data["target_column_name"],
            target_column_insights=task_data["target_column_insights"],
        )
        if not result:
            return {
                "success": False,
                "error": "Failed to suggest approach.",
                "agent": agent_name,
            }

        self._save_to_workflow_storage(task_data, result, cost_summary)

        return {
            "success": True,
            "result": {
                "approach": result,
                "cost_summary": cost_summary,
            },
            "agent": agent_name,
        }

    def _save_to_workflow_storage(self, task_data: Dict[str, Any], result, cost_summary) -> None:
        """Persist the recommendation when the task carries workflow info.

        Storage is best-effort: any failure is logged as a warning and never
        propagates to the caller.
        """
        try:
            # Only persist when the caller supplied workflow storage details.
            if 'workflow_storage_path' not in task_data and 'workflow_id' not in task_data:
                return

            from sfn_blueprint import WorkflowStorageManager

            storage_manager = WorkflowStorageManager(
                task_data.get('workflow_storage_path', 'outputs/workflows'),
                task_data.get('workflow_id', 'unknown'),
            )
            # NOTE(review): the blank step_name and the "quality_reports" key
            # look inherited from another agent; they are kept unchanged so
            # the stored layout stays compatible -- confirm before renaming.
            storage_manager.save_agent_result(
                agent_name=self.__class__.__name__,
                step_name=" ",
                data={"quality_reports": result.model_dump(), "cost_summary": cost_summary},
                metadata={"execution_time": datetime.now().isoformat()},
            )
            self.logger.info("Approach recommendation saved to workflow storage.")
        except Exception as e:
            self.logger.warning("Failed to save results to workflow storage: %s", e)

    def __call__(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Make the agent callable; equivalent to ``execute_task(task_data)``."""
        return self.execute_task(task_data)
@@ -0,0 +1,20 @@
1
+ from pydantic import Field
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+
5
+
6
+
7
class MethodologyConfig(BaseSettings):
    # Settings for the methodology-recommendation LLM call.
    #
    # Documented with comments rather than a class docstring so that the
    # pydantic model description stays exactly as it was.

    # Values may come from a UTF-8 `.env` file; names are matched
    # case-insensitively and unknown entries are ignored.
    model_config = SettingsConfigDict(
        extra='ignore',
        case_sensitive=False,
        env_file_encoding='utf-8',
        env_file='.env',
    )

    # Provider and model passed to the AI handler.
    methodology_ai_provider: str = Field(
        default="openai",
        description="AI provider to use",
    )
    methodology_ai_model: str = Field(
        default="gpt-4o-mini",
        description="AI model to use",
    )

    # Sampling temperature, constrained to the range [0.0, 0.5].
    methodology_temperature: float = Field(
        default=0.3,
        ge=0.0,
        le=0.5,
        description="AI model temperature",
    )

    # Response token budget, constrained to the range [100, 8000].
    methodology_max_tokens: int = Field(
        default=4000,
        ge=100,
        le=8000,
        description="Maximum tokens for AI response",
    )
19
+
20
+
@@ -0,0 +1,102 @@
1
+ METHODOLOGY_SELECTION_SYSTEM_PROMPT = """
2
+ You are an ML methodology advisor. Your task is to analyze the user's problem and recommend the single most appropriate approach.
3
+
4
+ **Decision Framework:**
5
+
6
+ 1. **Understand the Business Goal:** Start with the `use_case_description` to grasp what the user is trying to achieve.
7
+
8
+ 2. **Examine the Target Variable:** Use `target_column_insights` to understand the nature of what needs to be predicted:
9
+ - Check `unique_count` to determine if it's binary (2 values), multiclass (>2 values), or continuous
10
+ - Check `data_type` to see if it's numerical (int/float) or categorical (str)
11
+ - Review `sample_values` to understand the actual values
12
+
13
+ 3. **Check for Temporal Dependencies:** Look for these indicators in `column_insights`:
14
+ - Presence of timestamp/date columns with high unique_count (near row_count)
15
+ - Column descriptions mentioning "time", "date", "timestamp", "sequential"
16
+ - Use case description mentioning "over time", "forecast", "predict future", "trend", "sequential patterns"
17
+ - If temporal column exists AND the prediction depends on historical patterns, consider time series methods
18
+
19
+ 4. **Critical Time Series Distinction:**
20
+ - **Time Series Forecasting**: Target is NUMERICAL and goal is predicting FUTURE VALUES (e.g., "predict next month's sales", "forecast temperature")
21
+ - **Time Series Classification**: Target is CATEGORICAL (even if binary 0/1) and data is SEQUENTIAL (e.g., "classify failure from sensor patterns", "detect activity type from accelerometer sequence")
22
+ - **Binary/Multiclass Classification**: Target is categorical BUT data points are INDEPENDENT (no temporal ordering matters)
23
+
24
+ 5. **Assess Data Structure:** Review `column_insights`:
25
+ - If no target specified or target_column_name is "Not specified" → likely `not_applicable`
26
+ - If use case is purely computational/rule-based → `not_applicable`
27
+
28
+ 6. **Select ONE Methodology:**
29
+ - `binary_classification`: Target has exactly 2 unique values (categorical/binary) AND samples are independent (no temporal dependency)
30
+ - `multiclass_classification`: Target has >2 unique categories AND samples are independent
31
+ - `time_series_forecasting`: Target is NUMERICAL AND prediction involves FUTURE time periods based on historical patterns
32
+ - `time_series_classification`: Target is CATEGORICAL (binary or multiclass) AND data has TEMPORAL ORDERING where sequential patterns are critical for prediction
33
+ - `not_applicable`: No clear ML objective, purely rule-based problem, or insufficient information
34
+
35
+ 7. **Key Rules to Avoid Mistakes:**
36
+ - Binary target (0/1) with timestamp does NOT automatically mean binary_classification - check if SEQUENCE matters
37
+ - If use case mentions "based on sensor readings over time", "sequential patterns", "time-series data" → it's time_series_classification
38
+ - If use case is just "predict X" with no temporal context and independent samples → it's binary/multiclass_classification
39
+ - Presence of timestamp column alone doesn't mean time series - check if the PREDICTION depends on temporal patterns
40
+ - If target is numerical but goal is "classify into categories" → still classification, not regression
41
+
42
+ 8. **Justify Your Choice:**
43
+ - State the business goal clearly
44
+ - Identify the target variable type (binary/multiclass/numerical)
45
+ - Explain whether temporal dependencies exist and matter for prediction
46
+ - Connect these factors to show why the chosen methodology fits
47
+
48
+
49
+ """
50
+
51
+
52
+
53
+ METHODOLOGY_SELECTION_USER_PROMPT = """
54
+ **Business Context:**
55
+ Domain: {domain_name}
56
+ {domain_description}
57
+
58
+ **Use Case:**
59
+ {use_case_description}
60
+
61
+ **Data Overview:**
62
+ Columns:
63
+ {column_descriptions}
64
+
65
+ Dataset Characteristics:
66
+ {column_insights}
67
+
68
+ **Target Information:**
69
+ Target Column: {target_column_name}
70
+ Target Details: {target_column_insights}
71
+
72
+ **Required Output:**
73
+ 1. Select the single best ML methodology
74
+ 2. Provide a brief justification explaining why this methodology fits the problem
75
+ """
76
+
77
+
78
+
79
def format_approach_prompt(domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights):
    """Build the system and user prompts for methodology selection.

    The system prompt is returned unchanged; the user prompt template is
    filled with the supplied values using ``str.format``.

    Args:
        domain_name: Name of the business domain.
        domain_description: Description of the business domain.
        use_case: The use case; fills the ``use_case_description`` slot.
        column_descriptions: Descriptions of the dataset columns.
        column_insights: Insights about the dataset columns.
        target_column_name: Name of the target column.
        target_column_insights: Insights about the target column.

    Returns:
        Tuple[str, str]: The system prompt and the formatted user prompt.

    TODO:
        - Change prompt write new prompt and involve the supported approaches in new prompt.
    """
    template_values = {
        "domain_name": domain_name,
        "domain_description": domain_description,
        # The template names this slot differently from the parameter.
        "use_case_description": use_case,
        "column_descriptions": column_descriptions,
        "column_insights": column_insights,
        "target_column_name": target_column_name,
        "target_column_insights": target_column_insights,
    }
    filled_user_prompt = METHODOLOGY_SELECTION_USER_PROMPT.format(**template_values)
    return METHODOLOGY_SELECTION_SYSTEM_PROMPT, filled_user_prompt
@@ -0,0 +1,7 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import Literal
3
+
4
class MethodologyRecommendation(BaseModel):
    # Structured answer expected from the LLM: one methodology label plus the
    # reasoning behind it. Documented with comments (not a class docstring)
    # so the generated JSON schema is identical to the original.

    # Exactly one of the supported methodology labels.
    selected_methodology: Literal[
        "binary_classification",
        "multiclass_classification",
        "time_series_forecasting",
        "time_series_classification",
        "not_applicable",
    ] = Field(
        ...,
        description="The most appropriate ML approach for this problem",
    )

    # Free-text explanation of the choice.
    justification: str = Field(
        ...,
        description="Clear explanation connecting the business goal and target variable to the chosen methodology",
    )
@@ -0,0 +1,225 @@
1
+ Metadata-Version: 2.4
2
+ Name: ml_approach_suggestion_agent
3
+ Version: 0.1.0
4
+ Summary: AI-powered agent that recommends the most appropriate machine learning methodology for a dataset and use case
5
+ License-Expression: MIT
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Programming Language :: Python :: 3.10
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pydantic-settings
15
+ Requires-Dist: sfn-blueprint>=0.6.15
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest; extra == "dev"
18
+ Requires-Dist: pytest-mock; extra == "dev"
19
+
20
+ # ml_approach_suggestion_agent
21
+
22
+ An AI-powered agent that analyzes a dataset and use case to recommend the most appropriate machine learning methodology.
23
+
24
+ ## Description
25
+
26
+ This agent takes a detailed description of a business domain, a specific use case, and information about the dataset—including column descriptions, insights, and target variable details—to suggest the best ML approach. It uses a large language model to:
27
+
28
+ 1. **Analyze** the relationship between the use case and the target variable.
29
+ 2. **Evaluate** the characteristics of the data (especially the target column).
30
+ 3. **Recommend** the most suitable methodology from a predefined list: `binary_classification`, `multiclass_classification`, `time_series_forecasting`, `time_series_classification`, or `not_applicable`.
31
+ 4. **Provide** a clear justification for its recommendation.
32
+
33
+ This helps data scientists and analysts quickly and confidently choose the right path for their modeling efforts, saving time and reducing the risk of starting with an incorrect approach.
34
+
35
+ ## Key Features
36
+
37
+ - **Intelligent Use Case Analysis**: Leverages an LLM to understand the core objective of the business problem.
38
+ - **Target-Aware Recommendation**: Places special emphasis on the nature of the target variable to guide its decision.
39
+ - **Context-Driven Suggestions**: Considers the entire data context, including domain and column descriptions, to make an informed choice.
40
+ - **Accelerates Model Planning**: Provides a validated starting point for ML projects, ensuring alignment between the problem and the proposed solution.
41
+
42
+ ## Installation
43
+
44
+ ### Prerequisites
45
+
46
+ - [**uv**](https://docs.astral.sh/uv/getting-started/installation/) – A fast Python package and environment manager.
47
+ - For a quick setup on macOS/Linux, you can use:
48
+ ```bash
49
+ curl -LsSf https://astral.sh/uv/install.sh | sh
50
+ ```
51
+ - [**Git**](https://git-scm.com/)
52
+
53
+ ### Steps
54
+
55
+ 1. **Clone the `ml_approach_suggestion_agent` repository:**
56
+ ```bash
57
+ git clone https://github.com/stepfnAI/ml_approach_suggestion_agent.git
58
+ cd ml_approach_suggestion_agent
59
+ git switch dev
60
+ ```
61
+
62
+ 2. **Create a virtual environment and install dependencies:**
63
+ This command creates a `.venv` folder in the current directory and installs all required packages.
64
+ ```bash
65
+ uv sync --extra dev
66
+ source .venv/bin/activate
67
+ ```
68
+
69
+ 3. **Clone and install the `sfn_blueprint` dependency:**
70
+ The agent requires the `sfn_blueprint` library. The following commands clone it into a sibling directory and install it in editable mode.
71
+ ```bash
72
+ cd ../
73
+ git clone https://github.com/stepfnAI/sfn_blueprint.git
74
+ cd sfn_blueprint
75
+ git switch dev
76
+ uv pip install -e .
77
+ cd ../ml_approach_suggestion_agent
78
+ ```
79
+
80
+ ## Configuration
81
+
82
+ You can configure the agent by creating a `.env` file in the project root or by exporting environment variables in your shell. Settings loaded via `export` will override those in a `.env` file.
83
+
84
+ ### Available Settings
85
+
86
+ | Environment Variable | Description | Default |
87
+ | ------------------------------- | -------------------------------------------- | -------- |
88
+ | `OPENAI_API_KEY` | **(Required)** Your OpenAI API key. | *None* |
89
+ | `METHODOLOGY_AI_PROVIDER` | AI provider for methodology suggestions. | `openai` |
90
+ | `METHODOLOGY_AI_MODEL` | AI model for methodology suggestions. | `gpt-4o-mini` |
91
+ | `METHODOLOGY_TEMPERATURE` | AI model temperature (e.g., `0.0` to `0.5`). | `0.3` |
92
+ | `METHODOLOGY_MAX_TOKENS` | Maximum tokens for the AI response. | `4000` |
93
+
94
+ ---
95
+
96
+ ### Method 1: Using a `.env` File (Recommended)
97
+
98
+ Create a `.env` file in the root directory to store API keys and project-wide defaults.
99
+
100
+ #### Example `.env` file:
101
+
102
+ ```dotenv
103
+ # .env
104
+
105
+ # --- Required Settings ---
106
+ OPENAI_API_KEY="sk-your-api-key-here"
107
+
108
+ # --- Optional Overrides ---
109
+ # Use a different model
110
+ METHODOLOGY_AI_MODEL="gpt-4o-mini"
111
+
112
+ # Use a lower temperature for more deterministic responses
113
+ METHODOLOGY_TEMPERATURE=0.1
114
+ ```
115
+
116
+ ---
117
+
118
+ ### Method 2: Using `export` Commands
119
+
120
+ Use `export` in your terminal for temporary settings or in CI/CD environments.
121
+
122
+ #### Example `export` commands:
123
+
124
+ ```bash
125
+ # Set the environment variables for the current terminal session
126
+ export OPENAI_API_KEY="sk-your-api-key-here"
127
+ export METHODOLOGY_AI_MODEL="gpt-4o-mini"
128
+ ```
129
+
130
+ ## Testing
131
+
132
+ To run the test suite, use the following command from the root of the project directory:
133
+
134
+ ```bash
135
+ pytest -s
136
+ ```
137
+
138
+ ## Usage
139
+
140
+ ### Running the Example Script
141
+
142
+ To see a quick demonstration, run the provided example script. This will execute the agent with pre-defined data and print the recommended methodology.
143
+
144
+ ```bash
145
+ python examples/basic_usage.py
146
+ ```
147
+
148
+ ### Using as a Library
149
+
150
+ Integrate the `MLApproachDecisionAgent` directly into your Python applications to get methodology recommendations programmatically.
151
+
152
+ ```python
153
+ import logging
154
+ from ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
155
+
156
+ # Configure logging
157
+ logging.basicConfig(level=logging.INFO)
158
+
159
+ # 1. Define the domain, use case, and data context
160
+ domain_name = "Mortgage Loan Servicing"
161
+ domain_description = "Managing mortgage loans from post-origination to payoff, including payment collection, escrow management, and compliance for domestic and international loans."
162
+ use_case = "To predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days using their demographic and financial data to enable proactive intervention."
163
+
164
+ column_descriptions = {
165
+ "CreditScore": "Borrower's credit score from credit bureau sources",
166
+ "EmploymentStatus": "Current employment status (e.g., employed, self-employed, unemployed)",
167
+ # ... other column descriptions
168
+ }
169
+
170
+ column_insights = {
171
+ "table_info": { "row_count": 50000 },
172
+ "table_columns_info": {
173
+ "CreditScore": { "data_type": "Int64", "min_max_value": [350, 850] },
174
+ "EmploymentStatus": { "data_type": "string", "distinct_count": 5 },
175
+ # ... other column insights
176
+ }
177
+ }
178
+
179
+ target_column_name = "IsDelinquent"
180
+ target_column_insights = {
181
+ "Target Column Description": "A binary categorical flag indicating if the borrower has missed one or more mortgage payments in the last 60 days.",
182
+ "Data Type": "Integer (or Boolean)",
183
+ "Value Distribution": {
184
+ "0 (Not Delinquent)": "92%",
185
+ "1 (Delinquent)": "8%"
186
+ }
187
+ }
188
+
189
+ # 2. Prepare the task data payload
190
+ task_data = {
191
+ "domain_name": domain_name,
192
+ "domain_description": domain_description,
193
+ "use_case": use_case,
194
+ "column_descriptions": column_descriptions,
195
+ "column_insights": column_insights,
196
+ "target_column_name": target_column_name,
197
+ "target_column_insights": target_column_insights
198
+ }
199
+
200
+ # 3. Initialize and execute the agent
201
+ agent = MLApproachDecisionAgent()
202
+ result = agent(task_data)
203
+
204
+ # 4. Print the suggested methodology
205
+ if result["success"]:
206
+ print("Successfully suggested an approach:")
207
+ print(result["result"]["approach"].model_dump_json(indent=4))
208
+ print(f"Cost summary: {result['result']['cost_summary']}")
209
+ else:
210
+ print("Failed to suggest an approach.")
211
+
212
+ ```
213
+
214
+ ### Example Output
215
+
216
+ The agent returns a JSON object containing the recommended methodology and a detailed explanation for the choice.
217
+
218
+ *(Note: The actual output may vary slightly based on the LLM's response.)*
219
+
220
+ ```json
221
+ {
222
+ "selected_methodology": "binary_classification",
223
+ "justification": "The goal is to predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days. This is a binary outcome (delinquent or not delinquent), making binary classification the appropriate methodology. The target variable is categorical, and the available demographic and financial data can be used as features to train a classification model."
224
+ }
225
+ ```
@@ -0,0 +1,9 @@
1
+ ml_approach_suggestion_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ ml_approach_suggestion_agent/agent.py,sha256=4FgDbiFXY1BRQN3hToif2Y-uaceHPdPrwyG3GEVRWMU,5569
3
+ ml_approach_suggestion_agent/config.py,sha256=YOAXgdzgY72yB0mNEt9J2w1bdavx-VSCANBJ_4YR4Bo,704
4
+ ml_approach_suggestion_agent/constants.py,sha256=znbt7x6xAxJL4eAZXcr8njerD84c8QuKK2ZlsFhsloo,5011
5
+ ml_approach_suggestion_agent/models.py,sha256=i4vxGvA9vB3JOAh8IySMfcyAEQ7CWQe9ArgG7Rqo_cU,501
6
+ ml_approach_suggestion_agent-0.1.0.dist-info/METADATA,sha256=qha4KlPfYbnoPlkCSJM_1L0COCFeh0CxBC_sr2WLYU0,8451
7
+ ml_approach_suggestion_agent-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ ml_approach_suggestion_agent-0.1.0.dist-info/top_level.txt,sha256=3-KHls6umFXtNFJoP7OFCLvb4zd12AWH71PVKNd5Aok,29
9
+ ml_approach_suggestion_agent-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ ml_approach_suggestion_agent