ml_approach_suggestion_agent 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_approach_suggestion_agent-0.1.1/PKG-INFO +225 -0
- ml_approach_suggestion_agent-0.1.1/README.md +206 -0
- ml_approach_suggestion_agent-0.1.1/pyproject.toml +41 -0
- ml_approach_suggestion_agent-0.1.1/setup.cfg +4 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent/__init__.py +0 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent/agent.py +123 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent/config.py +20 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent/constants.py +88 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent/models.py +7 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/PKG-INFO +225 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/SOURCES.txt +13 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/dependency_links.txt +1 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/requires.txt +6 -0
- ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/top_level.txt +1 -0
- ml_approach_suggestion_agent-0.1.1/tests/test_agent.py +151 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ml_approach_suggestion_agent
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: AI-powered agent that recommends the most appropriate machine learning methodology for a dataset and use case
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: pydantic-settings
|
|
15
|
+
Requires-Dist: sfn-blueprint>=0.6.15
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-mock; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# ml_approach_suggestion_agent
|
|
21
|
+
|
|
22
|
+
An AI-powered agent that analyzes a dataset and use case to recommend the most appropriate machine learning methodology.
|
|
23
|
+
|
|
24
|
+
## Description
|
|
25
|
+
|
|
26
|
+
This agent takes a detailed description of a business domain, a specific use case, and information about the dataset—including column descriptions, insights, and target variable details—to suggest the best ML approach. It uses a large language model to:
|
|
27
|
+
|
|
28
|
+
1. **Analyze** the relationship between the use case and the target variable.
|
|
29
|
+
2. **Evaluate** the characteristics of the data (especially the target column).
|
|
30
|
+
3. **Recommend** the most suitable methodology from a predefined list: `binary_classification`, `time_series_forecasting`, or `not_applicable`.
|
|
31
|
+
4. **Provide** a clear justification for its recommendation.
|
|
32
|
+
|
|
33
|
+
This helps data scientists and analysts quickly and confidently choose the right path for their modeling efforts, saving time and reducing the risk of starting with an incorrect approach.
|
|
34
|
+
|
|
35
|
+
## Key Features
|
|
36
|
+
|
|
37
|
+
- **Intelligent Use Case Analysis**: Leverages an LLM to understand the core objective of the business problem.
|
|
38
|
+
- **Target-Aware Recommendation**: Places special emphasis on the nature of the target variable to guide its decision.
|
|
39
|
+
- **Context-Driven Suggestions**: Considers the entire data context, including domain and column descriptions, to make an informed choice.
|
|
40
|
+
- **Accelerates Model Planning**: Provides a validated starting point for ML projects, ensuring alignment between the problem and the proposed solution.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
### Prerequisites
|
|
45
|
+
|
|
46
|
+
- [**uv**](https://docs.astral.sh/uv/getting-started/installation/) – A fast Python package and environment manager.
|
|
47
|
+
- For a quick setup on macOS/Linux, you can use:
|
|
48
|
+
```bash
|
|
49
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
50
|
+
```
|
|
51
|
+
- [**Git**](https://git-scm.com/)
|
|
52
|
+
|
|
53
|
+
### Steps
|
|
54
|
+
|
|
55
|
+
1. **Clone the `ml_approach_suggestion_agent` repository:**
|
|
56
|
+
```bash
|
|
57
|
+
git clone https://github.com/stepfnAI/ml_approach_suggestion_agent.git
|
|
58
|
+
cd ml_approach_suggestion_agent
|
|
59
|
+
git switch dev
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
2. **Create a virtual environment and install dependencies:**
|
|
63
|
+
This command creates a `.venv` folder in the current directory and installs all required packages.
|
|
64
|
+
```bash
|
|
65
|
+
uv sync --extra dev
|
|
66
|
+
source .venv/bin/activate
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
3. **Clone and install the `sfn_blueprint` dependency:**
|
|
70
|
+
The agent requires the `sfn_blueprint` library. The following commands clone it into a sibling directory and install it in editable mode.
|
|
71
|
+
```bash
|
|
72
|
+
cd ../
|
|
73
|
+
git clone https://github.com/stepfnAI/sfn_blueprint.git
|
|
74
|
+
cd sfn_blueprint
|
|
75
|
+
git switch dev
|
|
76
|
+
uv pip install -e .
|
|
77
|
+
cd ../ml_approach_suggestion_agent
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Configuration
|
|
81
|
+
|
|
82
|
+
You can configure the agent by creating a `.env` file in the project root or by exporting environment variables in your shell. Settings loaded via `export` will override those in a `.env` file.
|
|
83
|
+
|
|
84
|
+
### Available Settings
|
|
85
|
+
|
|
86
|
+
| Environment Variable | Description | Default |
|
|
87
|
+
| ------------------------------- | -------------------------------------------- | -------- |
|
|
88
|
+
| `OPENAI_API_KEY` | **(Required)** Your OpenAI API key. | *None* |
|
|
89
|
+
| `METHODOLOGY_AI_PROVIDER` | AI provider for methodology suggestions. | `openai` |
|
|
90
|
+
| `METHODOLOGY_AI_MODEL` | AI model for methodology suggestions. | `gpt-4o-mini` |
|
|
91
|
+
| `METHODOLOGY_TEMPERATURE` | AI model temperature (e.g., `0.0` to `0.5`). | `0.3` |
|
|
92
|
+
| `METHODOLOGY_MAX_TOKENS` | Maximum tokens for the AI response. | `4000` |
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
### Method 1: Using a `.env` File (Recommended)
|
|
97
|
+
|
|
98
|
+
Create a `.env` file in the root directory to store API keys and project-wide defaults.
|
|
99
|
+
|
|
100
|
+
#### Example `.env` file:
|
|
101
|
+
|
|
102
|
+
```dotenv
|
|
103
|
+
# .env
|
|
104
|
+
|
|
105
|
+
# --- Required Settings ---
|
|
106
|
+
OPENAI_API_KEY="sk-your-api-key-here"
|
|
107
|
+
|
|
108
|
+
# --- Optional Overrides ---
|
|
109
|
+
# Use a different model
|
|
110
|
+
METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
111
|
+
|
|
112
|
+
# Use a lower temperature for more deterministic responses
|
|
113
|
+
METHODOLOGY_TEMPERATURE=0.1
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
### Method 2: Using `export` Commands
|
|
119
|
+
|
|
120
|
+
Use `export` in your terminal for temporary settings or in CI/CD environments.
|
|
121
|
+
|
|
122
|
+
#### Example `export` commands:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Set the environment variables for the current terminal session
|
|
126
|
+
export OPENAI_API_KEY="sk-your-api-key-here"
|
|
127
|
+
export METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Testing
|
|
131
|
+
|
|
132
|
+
To run the test suite, use the following command from the root of the project directory:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
pytest -s
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Usage
|
|
139
|
+
|
|
140
|
+
### Running the Example Script
|
|
141
|
+
|
|
142
|
+
To see a quick demonstration, run the provided example script. This will execute the agent with pre-defined data and print the recommended methodology.
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python examples/basic_usage.py
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Using as a Library
|
|
149
|
+
|
|
150
|
+
Integrate the `MLApproachDecisionAgent` directly into your Python applications to get methodology recommendations programmatically.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
import logging
|
|
154
|
+
from ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
|
|
155
|
+
|
|
156
|
+
# Configure logging
|
|
157
|
+
logging.basicConfig(level=logging.INFO)
|
|
158
|
+
|
|
159
|
+
# 1. Define the domain, use case, and data context
|
|
160
|
+
domain_name = "Mortgage Loan Servicing"
|
|
161
|
+
domain_description = "Managing mortgage loans from post-origination to payoff, including payment collection, escrow management, and compliance for domestic and international loans."
|
|
162
|
+
use_case = "To predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days using their demographic and financial data to enable proactive intervention."
|
|
163
|
+
|
|
164
|
+
column_descriptions = {
|
|
165
|
+
"CreditScore": "Borrower's credit score from credit bureau sources",
|
|
166
|
+
"EmploymentStatus": "Current employment status (e.g., employed, self-employed, unemployed)",
|
|
167
|
+
# ... other column descriptions
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
column_insights = {
|
|
171
|
+
"table_info": { "row_count": 50000 },
|
|
172
|
+
"table_columns_info": {
|
|
173
|
+
"CreditScore": { "data_type": "Int64", "min_max_value": [350, 850] },
|
|
174
|
+
"EmploymentStatus": { "data_type": "string", "distinct_count": 5 },
|
|
175
|
+
# ... other column insights
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
target_column_name = "IsDelinquent"
|
|
180
|
+
target_column_insights = {
|
|
181
|
+
"Target Column Description": "A binary categorical flag indicating if the borrower has missed one or more mortgage payments in the last 60 days.",
|
|
182
|
+
"Data Type": "Integer (or Boolean)",
|
|
183
|
+
"Value Distribution": {
|
|
184
|
+
"0 (Not Delinquent)": "92%",
|
|
185
|
+
"1 (Delinquent)": "8%"
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
# 2. Prepare the task data payload
|
|
190
|
+
task_data = {
|
|
191
|
+
"domain_name": domain_name,
|
|
192
|
+
"domain_description": domain_description,
|
|
193
|
+
"use_case": use_case,
|
|
194
|
+
"column_descriptions": column_descriptions,
|
|
195
|
+
"column_insights": column_insights,
|
|
196
|
+
"target_column_name": target_column_name,
|
|
197
|
+
"target_column_insights": target_column_insights
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
# 3. Initialize and execute the agent
|
|
201
|
+
agent = MLApproachDecisionAgent()
|
|
202
|
+
result = agent(task_data)
|
|
203
|
+
|
|
204
|
+
# 4. Print the suggested methodology
|
|
205
|
+
if result["success"]:
|
|
206
|
+
print("Successfully suggested an approach:")
|
|
207
|
+
print(result["result"]["approach"].model_dump_json(indent=4))
|
|
208
|
+
print(f"Cost summary: {result['result']['cost_summary']}")
|
|
209
|
+
else:
|
|
210
|
+
print("Failed to suggest an approach.")
|
|
211
|
+
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Example Output
|
|
215
|
+
|
|
216
|
+
The agent returns a JSON object containing the recommended methodology and a detailed explanation for the choice.
|
|
217
|
+
|
|
218
|
+
*(Note: The actual output may vary slightly based on the LLM's response.)*
|
|
219
|
+
|
|
220
|
+
```json
|
|
221
|
+
{
|
|
222
|
+
"recommended": "Classification",
|
|
223
|
+
"description": "The goal is to predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days. This is a binary outcome (delinquent or not delinquent), making classification the appropriate methodology. The target variable is categorical, and the available demographic and financial data can be used as features to train a classification model."
|
|
224
|
+
}
|
|
225
|
+
```
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# ml_approach_suggestion_agent
|
|
2
|
+
|
|
3
|
+
An AI-powered agent that analyzes a dataset and use case to recommend the most appropriate machine learning methodology.
|
|
4
|
+
|
|
5
|
+
## Description
|
|
6
|
+
|
|
7
|
+
This agent takes a detailed description of a business domain, a specific use case, and information about the dataset—including column descriptions, insights, and target variable details—to suggest the best ML approach. It uses a large language model to:
|
|
8
|
+
|
|
9
|
+
1. **Analyze** the relationship between the use case and the target variable.
|
|
10
|
+
2. **Evaluate** the characteristics of the data (especially the target column).
|
|
11
|
+
3. **Recommend** the most suitable methodology from a predefined list: `binary_classification`, `time_series_forecasting`, or `not_applicable`.
|
|
12
|
+
4. **Provide** a clear justification for its recommendation.
|
|
13
|
+
|
|
14
|
+
This helps data scientists and analysts quickly and confidently choose the right path for their modeling efforts, saving time and reducing the risk of starting with an incorrect approach.
|
|
15
|
+
|
|
16
|
+
## Key Features
|
|
17
|
+
|
|
18
|
+
- **Intelligent Use Case Analysis**: Leverages an LLM to understand the core objective of the business problem.
|
|
19
|
+
- **Target-Aware Recommendation**: Places special emphasis on the nature of the target variable to guide its decision.
|
|
20
|
+
- **Context-Driven Suggestions**: Considers the entire data context, including domain and column descriptions, to make an informed choice.
|
|
21
|
+
- **Accelerates Model Planning**: Provides a validated starting point for ML projects, ensuring alignment between the problem and the proposed solution.
|
|
22
|
+
|
|
23
|
+
## Installation
|
|
24
|
+
|
|
25
|
+
### Prerequisites
|
|
26
|
+
|
|
27
|
+
- [**uv**](https://docs.astral.sh/uv/getting-started/installation/) – A fast Python package and environment manager.
|
|
28
|
+
- For a quick setup on macOS/Linux, you can use:
|
|
29
|
+
```bash
|
|
30
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
31
|
+
```
|
|
32
|
+
- [**Git**](https://git-scm.com/)
|
|
33
|
+
|
|
34
|
+
### Steps
|
|
35
|
+
|
|
36
|
+
1. **Clone the `ml_approach_suggestion_agent` repository:**
|
|
37
|
+
```bash
|
|
38
|
+
git clone https://github.com/stepfnAI/ml_approach_suggestion_agent.git
|
|
39
|
+
cd ml_approach_suggestion_agent
|
|
40
|
+
git switch dev
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
2. **Create a virtual environment and install dependencies:**
|
|
44
|
+
This command creates a `.venv` folder in the current directory and installs all required packages.
|
|
45
|
+
```bash
|
|
46
|
+
uv sync --extra dev
|
|
47
|
+
source .venv/bin/activate
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
3. **Clone and install the `sfn_blueprint` dependency:**
|
|
51
|
+
The agent requires the `sfn_blueprint` library. The following commands clone it into a sibling directory and install it in editable mode.
|
|
52
|
+
```bash
|
|
53
|
+
cd ../
|
|
54
|
+
git clone https://github.com/stepfnAI/sfn_blueprint.git
|
|
55
|
+
cd sfn_blueprint
|
|
56
|
+
git switch dev
|
|
57
|
+
uv pip install -e .
|
|
58
|
+
cd ../ml_approach_suggestion_agent
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Configuration
|
|
62
|
+
|
|
63
|
+
You can configure the agent by creating a `.env` file in the project root or by exporting environment variables in your shell. Settings loaded via `export` will override those in a `.env` file.
|
|
64
|
+
|
|
65
|
+
### Available Settings
|
|
66
|
+
|
|
67
|
+
| Environment Variable | Description | Default |
|
|
68
|
+
| ------------------------------- | -------------------------------------------- | -------- |
|
|
69
|
+
| `OPENAI_API_KEY` | **(Required)** Your OpenAI API key. | *None* |
|
|
70
|
+
| `METHODOLOGY_AI_PROVIDER` | AI provider for methodology suggestions. | `openai` |
|
|
71
|
+
| `METHODOLOGY_AI_MODEL` | AI model for methodology suggestions. | `gpt-4o-mini` |
|
|
72
|
+
| `METHODOLOGY_TEMPERATURE` | AI model temperature (e.g., `0.0` to `0.5`). | `0.3` |
|
|
73
|
+
| `METHODOLOGY_MAX_TOKENS` | Maximum tokens for the AI response. | `4000` |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
### Method 1: Using a `.env` File (Recommended)
|
|
78
|
+
|
|
79
|
+
Create a `.env` file in the root directory to store API keys and project-wide defaults.
|
|
80
|
+
|
|
81
|
+
#### Example `.env` file:
|
|
82
|
+
|
|
83
|
+
```dotenv
|
|
84
|
+
# .env
|
|
85
|
+
|
|
86
|
+
# --- Required Settings ---
|
|
87
|
+
OPENAI_API_KEY="sk-your-api-key-here"
|
|
88
|
+
|
|
89
|
+
# --- Optional Overrides ---
|
|
90
|
+
# Use a different model
|
|
91
|
+
METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
92
|
+
|
|
93
|
+
# Use a lower temperature for more deterministic responses
|
|
94
|
+
METHODOLOGY_TEMPERATURE=0.1
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### Method 2: Using `export` Commands
|
|
100
|
+
|
|
101
|
+
Use `export` in your terminal for temporary settings or in CI/CD environments.
|
|
102
|
+
|
|
103
|
+
#### Example `export` commands:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Set the environment variables for the current terminal session
|
|
107
|
+
export OPENAI_API_KEY="sk-your-api-key-here"
|
|
108
|
+
export METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Testing
|
|
112
|
+
|
|
113
|
+
To run the test suite, use the following command from the root of the project directory:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
pytest -s
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Usage
|
|
120
|
+
|
|
121
|
+
### Running the Example Script
|
|
122
|
+
|
|
123
|
+
To see a quick demonstration, run the provided example script. This will execute the agent with pre-defined data and print the recommended methodology.
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
python examples/basic_usage.py
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Using as a Library
|
|
130
|
+
|
|
131
|
+
Integrate the `MLApproachDecisionAgent` directly into your Python applications to get methodology recommendations programmatically.
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
import logging
|
|
135
|
+
from ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
|
|
136
|
+
|
|
137
|
+
# Configure logging
|
|
138
|
+
logging.basicConfig(level=logging.INFO)
|
|
139
|
+
|
|
140
|
+
# 1. Define the domain, use case, and data context
|
|
141
|
+
domain_name = "Mortgage Loan Servicing"
|
|
142
|
+
domain_description = "Managing mortgage loans from post-origination to payoff, including payment collection, escrow management, and compliance for domestic and international loans."
|
|
143
|
+
use_case = "To predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days using their demographic and financial data to enable proactive intervention."
|
|
144
|
+
|
|
145
|
+
column_descriptions = {
|
|
146
|
+
"CreditScore": "Borrower's credit score from credit bureau sources",
|
|
147
|
+
"EmploymentStatus": "Current employment status (e.g., employed, self-employed, unemployed)",
|
|
148
|
+
# ... other column descriptions
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
column_insights = {
|
|
152
|
+
"table_info": { "row_count": 50000 },
|
|
153
|
+
"table_columns_info": {
|
|
154
|
+
"CreditScore": { "data_type": "Int64", "min_max_value": [350, 850] },
|
|
155
|
+
"EmploymentStatus": { "data_type": "string", "distinct_count": 5 },
|
|
156
|
+
# ... other column insights
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
target_column_name = "IsDelinquent"
|
|
161
|
+
target_column_insights = {
|
|
162
|
+
"Target Column Description": "A binary categorical flag indicating if the borrower has missed one or more mortgage payments in the last 60 days.",
|
|
163
|
+
"Data Type": "Integer (or Boolean)",
|
|
164
|
+
"Value Distribution": {
|
|
165
|
+
"0 (Not Delinquent)": "92%",
|
|
166
|
+
"1 (Delinquent)": "8%"
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
# 2. Prepare the task data payload
|
|
171
|
+
task_data = {
|
|
172
|
+
"domain_name": domain_name,
|
|
173
|
+
"domain_description": domain_description,
|
|
174
|
+
"use_case": use_case,
|
|
175
|
+
"column_descriptions": column_descriptions,
|
|
176
|
+
"column_insights": column_insights,
|
|
177
|
+
"target_column_name": target_column_name,
|
|
178
|
+
"target_column_insights": target_column_insights
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
# 3. Initialize and execute the agent
|
|
182
|
+
agent = MLApproachDecisionAgent()
|
|
183
|
+
result = agent(task_data)
|
|
184
|
+
|
|
185
|
+
# 4. Print the suggested methodology
|
|
186
|
+
if result["success"]:
|
|
187
|
+
print("Successfully suggested an approach:")
|
|
188
|
+
print(result["result"]["approach"].model_dump_json(indent=4))
|
|
189
|
+
print(f"Cost summary: {result['result']['cost_summary']}")
|
|
190
|
+
else:
|
|
191
|
+
print("Failed to suggest an approach.")
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Example Output
|
|
196
|
+
|
|
197
|
+
The agent returns a JSON object containing the recommended methodology and a detailed explanation for the choice.
|
|
198
|
+
|
|
199
|
+
*(Note: The actual output may vary slightly based on the LLM's response.)*
|
|
200
|
+
|
|
201
|
+
```json
|
|
202
|
+
{
|
|
203
|
+
"recommended": "Classification",
|
|
204
|
+
"description": "The goal is to predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days. This is a binary outcome (delinquent or not delinquent), making classification the appropriate methodology. The target variable is categorical, and the available demographic and financial data can be used as features to train a classification model."
|
|
205
|
+
}
|
|
206
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ml_approach_suggestion_agent"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "AI-powered agent that recommends the most appropriate machine learning methodology for a dataset and use case"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3.10",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
# "License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"pydantic-settings",
|
|
23
|
+
"sfn-blueprint>=0.6.15",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
dev = [
|
|
28
|
+
"pytest",
|
|
29
|
+
"pytest-mock",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["src"]
|
|
34
|
+
|
|
35
|
+
[tool.pytest.ini_options]
|
|
36
|
+
pythonpath = [
|
|
37
|
+
"."
|
|
38
|
+
]
|
|
39
|
+
filterwarnings = [
|
|
40
|
+
"ignore::DeprecationWarning",
|
|
41
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from sfn_blueprint import SFNAIHandler, self_correcting_sql, Context
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
from .config import MethodologyConfig
|
|
9
|
+
from .constants import format_approach_prompt
|
|
10
|
+
from .models import MethodologyRecommendation
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MLApproachDecisionAgent:
    """Agent that asks an LLM to select an ML methodology for a use case.

    The agent renders a system/user prompt pair from the domain, the use case
    and the column metadata, sends it through ``SFNAIHandler.route_to`` and
    hands back the handler's response together with its cost summary.
    """

    def __init__(self, config: Optional[MethodologyConfig] = None):
        """Create the agent.

        Args:
            config: Provider, model and token-limit settings. A default
                ``MethodologyConfig()`` is built when omitted (or falsy).
        """
        self.logger = logging.getLogger(__name__)
        self.config = config or MethodologyConfig()
        self.ai_handler = SFNAIHandler()

    def suggest_approach(
        self,
        domain_name,
        domain_description,
        use_case,
        column_descriptions,
        column_insights,
        max_try=1,
    ) -> Tuple[MethodologyRecommendation, Dict[str, Any]]:
        """Suggest a machine learning approach for the given problem.

        Args:
            domain_name (str): The name of the domain.
            domain_description (str): The description of the domain.
            use_case (str): The problem that needs to be solved.
            column_descriptions: Column descriptions; interpolated into the
                user prompt as-is via ``str.format``.
            column_insights: Column insights; interpolated into the user
                prompt as-is via ``str.format``.
            max_try (int, optional): Maximum number of attempts to make the
                API call. Defaults to 1.

        Returns:
            A ``(response, cost_summary)`` pair exactly as returned by
            ``SFNAIHandler.route_to`` on the first successful attempt.
            ``MethodologyRecommendation`` is passed as ``text_format``, so the
            response is presumably an instance of that model -- confirm
            against sfn_blueprint. When every attempt raises (or ``max_try``
            is less than 1) the pair ``({}, {})`` is returned instead.

        TODO:
            - USER prompt should consider those approaches which will be
              supported.
        """
        system_prompt, user_prompt = format_approach_prompt(
            domain_name=domain_name,
            domain_description=domain_description,
            use_case=use_case,
            column_descriptions=column_descriptions,
            column_insights=column_insights,
        )

        for _ in range(max_try):
            try:
                response, cost_summary = self.ai_handler.route_to(
                    llm_provider=self.config.methodology_ai_provider,
                    configuration={
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt},
                        ],
                        "max_tokens": self.config.methodology_max_tokens,
                        # NOTE(review): methodology_temperature is configured
                        # but not forwarded to the handler (the original had
                        # this entry commented out) -- confirm that this is
                        # intentional.
                        "text_format": MethodologyRecommendation,
                    },
                    model=self.config.methodology_ai_model,
                )
                return response, cost_summary
            except Exception as e:
                # Best effort: log the failure and fall through to the next
                # attempt, if any remain.
                self.logger.error(f"Error while executing API call to {self.config.methodology_ai_provider}: {e}")

        # All attempts failed; callers detect this through the falsy result.
        return {}, {}

    def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Run the suggestion for one task payload and optionally persist it.

        Args:
            task_data: Must contain ``domain_name``, ``domain_description``,
                ``use_case``, ``column_descriptions`` and ``column_insights``.
                When ``workflow_storage_path`` or ``workflow_id`` is present,
                the result is also saved through ``WorkflowStorageManager``.

        Returns:
            On success: ``{"success": True, "result": {"approach": ...,
            "cost_summary": ...}, "agent": <class name>}``.
            On failure: ``{"success": False, "error": ..., "agent": <class name>}``.

        Raises:
            KeyError: If one of the five required keys is missing.
        """
        self.logger.info("Executing ML approach suggestion task.")

        # Required inputs; a missing key raises KeyError, as before.
        domain_name = task_data["domain_name"]
        domain_description = task_data["domain_description"]
        use_case = task_data["use_case"]
        column_descriptions = task_data["column_descriptions"]
        column_insights = task_data["column_insights"]

        # Suggest an approach
        result, cost_summary = self.suggest_approach(
            domain_name=domain_name,
            domain_description=domain_description,
            use_case=use_case,
            column_descriptions=column_descriptions,
            column_insights=column_insights,
        )
        if not result:
            return {
                "success": False,
                "error": "Failed to suggest approach.",
                "agent": self.__class__.__name__,
            }

        # Persisting is best effort: a storage failure is logged and the
        # suggestion is still returned to the caller.
        try:
            # Check if we have workflow storage information
            if 'workflow_storage_path' in task_data or 'workflow_id' in task_data:
                from sfn_blueprint import WorkflowStorageManager

                # Determine workflow storage path
                workflow_storage_path = task_data.get('workflow_storage_path', 'outputs/workflows')
                workflow_id = task_data.get('workflow_id', 'unknown')

                # Initialize storage manager
                storage_manager = WorkflowStorageManager(workflow_storage_path, workflow_id)
                # NOTE(review): the blank step name and the "quality_reports"
                # key look carried over from another agent; they are kept
                # unchanged because stored data consumers may rely on them.
                storage_manager.save_agent_result(
                    agent_name=self.__class__.__name__,
                    step_name=" ",
                    data={"quality_reports": result.model_dump(), "cost_summary": cost_summary},
                    metadata={"execution_time": datetime.now().isoformat()},
                )
                self.logger.info("Approach suggestion saved to workflow storage.")
        except Exception as e:
            self.logger.warning(f"Failed to save results to workflow storage: {e}")

        return {
            "success": True,
            "result": {
                "approach": result,
                "cost_summary": cost_summary,
            },
            "agent": self.__class__.__name__,
        }

    def __call__(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
        """Make the agent callable; equivalent to ``execute_task(task_data)``."""
        return self.execute_task(task_data)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MethodologyConfig(BaseSettings):
    """Settings for the LLM call that selects the methodology.

    Declares the provider, model, temperature and response token limit,
    each with a default and (for the numeric fields) an allowed range.
    """

    # Settings loading: read a UTF-8 `.env` file, match names without regard
    # to case, and ignore entries that do not correspond to a field below.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # Which provider the handler routes the request to.
    methodology_ai_provider: str = Field(
        default="openai",
        description="AI provider to use",
    )
    # Model name passed along with the request.
    methodology_ai_model: str = Field(
        default="gpt-4o-mini",
        description="AI model to use",
    )
    # Sampling temperature, constrained to the range 0.0-0.5.
    methodology_temperature: float = Field(
        default=0.3,
        ge=0.0,
        le=0.5,
        description="AI model temperature",
    )
    # Response token budget, constrained to the range 100-8000.
    methodology_max_tokens: int = Field(
        default=4000,
        ge=100,
        le=8000,
        description="Maximum tokens for AI response",
    )
|
|
19
|
+
|
|
20
|
+
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
METHODOLOGY_SELECTION_SYSTEM_PROMPT = """You are an ML methodology advisor. Analyze the problem and select ONE methodology: binary_classification, time_series_forecasting, or not_applicable.
|
|
2
|
+
|
|
3
|
+
**Simple Decision Rules:**
|
|
4
|
+
|
|
5
|
+
1. **Binary Classification** - Choose when:
|
|
6
|
+
- Use case asks "predict whether", "will X happen", "classify if"
|
|
7
|
+
- Answer is YES/NO, TRUE/FALSE, or 1/0
|
|
8
|
+
- Example: "predict if machine fails", "detect fraud", "identify churn"
|
|
9
|
+
|
|
10
|
+
2. **Time Series Forecasting** - Choose when:
|
|
11
|
+
- Use case asks to "forecast", "predict future value", "estimate next"
|
|
12
|
+
- Answer is a NUMERICAL value in the FUTURE
|
|
13
|
+
- Example: "forecast next month sales", "predict tomorrow's temperature"
|
|
14
|
+
|
|
15
|
+
3. **Not Applicable** - Choose when:
|
|
16
|
+
- No prediction needed
|
|
17
|
+
- Just data analysis, reporting, or calculations
|
|
18
|
+
- Not enough information
|
|
19
|
+
**Required Output:**
|
|
20
|
+
1. Select the single best ML methodology from: binary_classification, time_series_forecasting, or not_applicable
|
|
21
|
+
2. Provide a clear justification explaining:
|
|
22
|
+
- What you understand the business goal to be
|
|
23
|
+
- What type of prediction is needed (binary outcome, numerical forecast, or none)
|
|
24
|
+
- Whether temporal patterns are critical for this prediction
|
|
25
|
+
- Why the selected methodology is the best fit
|
|
26
|
+
|
|
27
|
+
**Important:**
|
|
28
|
+
- Having timestamps doesn't mean it's time series forecasting
|
|
29
|
+
- Check WHAT is being predicted: binary outcome OR future number
|
|
30
|
+
- The dataset may contain 1-4 tables - analyze all provided tables together"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
METHODOLOGY_SELECTION_USER_PROMPT = """**Business Context:**
|
|
35
|
+
Domain: {domain_name}
|
|
36
|
+
{domain_description}
|
|
37
|
+
|
|
38
|
+
**Use Case:**
|
|
39
|
+
{use_case_description}
|
|
40
|
+
|
|
41
|
+
**Data Overview:**
|
|
42
|
+
Columns: {column_descriptions}
|
|
43
|
+
|
|
44
|
+
Dataset Characteristics:
|
|
45
|
+
{column_insights}
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def format_approach_prompt(
    domain_name: str,
    domain_description: str,
    use_case: str,
    column_descriptions: str,
    column_insights: str
) -> tuple[str, str]:
    """
    Build the (system prompt, user prompt) pair for methodology selection.

    The system prompt is returned as-is; the user prompt is the
    METHODOLOGY_SELECTION_USER_PROMPT template with its placeholders filled
    from the arguments below.

    Args:
        domain_name: Name of the business domain (e.g., "Healthcare").
        domain_description: Free-text description of the domain context.
        use_case: What the user wants to achieve; substituted into the
            template's ``use_case_description`` placeholder.
        column_descriptions: Description of the dataset's columns.
        column_insights: Statistical insights about the columns (data types,
            unique counts, distributions, etc.).

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``.

    Example:
        system_prompt, user_prompt = format_approach_prompt(
            domain_name="E-commerce",
            domain_description="Online retail platform with customer transactions",
            use_case="Predict if a customer will make a purchase",
            column_descriptions="user_id, page_views, cart_additions, timestamp",
            column_insights="4 columns, 10000 rows, mixed types"
        )
    """
    # Map each template placeholder to the argument that fills it; note that
    # the template calls the use case "use_case_description".
    placeholders = {
        "domain_name": domain_name,
        "domain_description": domain_description,
        "use_case_description": use_case,
        "column_descriptions": column_descriptions,
        "column_insights": column_insights,
    }
    rendered_user_prompt = METHODOLOGY_SELECTION_USER_PROMPT.format(**placeholders)

    return METHODOLOGY_SELECTION_SYSTEM_PROMPT, rendered_user_prompt
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
class MethodologyRecommendation(BaseModel):
    # Structured recommendation with two required fields: the chosen
    # methodology and the reasoning behind the choice.

    # Exactly one of the three supported methodology labels.
    selected_methodology: Literal[
        "binary_classification",
        "time_series_forecasting",
        "not_applicable",
    ] = Field(
        ...,
        description="The most appropriate ML approach for this problem",
    )

    # Free-text reasoning for the selection.
    justification: str = Field(
        ...,
        description="Clear explanation connecting the business goal and data characteristics to the chosen methodology",
    )
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ml_approach_suggestion_agent
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: pydantic-settings
|
|
15
|
+
Requires-Dist: sfn-blueprint>=0.6.15
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-mock; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# ml_approach_suggestion_agent
|
|
21
|
+
|
|
22
|
+
An AI-powered agent that analyzes a dataset and use case to recommend the most appropriate machine learning methodology.
|
|
23
|
+
|
|
24
|
+
## Description
|
|
25
|
+
|
|
26
|
+
This agent takes a detailed description of a business domain, a specific use case, and information about the dataset—including column descriptions, insights, and target variable details—to suggest the best ML approach. It uses a large language model to:
|
|
27
|
+
|
|
28
|
+
1. **Analyze** the relationship between the use case and the target variable.
|
|
29
|
+
2. **Evaluate** the characteristics of the data (especially the target column).
|
|
30
|
+
3. **Recommend** the most suitable methodology from a predefined list: `binary_classification`, `time_series_forecasting`, or `not_applicable`.
|
|
31
|
+
4. **Provide** a clear justification for its recommendation.
|
|
32
|
+
|
|
33
|
+
This helps data scientists and analysts quickly and confidently choose the right path for their modeling efforts, saving time and reducing the risk of starting with an incorrect approach.
|
|
34
|
+
|
|
35
|
+
## Key Features
|
|
36
|
+
|
|
37
|
+
- **Intelligent Use Case Analysis**: Leverages an LLM to understand the core objective of the business problem.
|
|
38
|
+
- **Target-Aware Recommendation**: Places special emphasis on the nature of the target variable to guide its decision.
|
|
39
|
+
- **Context-Driven Suggestions**: Considers the entire data context, including domain and column descriptions, to make an informed choice.
|
|
40
|
+
- **Accelerates Model Planning**: Provides a validated starting point for ML projects, ensuring alignment between the problem and the proposed solution.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
### Prerequisites
|
|
45
|
+
|
|
46
|
+
- [**uv**](https://docs.astral.sh/uv/getting-started/installation/) – A fast Python package and environment manager.
|
|
47
|
+
- For a quick setup on macOS/Linux, you can use:
|
|
48
|
+
```bash
|
|
49
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
50
|
+
```
|
|
51
|
+
- [**Git**](https://git-scm.com/)
|
|
52
|
+
|
|
53
|
+
### Steps
|
|
54
|
+
|
|
55
|
+
1. **Clone the `ml_approach_suggestion_agent` repository:**
|
|
56
|
+
```bash
|
|
57
|
+
git clone https://github.com/stepfnAI/ml_approach_suggestion_agent.git
|
|
58
|
+
cd ml_approach_suggestion_agent
|
|
59
|
+
git switch dev
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
2. **Create a virtual environment and install dependencies:**
|
|
63
|
+
This command creates a `.venv` folder in the current directory and installs all required packages.
|
|
64
|
+
```bash
|
|
65
|
+
uv sync --extra dev
|
|
66
|
+
source .venv/bin/activate
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
3. **Clone and install the `sfn_blueprint` dependency:**
|
|
70
|
+
The agent requires the `sfn_blueprint` library. The following commands clone it into a sibling directory and install it in editable mode.
|
|
71
|
+
```bash
|
|
72
|
+
cd ../
|
|
73
|
+
git clone https://github.com/stepfnAI/sfn_blueprint.git
|
|
74
|
+
cd sfn_blueprint
|
|
75
|
+
git switch dev
|
|
76
|
+
uv pip install -e .
|
|
77
|
+
cd ../ml_approach_suggestion_agent
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Configuration
|
|
81
|
+
|
|
82
|
+
You can configure the agent by creating a `.env` file in the project root or by exporting environment variables in your shell. Settings loaded via `export` will override those in a `.env` file.
|
|
83
|
+
|
|
84
|
+
### Available Settings
|
|
85
|
+
|
|
86
|
+
| Environment Variable | Description | Default |
|
|
87
|
+
| ------------------------------- | -------------------------------------------- | -------- |
|
|
88
|
+
| `OPENAI_API_KEY` | **(Required)** Your OpenAI API key. | *None* |
|
|
89
|
+
| `METHODOLOGY_AI_PROVIDER` | AI provider for methodology suggestions. | `openai` |
|
|
90
|
+
| `METHODOLOGY_AI_MODEL` | AI model for methodology suggestions. | `gpt-4o` |
|
|
91
|
+
| `METHODOLOGY_TEMPERATURE` | AI model temperature (e.g., `0.0` to `0.5`). | `0.3` |
|
|
92
|
+
| `METHODOLOGY_MAX_TOKENS` | Maximum tokens for the AI response. | `4000` |
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
### Method 1: Using a `.env` File (Recommended)
|
|
97
|
+
|
|
98
|
+
Create a `.env` file in the root directory to store API keys and project-wide defaults.
|
|
99
|
+
|
|
100
|
+
#### Example `.env` file:
|
|
101
|
+
|
|
102
|
+
```dotenv
|
|
103
|
+
# .env
|
|
104
|
+
|
|
105
|
+
# --- Required Settings ---
|
|
106
|
+
OPENAI_API_KEY="sk-your-api-key-here"
|
|
107
|
+
|
|
108
|
+
# --- Optional Overrides ---
|
|
109
|
+
# Use a different model
|
|
110
|
+
METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
111
|
+
|
|
112
|
+
# Use a lower temperature for more deterministic responses
|
|
113
|
+
METHODOLOGY_TEMPERATURE=0.1
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
### Method 2: Using `export` Commands
|
|
119
|
+
|
|
120
|
+
Use `export` in your terminal for temporary settings or in CI/CD environments.
|
|
121
|
+
|
|
122
|
+
#### Example `export` commands:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Set the environment variables for the current terminal session
|
|
126
|
+
export OPENAI_API_KEY="sk-your-api-key-here"
|
|
127
|
+
export METHODOLOGY_AI_MODEL="gpt-4o-mini"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Testing
|
|
131
|
+
|
|
132
|
+
To run the test suite, use the following command from the root of the project directory:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
pytest -s
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Usage
|
|
139
|
+
|
|
140
|
+
### Running the Example Script
|
|
141
|
+
|
|
142
|
+
To see a quick demonstration, run the provided example script. This will execute the agent with pre-defined data and print the recommended methodology.
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python examples/basic_usage.py
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Using as a Library
|
|
149
|
+
|
|
150
|
+
Integrate the `MLApproachDecisionAgent` directly into your Python applications to get methodology recommendations programmatically.
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
import logging
|
|
154
|
+
from ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
|
|
155
|
+
|
|
156
|
+
# Configure logging
|
|
157
|
+
logging.basicConfig(level=logging.INFO)
|
|
158
|
+
|
|
159
|
+
# 1. Define the domain, use case, and data context
|
|
160
|
+
domain_name = "Mortgage Loan Servicing"
|
|
161
|
+
domain_description = "Managing mortgage loans from post-origination to payoff, including payment collection, escrow management, and compliance for domestic and international loans."
|
|
162
|
+
use_case = "To predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days using their demographic and financial data to enable proactive intervention."
|
|
163
|
+
|
|
164
|
+
column_descriptions = {
|
|
165
|
+
"CreditScore": "Borrower's credit score from credit bureau sources",
|
|
166
|
+
"EmploymentStatus": "Current employment status (e.g., employed, self-employed, unemployed)",
|
|
167
|
+
# ... other column descriptions
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
column_insights = {
|
|
171
|
+
"table_info": { "row_count": 50000 },
|
|
172
|
+
"table_columns_info": {
|
|
173
|
+
"CreditScore": { "data_type": "Int64", "min_max_value": [350, 850] },
|
|
174
|
+
"EmploymentStatus": { "data_type": "string", "distinct_count": 5 },
|
|
175
|
+
# ... other column insights
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
target_column_name = "IsDelinquent"
|
|
180
|
+
target_column_insights = {
|
|
181
|
+
"Target Column Description": "A binary categorical flag indicating if the borrower has missed one or more mortgage payments in the last 60 days.",
|
|
182
|
+
"Data Type": "Integer (or Boolean)",
|
|
183
|
+
"Value Distribution": {
|
|
184
|
+
"0 (Not Delinquent)": "92%",
|
|
185
|
+
"1 (Delinquent)": "8%"
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
# 2. Prepare the task data payload
|
|
190
|
+
task_data = {
|
|
191
|
+
"domain_name": domain_name,
|
|
192
|
+
"domain_description": domain_description,
|
|
193
|
+
"use_case": use_case,
|
|
194
|
+
"column_descriptions": column_descriptions,
|
|
195
|
+
"column_insights": column_insights,
|
|
196
|
+
"target_column_name": target_column_name,
|
|
197
|
+
"target_column_insights": target_column_insights
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
# 3. Initialize and execute the agent
|
|
201
|
+
agent = MLApproachDecisionAgent()
|
|
202
|
+
result = agent(task_data)
|
|
203
|
+
|
|
204
|
+
# 4. Print the suggested methodology
|
|
205
|
+
if result["success"]:
|
|
206
|
+
print("Successfully suggested an approach:")
|
|
207
|
+
print(result["result"]["approach"].model_dump_json(indent=4))
|
|
208
|
+
print(f"Cost summary: {result['result']['cost_summary']}")
|
|
209
|
+
else:
|
|
210
|
+
print("Failed to suggest an approach.")
|
|
211
|
+
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Example Output
|
|
215
|
+
|
|
216
|
+
The agent returns a JSON object containing the recommended methodology and a detailed explanation for the choice.
|
|
217
|
+
|
|
218
|
+
*(Note: The actual output may vary slightly based on the LLM's response.)*
|
|
219
|
+
|
|
220
|
+
```json
|
|
221
|
+
{
|
|
222
|
+
"selected_methodology": "binary_classification",
|
|
223
|
+
"justification": "The goal is to predict the likelihood of a borrower becoming delinquent on their mortgage payment within the next 60 days. This is a binary outcome (delinquent or not delinquent), making binary classification the appropriate methodology. The target variable is categorical, and the available demographic and financial data can be used as features to train a classification model."
|
|
224
|
+
}
|
|
225
|
+
```
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/ml_approach_suggestion_agent/__init__.py
|
|
4
|
+
src/ml_approach_suggestion_agent/agent.py
|
|
5
|
+
src/ml_approach_suggestion_agent/config.py
|
|
6
|
+
src/ml_approach_suggestion_agent/constants.py
|
|
7
|
+
src/ml_approach_suggestion_agent/models.py
|
|
8
|
+
src/ml_approach_suggestion_agent.egg-info/PKG-INFO
|
|
9
|
+
src/ml_approach_suggestion_agent.egg-info/SOURCES.txt
|
|
10
|
+
src/ml_approach_suggestion_agent.egg-info/dependency_links.txt
|
|
11
|
+
src/ml_approach_suggestion_agent.egg-info/requires.txt
|
|
12
|
+
src/ml_approach_suggestion_agent.egg-info/top_level.txt
|
|
13
|
+
tests/test_agent.py
|
ml_approach_suggestion_agent-0.1.1/src/ml_approach_suggestion_agent.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ml_approach_suggestion_agent
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
import pytest
|
|
4
|
+
from unittest.mock import MagicMock, patch
|
|
5
|
+
from collections import namedtuple
|
|
6
|
+
|
|
7
|
+
from src.ml_approach_suggestion_agent.agent import MLApproachDecisionAgent
|
|
8
|
+
from src.ml_approach_suggestion_agent.models import MethodologyRecommendation
|
|
9
|
+
|
|
10
|
+
# Shared test inputs: lightweight namedtuple stand-ins for the domain and
# use-case objects, plus plain-string column metadata.
domain_obj = namedtuple("Domain", "domain_name domain_description")
use_case_obj = namedtuple("UseCase", "use_case_name use_case_description")

mock_domain = domain_obj(
    domain_name="E-commerce",
    domain_description="Online retail platform",
)
mock_use_case = use_case_obj(
    use_case_name="Customer Churn Prediction",
    use_case_description="Predict which customers are likely to churn",
)
mock_column_descriptions = "user_id: string, purchase_history: array, last_login: date"
mock_column_insights = "High cardinality in user_id"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.fixture
def agent():
    """Yield an MLApproachDecisionAgent built while SFNAIHandler is patched.

    The agent's ``ai_handler`` attribute is replaced with an instance of the
    patched handler so tests can script ``route_to``.
    """
    handler_target = 'src.ml_approach_suggestion_agent.agent.SFNAIHandler'
    with patch(handler_target) as mock_ai_handler:
        decision_agent = MLApproachDecisionAgent()
        decision_agent.ai_handler = mock_ai_handler()
        # Yield inside the context so the patch stays active during the test.
        yield decision_agent
|
|
27
|
+
|
|
28
|
+
def test_suggest_approach_success(agent):
    """suggest_approach hands back the handler's recommendation and cost summary."""
    expected_cost = {'prompt_tokens': 213, 'completion_tokens': 125, 'total_tokens': 338, 'total_cost_usd': 0.0018}
    recommendation = MethodologyRecommendation(
        selected_methodology="binary_classification",
        justification="The use case is a classic classification problem.",
    )
    # The mocked handler returns a (recommendation, cost) pair.
    agent.ai_handler.route_to.return_value = (recommendation, expected_cost)

    result, cost = agent.suggest_approach(
        mock_domain.domain_name,
        mock_domain.domain_description,
        mock_use_case,
        mock_column_descriptions,
        mock_column_insights,
    )

    assert isinstance(result, MethodologyRecommendation)
    assert result.selected_methodology == "binary_classification"
    assert cost == expected_cost
|
|
41
|
+
|
|
42
|
+
def test_suggest_approach_failure_json(agent):
    """Test failure due to invalid JSON response.

    The handler raises a real ``json.JSONDecodeError`` so this case exercises
    a JSON-parsing failure specifically, instead of duplicating the generic
    API-error test. With a single attempt allowed, the agent is expected to
    return empty result and cost dicts.
    """
    agent.ai_handler.route_to.side_effect = json.JSONDecodeError(
        "Simulating JSON parsing error", "not valid json", 0
    )

    result, cost = agent.suggest_approach(
        mock_domain.domain_name,
        mock_domain.domain_description,
        mock_use_case,
        mock_column_descriptions,
        mock_column_insights,
        max_try=1,
    )

    assert result == {}
    assert cost == {}
|
|
49
|
+
|
|
50
|
+
def test_suggest_approach_api_error(agent):
    """A handler exception on the only allowed attempt yields empty result and cost."""
    agent.ai_handler.route_to.side_effect = Exception("API Error")

    positional_inputs = (
        mock_domain.domain_name,
        mock_domain.domain_description,
        mock_use_case,
        mock_column_descriptions,
        mock_column_insights,
    )
    result, cost = agent.suggest_approach(*positional_inputs, max_try=1)

    assert result == {}
    assert cost == {}
|
|
58
|
+
|
|
59
|
+
@patch('src.ml_approach_suggestion_agent.agent.MLApproachDecisionAgent.suggest_approach')
def test_execute_task_success(mock_suggest_approach):
    """execute_task reports success and forwards the task fields as keyword arguments."""
    agent = MLApproachDecisionAgent()
    recommendation = MethodologyRecommendation(
        selected_methodology="binary_classification",
        justification="The use case is a classic classification problem.",
    )
    mock_suggest_approach.return_value = (recommendation, {"cost": 0.1})

    # The task payload and the expected suggest_approach kwargs are the same fields.
    expected_kwargs = dict(
        domain_name=mock_domain.domain_name,
        domain_description=mock_domain.domain_description,
        use_case=mock_use_case,
        column_descriptions=mock_column_descriptions,
        column_insights=mock_column_insights,
    )

    result = agent.execute_task(dict(expected_kwargs))

    assert result["success"] is True
    assert result["result"]["approach"].selected_methodology == "binary_classification"
    assert result["result"]["cost_summary"] == {"cost": 0.1}
    mock_suggest_approach.assert_called_once_with(**expected_kwargs)
|
|
89
|
+
|
|
90
|
+
@patch('src.ml_approach_suggestion_agent.agent.MLApproachDecisionAgent.suggest_approach')
def test_execute_task_failure(mock_suggest_approach):
    """execute_task reports failure when suggest_approach yields no recommendation."""
    agent = MLApproachDecisionAgent()
    # A None recommendation is the failure signal under test.
    mock_suggest_approach.return_value = (None, {})

    expected_kwargs = dict(
        domain_name=mock_domain.domain_name,
        domain_description=mock_domain.domain_description,
        use_case=mock_use_case,
        column_descriptions=mock_column_descriptions,
        column_insights=mock_column_insights,
    )

    result = agent.execute_task(dict(expected_kwargs))

    assert result["success"] is False
    assert result["error"] == "Failed to suggest approach."
    mock_suggest_approach.assert_called_once_with(**expected_kwargs)
|
|
115
|
+
|
|
116
|
+
@patch('sfn_blueprint.WorkflowStorageManager')
@patch('src.ml_approach_suggestion_agent.agent.MLApproachDecisionAgent.suggest_approach')
def test_execute_task_with_storage(mock_suggest_approach, mock_storage_manager):
    """execute_task saves its result through the storage manager when storage keys are supplied."""
    agent = MLApproachDecisionAgent()
    recommendation = MethodologyRecommendation(
        selected_methodology="binary_classification",
        justification="The use case is a classic classification problem.",
    )
    mock_suggest_approach.return_value = (recommendation, {"cost": 0.1})

    # Whatever the patched WorkflowStorageManager constructs is this mock.
    storage_instance = MagicMock()
    mock_storage_manager.return_value = storage_instance

    task_data = dict(
        domain_name=mock_domain.domain_name,
        domain_description=mock_domain.domain_description,
        use_case=mock_use_case,
        column_descriptions=mock_column_descriptions,
        column_insights=mock_column_insights,
    )
    # Storage-related keys included in this test's payload.
    task_data["workflow_storage_path"] = "/tmp/test"
    task_data["workflow_id"] = "test_workflow"

    result = agent.execute_task(task_data)

    assert result["success"] is True
    storage_instance.save_agent_result.assert_called_once()

    # Inspect the keyword arguments of the single save_agent_result call.
    saved_kwargs = storage_instance.save_agent_result.call_args.kwargs
    assert saved_kwargs['agent_name'] == "MLApproachDecisionAgent"
    # NOTE(review): a single-space step name and a "quality_reports" payload key
    # look like leftovers from another agent - confirm against agent.py.
    assert saved_kwargs['step_name'] == " "
    assert saved_kwargs['data'] == {"quality_reports": recommendation.model_dump(), "cost_summary": {"cost": 0.1}}
|
|
151
|
+
|