ai-data-science-team 0.0.0.9006__tar.gz → 0.0.0.9007__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/PKG-INFO +41 -23
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/README.md +31 -21
- ai_data_science_team-0.0.0.9007/ai_data_science_team/_version.py +1 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/__init__.py +2 -1
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/data_cleaning_agent.py +204 -19
- ai_data_science_team-0.0.0.9007/ai_data_science_team/agents/data_visualization_agent.py +331 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/data_wrangling_agent.py +56 -11
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/feature_engineering_agent.py +40 -11
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/sql_database_agent.py +30 -12
- ai_data_science_team-0.0.0.9007/ai_data_science_team/templates/__init__.py +8 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/metadata.py +110 -47
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/regex.py +6 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/PKG-INFO +41 -23
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/SOURCES.txt +1 -0
- ai_data_science_team-0.0.0.9006/ai_data_science_team/_version.py +0 -1
- ai_data_science_team-0.0.0.9006/ai_data_science_team/tools/__init__.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/__init__.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/orchestration.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/templates/agent_templates.py +0 -0
- {ai_data_science_team-0.0.0.9006/ai_data_science_team/templates → ai_data_science_team-0.0.0.9007/ai_data_science_team/tools}/__init__.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/logging.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/parsers.py +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/dependency_links.txt +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/requires.txt +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/top_level.txt +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/setup.cfg +0 -0
- {ai_data_science_team-0.0.0.9006 → ai_data_science_team-0.0.0.9007}/setup.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: ai-data-science-team
|
3
|
-
Version: 0.0.0.
|
3
|
+
Version: 0.0.0.9007
|
4
4
|
Summary: Build and run an AI-powered data science team.
|
5
5
|
Home-page: https://github.com/business-science/ai-data-science-team
|
6
6
|
Author: Matt Dancho
|
@@ -21,12 +21,22 @@ Requires-Dist: plotly
|
|
21
21
|
Requires-Dist: streamlit
|
22
22
|
Requires-Dist: scikit-learn
|
23
23
|
Requires-Dist: xgboost
|
24
|
+
Dynamic: author
|
25
|
+
Dynamic: author-email
|
26
|
+
Dynamic: description
|
27
|
+
Dynamic: description-content-type
|
28
|
+
Dynamic: home-page
|
29
|
+
Dynamic: requires-dist
|
30
|
+
Dynamic: requires-python
|
31
|
+
Dynamic: summary
|
24
32
|
|
25
33
|
# Your AI Data Science Team (An Army Of Copilots)
|
26
34
|
|
27
35
|
**An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.
|
28
36
|
|
29
|
-
Star ⭐ This GitHub (Takes 2 seconds and means a lot)
|
37
|
+
**Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
|
38
|
+
|
39
|
+
*Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*
|
30
40
|
|
31
41
|
---
|
32
42
|
|
@@ -39,6 +49,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
39
49
|
- Credit Card Risk
|
40
50
|
- And more
|
41
51
|
|
52
|
+
## Table of Contents
|
53
|
+
|
54
|
+
- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
|
55
|
+
- [Table of Contents](#table-of-contents)
|
56
|
+
- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
|
57
|
+
- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
|
58
|
+
- [Data Science Agents](#data-science-agents)
|
59
|
+
- [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
|
60
|
+
- [Agents Available Now](#agents-available-now)
|
61
|
+
- [Agents Coming Soon](#agents-coming-soon)
|
62
|
+
- [Disclaimer](#disclaimer)
|
63
|
+
- [Installation](#installation)
|
64
|
+
- [Usage](#usage)
|
65
|
+
- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
|
66
|
+
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
67
|
+
- [Contributing](#contributing)
|
68
|
+
- [License](#license)
|
69
|
+
|
42
70
|
## Companies That Want An AI Data Science Team Copilot
|
43
71
|
|
44
72
|
If you are interested in having your own custom enterprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
|
@@ -53,12 +81,19 @@ This project is a work in progress. New data science agents will be released soo
|
|
53
81
|
|
54
82
|

|
55
83
|
|
84
|
+
### Coming Soon: Multi-Agents
|
85
|
+
|
86
|
+
These are the internals of the Business Intelligence SQL Agent I'm working on:
|
87
|
+
|
88
|
+

|
89
|
+
|
56
90
|
### Agents Available Now
|
57
91
|
|
58
92
|
1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
|
59
|
-
2. **Data
|
60
|
-
3. **
|
61
|
-
4. **
|
93
|
+
2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
|
94
|
+
3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
|
95
|
+
4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
|
96
|
+
5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
|
62
97
|
|
63
98
|
### Agents Coming Soon
|
64
99
|
|
@@ -79,23 +114,6 @@ This project is a work in progress. New data science agents will be released soo
|
|
79
114
|
|
80
115
|
By using this software, you agree to use it solely for learning purposes.
|
81
116
|
|
82
|
-
## Table of Contents
|
83
|
-
|
84
|
-
- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
|
85
|
-
- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
|
86
|
-
- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
|
87
|
-
- [Data Science Agents](#data-science-agents)
|
88
|
-
- [Agents Available Now](#agents-available-now)
|
89
|
-
- [Agents Coming Soon](#agents-coming-soon)
|
90
|
-
- [Disclaimer](#disclaimer)
|
91
|
-
- [Table of Contents](#table-of-contents)
|
92
|
-
- [Installation](#installation)
|
93
|
-
- [Usage](#usage)
|
94
|
-
- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
|
95
|
-
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
96
|
-
- [Contributing](#contributing)
|
97
|
-
- [License](#license)
|
98
|
-
|
99
117
|
## Installation
|
100
118
|
|
101
119
|
``` bash
|
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
**An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.
|
4
4
|
|
5
|
-
Star ⭐ This GitHub (Takes 2 seconds and means a lot)
|
5
|
+
**Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
|
6
|
+
|
7
|
+
*Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*
|
6
8
|
|
7
9
|
---
|
8
10
|
|
@@ -15,6 +17,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
|
|
15
17
|
- Credit Card Risk
|
16
18
|
- And more
|
17
19
|
|
20
|
+
## Table of Contents
|
21
|
+
|
22
|
+
- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
|
23
|
+
- [Table of Contents](#table-of-contents)
|
24
|
+
- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
|
25
|
+
- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
|
26
|
+
- [Data Science Agents](#data-science-agents)
|
27
|
+
- [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
|
28
|
+
- [Agents Available Now](#agents-available-now)
|
29
|
+
- [Agents Coming Soon](#agents-coming-soon)
|
30
|
+
- [Disclaimer](#disclaimer)
|
31
|
+
- [Installation](#installation)
|
32
|
+
- [Usage](#usage)
|
33
|
+
- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
|
34
|
+
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
35
|
+
- [Contributing](#contributing)
|
36
|
+
- [License](#license)
|
37
|
+
|
18
38
|
## Companies That Want An AI Data Science Team Copilot
|
19
39
|
|
20
40
|
If you are interested in having your own custom enterprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
|
@@ -29,12 +49,19 @@ This project is a work in progress. New data science agents will be released soo
|
|
29
49
|
|
30
50
|

|
31
51
|
|
52
|
+
### Coming Soon: Multi-Agents
|
53
|
+
|
54
|
+
These are the internals of the Business Intelligence SQL Agent I'm working on:
|
55
|
+
|
56
|
+

|
57
|
+
|
32
58
|
### Agents Available Now
|
33
59
|
|
34
60
|
1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
|
35
|
-
2. **Data
|
36
|
-
3. **
|
37
|
-
4. **
|
61
|
+
2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
|
62
|
+
3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
|
63
|
+
4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
|
64
|
+
5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
|
38
65
|
|
39
66
|
### Agents Coming Soon
|
40
67
|
|
@@ -55,23 +82,6 @@ This project is a work in progress. New data science agents will be released soo
|
|
55
82
|
|
56
83
|
By using this software, you agree to use it solely for learning purposes.
|
57
84
|
|
58
|
-
## Table of Contents
|
59
|
-
|
60
|
-
- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
|
61
|
-
- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
|
62
|
-
- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
|
63
|
-
- [Data Science Agents](#data-science-agents)
|
64
|
-
- [Agents Available Now](#agents-available-now)
|
65
|
-
- [Agents Coming Soon](#agents-coming-soon)
|
66
|
-
- [Disclaimer](#disclaimer)
|
67
|
-
- [Table of Contents](#table-of-contents)
|
68
|
-
- [Installation](#installation)
|
69
|
-
- [Usage](#usage)
|
70
|
-
- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
|
71
|
-
- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
|
72
|
-
- [Contributing](#contributing)
|
73
|
-
- [License](#license)
|
74
|
-
|
75
85
|
## Installation
|
76
86
|
|
77
87
|
``` bash
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "0.0.0.9007"
|
@@ -1,5 +1,6 @@
|
|
1
|
-
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
|
1
|
+
from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
|
2
2
|
from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
|
3
3
|
from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
|
4
4
|
from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
|
5
|
+
from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent
|
5
6
|
|
@@ -13,11 +13,13 @@ from langchain_core.messages import BaseMessage
|
|
13
13
|
from langgraph.types import Command
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
|
16
|
+
from langgraph.graph.state import CompiledStateGraph
|
17
|
+
|
16
18
|
import os
|
17
19
|
import io
|
18
20
|
import pandas as pd
|
19
21
|
|
20
|
-
from ai_data_science_team.templates
|
22
|
+
from ai_data_science_team.templates import(
|
21
23
|
node_func_execute_agent_code_on_data,
|
22
24
|
node_func_human_review,
|
23
25
|
node_func_fix_agent_code,
|
@@ -25,7 +27,7 @@ from ai_data_science_team.templates.agent_templates import(
|
|
25
27
|
create_coding_agent_graph
|
26
28
|
)
|
27
29
|
from ai_data_science_team.tools.parsers import PythonOutputParser
|
28
|
-
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
|
30
|
+
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
|
29
31
|
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
30
32
|
from ai_data_science_team.tools.logging import log_ai_function
|
31
33
|
|
@@ -33,9 +35,170 @@ from ai_data_science_team.tools.logging import log_ai_function
|
|
33
35
|
AGENT_NAME = "data_cleaning_agent"
|
34
36
|
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
35
37
|
|
38
|
+
|
39
|
+
|
40
|
+
# Class
|
41
|
+
class DataCleaningAgent(CompiledStateGraph):
    """
    Object-style wrapper around the graph built by ``make_data_cleaning_agent``.

    The constructor arguments are stored in ``self._params`` and forwarded
    unchanged to ``make_data_cleaning_agent``; the resulting compiled graph is
    kept in ``self._compiled_graph``. The result of the most recent
    ``invoke`` / ``ainvoke`` call is stored in ``self.response`` and can be read
    back through the ``get_*`` helpers.

    Attribute names that are not found on this object are looked up on the
    wrapped compiled graph (see ``__getattr__``).

    NOTE(review): the base-class initializer is intentionally not called here
    (matching the original design); the class relies on delegation to the
    wrapped graph rather than on its own graph state.

    Parameters
    ----------
    model, n_samples, log, log_path, file_name, overwrite, human_in_the_loop,
    bypass_recommended_steps, bypass_explain_code
        Passed through as-is to ``make_data_cleaning_agent``.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="data_cleaner.py",
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code,
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Build a fresh compiled graph from the current parameters.

        Any previously stored response is discarded because it was produced
        by a graph built with (potentially) different parameters.
        """
        self.response = None
        return make_data_cleaning_agent(**self._params)

    @staticmethod
    def _make_inputs(user_instructions, data_raw, max_retries, retry_count):
        """Assemble the state dictionary handed to the compiled graph."""
        return {
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "max_retries": max_retries,
            "retry_count": retry_count,
        }

    def update_params(self, **kwargs):
        """
        Update one or more parameters at once, then rebuild the compiled graph.
        e.g. agent.update_params(model=new_llm, n_samples=100)
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    def __getattr__(self, name: str):
        """
        Delegate attribute access to `_compiled_graph` if `name` is not
        found in this instance. This 'inherits' methods from the compiled graph.
        """
        # __getattr__ only runs when normal lookup fails. If the wrapped graph
        # itself is missing (e.g. the instance is only partially initialised),
        # reading self._compiled_graph below would re-enter this method forever.
        if name == "_compiled_graph":
            raise AttributeError(name)
        return getattr(self._compiled_graph, name)

    async def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
        """
        Asynchronously cleans the provided dataset based on user instructions.

        Parameters:
            user_instructions (str): Instructions for data cleaning.
            data_raw (pd.DataFrame): The raw dataset to be cleaned.
            max_retries (int): Maximum retry attempts for cleaning.
            retry_count (int): Current retry attempt.

        Returns:
            None. The response is stored in the response attribute.
        """
        # Call the wrapped graph explicitly: `self.ainvoke(...)` would resolve
        # to this very method and fail instead of running the graph.
        response = await self._compiled_graph.ainvoke(
            self._make_inputs(user_instructions, data_raw, max_retries, retry_count)
        )
        self.response = response
        return None

    def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
        """
        Cleans the provided dataset based on user instructions.

        Parameters:
            user_instructions (str): Instructions for data cleaning.
            data_raw (pd.DataFrame): The raw dataset to be cleaned.
            max_retries (int): Maximum retry attempts for cleaning.
            retry_count (int): Current retry attempt.

        Returns:
            None. The response is stored in the response attribute.
        """
        # Call the wrapped graph explicitly: `self.invoke(...)` would resolve
        # to this very method and fail instead of running the graph.
        response = self._compiled_graph.invoke(
            self._make_inputs(user_instructions, data_raw, max_retries, retry_count)
        )
        self.response = response
        return None

    def explain_cleaning_steps(self):
        """
        Returns the "messages" entry of the stored response.

        Returns:
            The value stored under "messages" (an empty list when the key is
            absent), or None if the agent has not been run yet.
        """
        if not self.response:
            return None
        messages = self.response.get("messages", [])
        return messages

    def get_log_summary(self):
        """
        Returns a one-line summary with the path of the logged cleaner
        function, if a response exists and logging is enabled; otherwise None.
        """
        if self.response:
            # The logging flag lives in the stored parameters; it is never set
            # as an attribute on this object.
            if self._params.get("log"):
                log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
                return log_details

    def get_state_keys(self):
        """
        Returns a list of keys that the state graph returns in a response.
        """
        return list(self._compiled_graph.get_output_jsonschema()['properties'].keys())

    def get_state_properties(self):
        """
        Returns the 'properties' mapping of the state graph's output JSON schema.
        """
        return self._compiled_graph.get_output_jsonschema()['properties']

    def get_data_cleaned(self):
        """
        Retrieves the cleaned data stored after running invoke or ainvoke,
        as a DataFrame. Returns None if the agent has not been run yet.
        """
        if self.response:
            return pd.DataFrame(self.response.get("data_cleaned"))

    def get_data_raw(self):
        """
        Retrieves the raw data from the stored response as a DataFrame.
        Returns None if the agent has not been run yet.
        """
        if self.response:
            return pd.DataFrame(self.response.get("data_raw"))

    def get_data_cleaner_function(self):
        """
        Retrieves the agent's pipeline function (the "data_cleaner_function"
        entry of the stored response). Returns None if the agent has not been
        run yet.
        """
        if self.response:
            return self.response.get("data_cleaner_function")
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
|
188
|
+
|
36
189
|
# Agent
|
37
190
|
|
38
|
-
def make_data_cleaning_agent(
|
191
|
+
def make_data_cleaning_agent(
|
192
|
+
model,
|
193
|
+
n_samples = 30,
|
194
|
+
log=False,
|
195
|
+
log_path=None,
|
196
|
+
file_name="data_cleaner.py",
|
197
|
+
overwrite = True,
|
198
|
+
human_in_the_loop=False,
|
199
|
+
bypass_recommended_steps=False,
|
200
|
+
bypass_explain_code=False
|
201
|
+
):
|
39
202
|
"""
|
40
203
|
Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
|
41
204
|
ways, such as removing columns with more than 40% missing values, imputing missing
|
@@ -44,9 +207,9 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
44
207
|
The agent takes in a dataset and some user instructions, and outputs a python
|
45
208
|
function that can be used to clean the dataset. The agent also logs the code
|
46
209
|
generated and any errors that occur.
|
47
|
-
|
210
|
+
|
48
211
|
The agent is instructed to perform the following data cleaning steps:
|
49
|
-
|
212
|
+
|
50
213
|
- Removing columns if more than 40 percent of the data is missing
|
51
214
|
- Imputing missing values with the mean of the column if the column is numeric
|
52
215
|
- Imputing missing values with the mode of the column if the column is categorical
|
@@ -60,12 +223,18 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
60
223
|
----------
|
61
224
|
model : langchain.llms.base.LLM
|
62
225
|
The language model to use to generate code.
|
226
|
+
n_samples : int, optional
|
227
|
+
The number of samples to use when summarizing the dataset. Defaults to 30.
|
228
|
+
If you get an error due to maximum tokens, try reducing this number.
|
229
|
+
> "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
|
63
230
|
log : bool, optional
|
64
231
|
Whether or not to log the code generated and any errors that occur.
|
65
232
|
Defaults to False.
|
66
233
|
log_path : str, optional
|
67
234
|
The path to the directory where the log files should be stored. Defaults to
|
68
235
|
"logs/".
|
236
|
+
file_name : str, optional
|
237
|
+
The name of the file to save the response to. Defaults to "data_cleaner.py".
|
69
238
|
overwrite : bool, optional
|
70
239
|
Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
|
71
240
|
Defaults to True.
|
@@ -82,26 +251,26 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
82
251
|
import pandas as pd
|
83
252
|
from langchain_openai import ChatOpenAI
|
84
253
|
from ai_data_science_team.agents import data_cleaning_agent
|
85
|
-
|
254
|
+
|
86
255
|
llm = ChatOpenAI(model = "gpt-4o-mini")
|
87
256
|
|
88
257
|
data_cleaning_agent = make_data_cleaning_agent(llm)
|
89
|
-
|
258
|
+
|
90
259
|
df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
|
91
|
-
|
260
|
+
|
92
261
|
response = data_cleaning_agent.invoke({
|
93
262
|
"user_instructions": "Don't remove outliers when cleaning the data.",
|
94
263
|
"data_raw": df.to_dict(),
|
95
264
|
"max_retries":3,
|
96
265
|
"retry_count":0
|
97
266
|
})
|
98
|
-
|
267
|
+
|
99
268
|
pd.DataFrame(response['data_cleaned'])
|
100
269
|
```
|
101
270
|
|
102
271
|
Returns
|
103
272
|
-------
|
104
|
-
app : langchain.graphs.
|
273
|
+
app : langchain.graphs.CompiledStateGraph
|
105
274
|
The data cleaning agent as a state graph.
|
106
275
|
"""
|
107
276
|
llm = model
|
@@ -134,7 +303,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
134
303
|
Recommend a series of data cleaning steps based on the input data.
|
135
304
|
These recommended steps will be appended to the user_instructions.
|
136
305
|
"""
|
137
|
-
print(
|
306
|
+
print(format_agent_name(AGENT_NAME))
|
138
307
|
print(" * RECOMMEND CLEANING STEPS")
|
139
308
|
|
140
309
|
# Prompt to get recommended steps from the LLM
|
@@ -177,6 +346,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
177
346
|
|
178
347
|
Avoid these:
|
179
348
|
1. Do not include steps to save files.
|
349
|
+
2. Do not include unrelated user instructions that are not related to the data cleaning.
|
180
350
|
""",
|
181
351
|
input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
|
182
352
|
)
|
@@ -184,7 +354,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
184
354
|
data_raw = state.get("data_raw")
|
185
355
|
df = pd.DataFrame.from_dict(data_raw)
|
186
356
|
|
187
|
-
all_datasets_summary = get_dataframe_summary([df])
|
357
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
188
358
|
|
189
359
|
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
190
360
|
|
@@ -201,10 +371,21 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
201
371
|
}
|
202
372
|
|
203
373
|
def create_data_cleaner_code(state: GraphState):
|
204
|
-
|
205
|
-
print("---DATA CLEANING AGENT----")
|
374
|
+
|
206
375
|
print(" * CREATE DATA CLEANER CODE")
|
207
376
|
|
377
|
+
if bypass_recommended_steps:
|
378
|
+
print(format_agent_name(AGENT_NAME))
|
379
|
+
|
380
|
+
data_raw = state.get("data_raw")
|
381
|
+
df = pd.DataFrame.from_dict(data_raw)
|
382
|
+
|
383
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
384
|
+
|
385
|
+
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
386
|
+
else:
|
387
|
+
all_datasets_summary_str = state.get("all_datasets_summary")
|
388
|
+
|
208
389
|
data_cleaning_prompt = PromptTemplate(
|
209
390
|
template="""
|
210
391
|
You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
|
@@ -218,7 +399,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
218
399
|
|
219
400
|
{all_datasets_summary}
|
220
401
|
|
221
|
-
Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that
|
402
|
+
Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
|
222
403
|
|
223
404
|
Return code to provide the data cleaning function:
|
224
405
|
|
@@ -240,16 +421,16 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
240
421
|
|
241
422
|
response = data_cleaning_agent.invoke({
|
242
423
|
"recommended_steps": state.get("recommended_steps"),
|
243
|
-
"all_datasets_summary":
|
424
|
+
"all_datasets_summary": all_datasets_summary_str
|
244
425
|
})
|
245
426
|
|
246
427
|
response = relocate_imports_inside_function(response)
|
247
428
|
response = add_comments_to_top(response, agent_name=AGENT_NAME)
|
248
429
|
|
249
430
|
# For logging: store the code generated:
|
250
|
-
file_path,
|
431
|
+
file_path, file_name_2 = log_ai_function(
|
251
432
|
response=response,
|
252
|
-
file_name=
|
433
|
+
file_name=file_name,
|
253
434
|
log=log,
|
254
435
|
log_path=log_path,
|
255
436
|
overwrite=overwrite
|
@@ -258,7 +439,8 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
258
439
|
return {
|
259
440
|
"data_cleaner_function" : response,
|
260
441
|
"data_cleaner_function_path": file_path,
|
261
|
-
"data_cleaner_function_name":
|
442
|
+
"data_cleaner_function_name": file_name_2,
|
443
|
+
"all_datasets_summary": all_datasets_summary_str
|
262
444
|
}
|
263
445
|
|
264
446
|
def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
|
@@ -353,3 +535,6 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
|
|
353
535
|
)
|
354
536
|
|
355
537
|
return app
|
538
|
+
|
539
|
+
|
540
|
+
|