ai-data-science-team 0.0.0.9005__tar.gz → 0.0.0.9007__tar.gz

Files changed (29)
  1. {ai_data_science_team-0.0.0.9005/ai_data_science_team.egg-info → ai_data_science_team-0.0.0.9007}/PKG-INFO +43 -22
  2. ai_data_science_team-0.0.0.9005/PKG-INFO → ai_data_science_team-0.0.0.9007/README.md +33 -44
  3. ai_data_science_team-0.0.0.9007/ai_data_science_team/_version.py +1 -0
  4. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/__init__.py +3 -1
  5. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/data_cleaning_agent.py +213 -20
  6. ai_data_science_team-0.0.0.9007/ai_data_science_team/agents/data_visualization_agent.py +331 -0
  7. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/data_wrangling_agent.py +66 -24
  8. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/feature_engineering_agent.py +50 -13
  9. ai_data_science_team-0.0.0.9007/ai_data_science_team/agents/sql_database_agent.py +397 -0
  10. ai_data_science_team-0.0.0.9007/ai_data_science_team/templates/__init__.py +8 -0
  11. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/templates/agent_templates.py +154 -37
  12. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/logging.py +1 -1
  13. ai_data_science_team-0.0.0.9007/ai_data_science_team/tools/metadata.py +230 -0
  14. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/regex.py +7 -1
  15. ai_data_science_team-0.0.0.9005/README.md → ai_data_science_team-0.0.0.9007/ai_data_science_team.egg-info/PKG-INFO +65 -20
  16. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/SOURCES.txt +3 -1
  17. ai_data_science_team-0.0.0.9005/ai_data_science_team/_version.py +0 -1
  18. ai_data_science_team-0.0.0.9005/ai_data_science_team/tools/__init__.py +0 -0
  19. ai_data_science_team-0.0.0.9005/ai_data_science_team/tools/data_analysis.py +0 -116
  20. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/LICENSE +0 -0
  21. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/__init__.py +0 -0
  22. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/orchestration.py +0 -0
  23. {ai_data_science_team-0.0.0.9005/ai_data_science_team/templates → ai_data_science_team-0.0.0.9007/ai_data_science_team/tools}/__init__.py +0 -0
  24. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/tools/parsers.py +0 -0
  25. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/dependency_links.txt +0 -0
  26. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/requires.txt +0 -0
  27. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team.egg-info/top_level.txt +0 -0
  28. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/setup.cfg +0 -0
  29. {ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/setup.py +0 -0

{ai_data_science_team-0.0.0.9005/ai_data_science_team.egg-info → ai_data_science_team-0.0.0.9007}/PKG-INFO

@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: ai-data-science-team
- Version: 0.0.0.9005
+ Version: 0.0.0.9007
  Summary: Build and run an AI-powered data science team.
  Home-page: https://github.com/business-science/ai-data-science-team
  Author: Matt Dancho
@@ -21,12 +21,22 @@ Requires-Dist: plotly
  Requires-Dist: streamlit
  Requires-Dist: scikit-learn
  Requires-Dist: xgboost
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  # Your AI Data Science Team (An Army Of Copilots)

  **An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.

- Star ⭐ This GitHub (Takes 2 seconds and means a lot).
+ **Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
+
+ *Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*

  ---

@@ -39,6 +49,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
  - Credit Card Risk
  - And more

+ ## Table of Contents
+
+ - [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
+ - [Table of Contents](#table-of-contents)
+ - [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
+ - [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
+ - [Data Science Agents](#data-science-agents)
+ - [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
+ - [Agents Available Now](#agents-available-now)
+ - [Agents Coming Soon](#agents-coming-soon)
+ - [Disclaimer](#disclaimer)
+ - [Installation](#installation)
+ - [Usage](#usage)
+ - [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
+ - [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
+ - [Contributing](#contributing)
+ - [License](#license)
+
  ## Companies That Want An AI Data Science Team Copilot

  If you are interested in having your own custom enteprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
@@ -53,11 +81,19 @@ This project is a work in progress. New data science agents will be released soo

  ![Data Science Team](/img/ai_data_science_team.jpg)

+ ### Coming Soon: Multi-Agents
+
+ This is the internals of the Business Intelligence SQL Agent I'm working on:
+
+ ![Business Intelligence SQL Agent](/img/multi_agent_sql_data_visualization.jpg)
+
  ### Agents Available Now

  1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
- 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
- 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+ 2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
+ 3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
+ 4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+ 5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.

  ### Agents Coming Soon

@@ -78,23 +114,6 @@ This project is a work in progress. New data science agents will be released soo

  By using this software, you agree to use it solely for learning purposes.

- ## Table of Contents
-
- - [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
- - [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
- - [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
- - [Data Science Agents](#data-science-agents)
- - [Agents Available Now](#agents-available-now)
- - [Agents Coming Soon](#agents-coming-soon)
- - [Disclaimer](#disclaimer)
- - [Table of Contents](#table-of-contents)
- - [Installation](#installation)
- - [Usage](#usage)
- - [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
- - [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
- - [Contributing](#contributing)
- - [License](#license)
-
  ## Installation

  ``` bash
@@ -103,6 +122,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u

  ## Usage

+ [See all examples here.](/examples)
+
  ### Example 1: Feature Engineering with the Feature Engineering Agent

  [See the full example here.](/examples/feature_engineering_agent.ipynb)
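
For reference, the quick-start below reproduces the data-cleaning example that appears verbatim in the `make_data_cleaning_agent` docstring later in this diff. It is a minimal sketch: it assumes the package has been installed from GitHub (see the Installation section above) and that an `OPENAI_API_KEY` is configured in the environment.

```python
# Minimal quick start, assembled from the docstring example shown later in this diff.
import pandas as pd
from langchain_openai import ChatOpenAI

from ai_data_science_team.agents import make_data_cleaning_agent

# Build the agent with an LLM (gpt-4o-mini is the model used in the package's own example).
llm = ChatOpenAI(model="gpt-4o-mini")
data_cleaning_agent = make_data_cleaning_agent(llm)

# Example dataset referenced in the package's docs.
df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")

# Run the compiled LangGraph app; the state keys follow the docstring example.
response = data_cleaning_agent.invoke({
    "user_instructions": "Don't remove outliers when cleaning the data.",
    "data_raw": df.to_dict(),
    "max_retries": 3,
    "retry_count": 0,
})

data_cleaned = pd.DataFrame(response["data_cleaned"])
```
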

ai_data_science_team-0.0.0.9005/PKG-INFO → ai_data_science_team-0.0.0.9007/README.md

@@ -1,32 +1,10 @@
- Metadata-Version: 2.1
- Name: ai-data-science-team
- Version: 0.0.0.9005
- Summary: Build and run an AI-powered data science team.
- Home-page: https://github.com/business-science/ai-data-science-team
- Author: Matt Dancho
- Author-email: mdancho@business-science.io
- Requires-Python: >=3.9
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: openpyxl
- Requires-Dist: langchain
- Requires-Dist: langchain_community
- Requires-Dist: langchain_openai
- Requires-Dist: langchain_experimental
- Requires-Dist: langgraph>=0.2.57
- Requires-Dist: openai
- Requires-Dist: pandas
- Requires-Dist: numpy
- Requires-Dist: plotly
- Requires-Dist: streamlit
- Requires-Dist: scikit-learn
- Requires-Dist: xgboost
-
  # Your AI Data Science Team (An Army Of Copilots)

  **An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.

- Star ⭐ This GitHub (Takes 2 seconds and means a lot).
+ **Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
+
+ *Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*

  ---

@@ -39,6 +17,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
  - Credit Card Risk
  - And more

+ ## Table of Contents
+
+ - [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
+ - [Table of Contents](#table-of-contents)
+ - [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
+ - [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
+ - [Data Science Agents](#data-science-agents)
+ - [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
+ - [Agents Available Now](#agents-available-now)
+ - [Agents Coming Soon](#agents-coming-soon)
+ - [Disclaimer](#disclaimer)
+ - [Installation](#installation)
+ - [Usage](#usage)
+ - [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
+ - [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
+ - [Contributing](#contributing)
+ - [License](#license)
+
  ## Companies That Want An AI Data Science Team Copilot

  If you are interested in having your own custom enteprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
@@ -53,11 +49,19 @@ This project is a work in progress. New data science agents will be released soo

  ![Data Science Team](/img/ai_data_science_team.jpg)

+ ### Coming Soon: Multi-Agents
+
+ This is the internals of the Business Intelligence SQL Agent I'm working on:
+
+ ![Business Intelligence SQL Agent](/img/multi_agent_sql_data_visualization.jpg)
+
  ### Agents Available Now

  1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
- 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
- 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+ 2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
+ 3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
+ 4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+ 5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.

  ### Agents Coming Soon

@@ -78,23 +82,6 @@ This project is a work in progress. New data science agents will be released soo

  By using this software, you agree to use it solely for learning purposes.

- ## Table of Contents
-
- - [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
- - [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
- - [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
- - [Data Science Agents](#data-science-agents)
- - [Agents Available Now](#agents-available-now)
- - [Agents Coming Soon](#agents-coming-soon)
- - [Disclaimer](#disclaimer)
- - [Table of Contents](#table-of-contents)
- - [Installation](#installation)
- - [Usage](#usage)
- - [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
- - [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
- - [Contributing](#contributing)
- - [License](#license)
-
  ## Installation

  ``` bash
@@ -103,6 +90,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u

  ## Usage

+ [See all examples here.](/examples)
+
  ### Example 1: Feature Engineering with the Feature Engineering Agent

  [See the full example here.](/examples/feature_engineering_agent.ipynb)

ai_data_science_team-0.0.0.9007/ai_data_science_team/_version.py

@@ -0,0 +1 @@
+ __version__ = "0.0.0.9007"

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/__init__.py

@@ -1,4 +1,6 @@
- from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent
+ from ai_data_science_team.agents.data_cleaning_agent import make_data_cleaning_agent, DataCleaningAgent
  from ai_data_science_team.agents.feature_engineering_agent import make_feature_engineering_agent
  from ai_data_science_team.agents.data_wrangling_agent import make_data_wrangling_agent
+ from ai_data_science_team.agents.sql_database_agent import make_sql_database_agent
+ from ai_data_science_team.agents.data_visualization_agent import make_data_visualization_agent

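
With the `__init__.py` change above, all of the package's agent constructors can be imported from one place in 0.0.0.9007. A short sketch of the expanded import surface, using exactly the names re-exported above:

```python
# Names re-exported by ai_data_science_team.agents as of 0.0.0.9007 (per the diff above).
from ai_data_science_team.agents import (
    make_data_cleaning_agent,
    DataCleaningAgent,              # new class-based wrapper (see the data_cleaning_agent.py diff below)
    make_feature_engineering_agent,
    make_data_wrangling_agent,
    make_sql_database_agent,        # new: agents/sql_database_agent.py
    make_data_visualization_agent,  # new: agents/data_visualization_agent.py
)
```
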

{ai_data_science_team-0.0.0.9005 → ai_data_science_team-0.0.0.9007}/ai_data_science_team/agents/data_cleaning_agent.py

@@ -13,11 +13,13 @@ from langchain_core.messages import BaseMessage
  from langgraph.types import Command
  from langgraph.checkpoint.memory import MemorySaver

+ from langgraph.graph.state import CompiledStateGraph
+
  import os
  import io
  import pandas as pd

- from ai_data_science_team.templates.agent_templates import(
+ from ai_data_science_team.templates import(
      node_func_execute_agent_code_on_data,
      node_func_human_review,
      node_func_fix_agent_code,
@@ -25,17 +27,178 @@ from ai_data_science_team.templates.agent_templates import(
      create_coding_agent_graph
  )
  from ai_data_science_team.tools.parsers import PythonOutputParser
- from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top
- from ai_data_science_team.tools.data_analysis import summarize_dataframes
+ from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top, format_agent_name
+ from ai_data_science_team.tools.metadata import get_dataframe_summary
  from ai_data_science_team.tools.logging import log_ai_function

  # Setup
  AGENT_NAME = "data_cleaning_agent"
  LOG_PATH = os.path.join(os.getcwd(), "logs/")

+
+
+ # Class
+ class DataCleaningAgent(CompiledStateGraph):
+
+     def __init__(
+         self,
+         model,
+         n_samples=30,
+         log=False,
+         log_path=None,
+         file_name="data_cleaner.py",
+         overwrite=True,
+         human_in_the_loop=False,
+         bypass_recommended_steps=False,
+         bypass_explain_code=False
+     ):
+         self._params = {
+             "model": model,
+             "n_samples": n_samples,
+             "log": log,
+             "log_path": log_path,
+             "file_name": file_name,
+             "overwrite": overwrite,
+             "human_in_the_loop": human_in_the_loop,
+             "bypass_recommended_steps": bypass_recommended_steps,
+             "bypass_explain_code": bypass_explain_code,
+         }
+         self._compiled_graph = self._make_compiled_graph()
+         self.response = None
+
+     def _make_compiled_graph(self):
+         self.response = None
+         return make_data_cleaning_agent(**self._params)
+
+     def update_params(self, **kwargs):
+         """
+         Update one or more parameters at once, then rebuild the compiled graph.
+         e.g. agent.update_params(model=new_llm, n_samples=100)
+         """
+         self._params.update(kwargs)
+         self._compiled_graph = self._make_compiled_graph()
+
+     def __getattr__(self, name: str):
+         """
+         Delegate attribute access to `_compiled_graph` if `name` is not
+         found in this instance. This 'inherits' methods from the compiled graph.
+         """
+         return getattr(self._compiled_graph, name)
+
+     def ainvoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
+         """
+         Cleans the provided dataset based on user instructions.
+
+         Parameters:
+             user_instructions (str): Instructions for data cleaning.
+             data_raw (pd.DataFrame): The raw dataset to be cleaned.
+             max_retries (int): Maximum retry attempts for cleaning.
+             retry_count (int): Current retry attempt.
+
+         Returns:
+             None. The response is stored in the response attribute.
+         """
+         response = self.ainvoke({
+             "user_instructions": user_instructions,
+             "data_raw": data_raw.to_dict(),
+             "max_retries": max_retries,
+             "retry_count": retry_count,
+         })
+         self.response = response
+         return None
+
+     def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
+         """
+         Cleans the provided dataset based on user instructions.
+
+         Parameters:
+             user_instructions (str): Instructions for data cleaning.
+             data_raw (pd.DataFrame): The raw dataset to be cleaned.
+             max_retries (int): Maximum retry attempts for cleaning.
+             retry_count (int): Current retry attempt.
+
+         Returns:
+             None. The response is stored in the response attribute.
+         """
+         response = self.invoke({
+             "user_instructions": user_instructions,
+             "data_raw": data_raw.to_dict(),
+             "max_retries": max_retries,
+             "retry_count": retry_count,
+         })
+         self.response = response
+         return None
+
+     def explain_cleaning_steps(self):
+         """
+         Provides an explanation of the cleaning steps performed by the agent.
+
+         Returns:
+             str: Explanation of the cleaning steps.
+         """
+         messages = self.response.get("messages", [])
+         return messages
+
+     def get_log_summary(self):
+         """
+         Logs a summary of the agent's operations, if logging is enabled.
+         """
+         if self.response:
+             if self.log:
+                 log_details = f"Log Path: {self.response.get('data_cleaner_function_path')}"
+                 return log_details
+
+     def get_state_keys(self):
+         """
+         Returns a list of keys that the state graph returns in a response.
+         """
+         return list(self.get_output_jsonschema()['properties'].keys())
+
+     def get_state_properties(self):
+         """
+         Returns a list of keys that the state graph returns in a response.
+         """
+         return self.get_output_jsonschema()['properties']
+
+     def get_data_cleaned(self):
+         """
+         Retrieves the cleaned data stored after running invoke or clean_data methods.
+         """
+         if self.response:
+             return pd.DataFrame(self.response.get("data_cleaned"))
+
+     def get_data_raw(self):
+         """
+         Retrieves the raw data.
+         """
+         if self.response:
+             return pd.DataFrame(self.response.get("data_raw"))
+
+     def get_data_cleaner_function(self):
+         """
+         Retrieves the agent's pipeline function.
+         """
+         if self.response:
+             return self.response.get("data_cleaner_function")
+
+
+
+
+
+
  # Agent

- def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True, human_in_the_loop=False):
+ def make_data_cleaning_agent(
+     model,
+     n_samples = 30,
+     log=False,
+     log_path=None,
+     file_name="data_cleaner.py",
+     overwrite = True,
+     human_in_the_loop=False,
+     bypass_recommended_steps=False,
+     bypass_explain_code=False
+ ):
      """
      Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
      ways, such as removing columns with more than 40% missing values, imputing missing
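
One detail worth flagging in the class added above: `invoke` and `ainvoke` call `self.invoke({...})` / `self.ainvoke({...})`, and because those names are defined on the wrapper itself, the calls resolve back to the wrapper rather than reaching the compiled graph (whose methods are only picked up through `__getattr__` for names the wrapper does not define). A delegation along the lines below would reach the underlying LangGraph app directly; this is a hypothetical sketch for illustration, not code from the package.

```python
# Hypothetical sketch (not the package's code): delegate explicitly to the compiled
# graph so the wrapper's own invoke() does not shadow the graph's invoke().
import pandas as pd


class DataCleaningAgentSketch:
    def __init__(self, compiled_graph):
        self._compiled_graph = compiled_graph  # e.g. the app returned by make_data_cleaning_agent(llm)
        self.response = None

    def invoke(self, user_instructions: str, data_raw: pd.DataFrame, max_retries=3, retry_count=0):
        # Call the compiled LangGraph app directly instead of self.invoke(...).
        self.response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "max_retries": max_retries,
            "retry_count": retry_count,
        })
        return None
```
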
@@ -44,9 +207,9 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
      The agent takes in a dataset and some user instructions, and outputs a python
      function that can be used to clean the dataset. The agent also logs the code
      generated and any errors that occur.
-
+
      The agent is instructed to to perform the following data cleaning steps:
-
+
      - Removing columns if more than 40 percent of the data is missing
      - Imputing missing values with the mean of the column if the column is numeric
      - Imputing missing values with the mode of the column if the column is categorical
@@ -60,17 +223,27 @@
      ----------
      model : langchain.llms.base.LLM
          The language model to use to generate code.
+     n_samples : int, optional
+         The number of samples to use when summarizing the dataset. Defaults to 30.
+         If you get an error due to maximum tokens, try reducing this number.
+         > "This model's maximum context length is 128000 tokens. However, your messages resulted in 333858 tokens. Please reduce the length of the messages."
      log : bool, optional
          Whether or not to log the code generated and any errors that occur.
          Defaults to False.
      log_path : str, optional
          The path to the directory where the log files should be stored. Defaults to
          "logs/".
+     file_name : str, optional
+         The name of the file to save the response to. Defaults to "data_cleaner.py".
      overwrite : bool, optional
          Whether or not to overwrite the log file if it already exists. If False, a unique file name will be created.
          Defaults to True.
      human_in_the_loop : bool, optional
          Whether or not to use human in the loop. If True, adds an interput and human in the loop step that asks the user to review the data cleaning instructions. Defaults to False.
+     bypass_recommended_steps : bool, optional
+         Bypass the recommendation step, by default False
+     bypass_explain_code : bool, optional
+         Bypass the code explanation step, by default False.

      Examples
      -------
@@ -78,26 +251,26 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
      import pandas as pd
      from langchain_openai import ChatOpenAI
      from ai_data_science_team.agents import data_cleaning_agent
-
+
      llm = ChatOpenAI(model = "gpt-4o-mini")

      data_cleaning_agent = make_data_cleaning_agent(llm)
-
+
      df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
-
+
      response = data_cleaning_agent.invoke({
          "user_instructions": "Don't remove outliers when cleaning the data.",
          "data_raw": df.to_dict(),
          "max_retries":3,
          "retry_count":0
      })
-
+
      pd.DataFrame(response['data_cleaned'])
      ```

      Returns
      -------
-     app : langchain.graphs.StateGraph
+     app : langchain.graphs.CompiledStateGraph
          The data cleaning agent as a state graph.
      """
      llm = model
@@ -130,7 +303,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          Recommend a series of data cleaning steps based on the input data.
          These recommended steps will be appended to the user_instructions.
          """
-         print("---DATA CLEANING AGENT----")
+         print(format_agent_name(AGENT_NAME))
          print(" * RECOMMEND CLEANING STEPS")

          # Prompt to get recommended steps from the LLM
@@ -173,6 +346,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,

          Avoid these:
          1. Do not include steps to save files.
+         2. Do not include unrelated user instructions that are not related to the data cleaning.
          """,
          input_variables=["user_instructions", "recommended_steps", "all_datasets_summary"]
      )
@@ -180,7 +354,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          data_raw = state.get("data_raw")
          df = pd.DataFrame.from_dict(data_raw)

-         all_datasets_summary = summarize_dataframes([df])
+         all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)

          all_datasets_summary_str = "\n\n".join(all_datasets_summary)

@@ -197,8 +371,21 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          }

      def create_data_cleaner_code(state: GraphState):
+
          print(" * CREATE DATA CLEANER CODE")

+         if bypass_recommended_steps:
+             print(format_agent_name(AGENT_NAME))
+
+             data_raw = state.get("data_raw")
+             df = pd.DataFrame.from_dict(data_raw)
+
+             all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
+
+             all_datasets_summary_str = "\n\n".join(all_datasets_summary)
+         else:
+             all_datasets_summary_str = state.get("all_datasets_summary")
+
          data_cleaning_prompt = PromptTemplate(
              template="""
              You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided using the following recommended steps.
@@ -212,7 +399,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,

          {all_datasets_summary}

-         Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that incldues all imports inside the function.
+         Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.

          Return code to provide the data cleaning function:

@@ -234,16 +421,16 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,

          response = data_cleaning_agent.invoke({
              "recommended_steps": state.get("recommended_steps"),
-             "all_datasets_summary": state.get("all_datasets_summary")
+             "all_datasets_summary": all_datasets_summary_str
          })

          response = relocate_imports_inside_function(response)
          response = add_comments_to_top(response, agent_name=AGENT_NAME)

          # For logging: store the code generated:
-         file_path, file_name = log_ai_function(
+         file_path, file_name_2 = log_ai_function(
              response=response,
-             file_name="data_cleaner.py",
+             file_name=file_name,
              log=log,
              log_path=log_path,
              overwrite=overwrite
@@ -252,7 +439,8 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          return {
              "data_cleaner_function" : response,
              "data_cleaner_function_path": file_path,
-             "data_cleaner_function_name": file_name
+             "data_cleaner_function_name": file_name_2,
+             "all_datasets_summary": all_datasets_summary_str
          }

      def human_review(state: GraphState) -> Command[Literal["recommend_cleaning_steps", "create_data_cleaner_code"]]:
@@ -274,7 +462,7 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          code_snippet_key="data_cleaner_function",
          agent_function_name="data_cleaner",
          pre_processing=lambda data: pd.DataFrame.from_dict(data),
-         post_processing=lambda df: df.to_dict(),
+         post_processing=lambda df: df.to_dict() if isinstance(df, pd.DataFrame) else df,
          error_message_prefix="An error occurred during data cleaning: "
      )

@@ -341,7 +529,12 @@ def make_data_cleaning_agent(model, log=False, log_path=None, overwrite = True,
          error_key="data_cleaner_error",
          human_in_the_loop=human_in_the_loop, # or False
          human_review_node_name="human_review",
-         checkpointer=MemorySaver() if human_in_the_loop else None
+         checkpointer=MemorySaver() if human_in_the_loop else None,
+         bypass_recommended_steps=bypass_recommended_steps,
+         bypass_explain_code=bypass_explain_code,
      )

      return app
+
+
+
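
Putting the new `make_data_cleaning_agent` options together, a call in 0.0.0.9007 can look roughly like the sketch below. It is based only on the signature, docstring, and state keys shown in this diff; the file path and instruction text are illustrative placeholders.

```python
# Sketch: building the agent with the 0.0.0.9007 options added in this diff
# (n_samples, file_name, bypass_recommended_steps, bypass_explain_code).
import pandas as pd
from langchain_openai import ChatOpenAI

from ai_data_science_team.agents import make_data_cleaning_agent

llm = ChatOpenAI(model="gpt-4o-mini")

app = make_data_cleaning_agent(
    model=llm,
    n_samples=30,                    # rows used when summarizing the dataset for the LLM
    log=True,                        # write the generated data_cleaner() code to disk
    file_name="data_cleaner.py",     # log file name (new parameter)
    bypass_recommended_steps=False,  # keep the "recommend cleaning steps" node
    bypass_explain_code=True,        # skip the code-explanation node
)

df = pd.read_csv("data/churn_data.csv")  # hypothetical local path; any DataFrame works

response = app.invoke({
    "user_instructions": "Don't remove outliers when cleaning the data.",
    "data_raw": df.to_dict(),
    "max_retries": 3,
    "retry_count": 0,
})

data_cleaned = pd.DataFrame(response["data_cleaned"])
cleaning_code = response["data_cleaner_function"]  # generated data_cleaner() source, as text
```

The `DataCleaningAgent` class added earlier in this diff wraps the same compiled graph: per its code, results are stored on `.response`, and helpers such as `get_data_cleaned()`, `get_data_raw()`, and `get_data_cleaner_function()` read from that attribute.
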