ai-data-science-team 0.0.0.9000__py3-none-any.whl → 0.0.0.9005__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,73 @@
1
+ import re
2
+ from datetime import datetime
3
+
4
+
5
def relocate_imports_inside_function(code_text):
    """
    Relocate all top-level import statements in a Python code string and
    move them inside the first function definition.

    Parameters
    ----------
    code_text : str
        The Python code as a string.

    Returns
    -------
    str
        The modified Python code with imports relocated inside the first
        function. If the code contains no import statements or no function
        definition, the original code is returned unchanged.
    """
    # Match both "import x" and "from x import y" statements on their own line.
    import_pattern = r'^\s*(import\s+[^\n]+|from\s+\S+\s+import\s+[^\n]+)\s*$'
    imports = re.findall(import_pattern, code_text, re.MULTILINE)

    # Bug fix: previously an empty import list still inserted a dangling
    # whitespace-only line into the function body. Nothing to relocate.
    if not imports:
        return code_text

    # Strip the matched import statements from the top-level code.
    code_without_imports = re.sub(import_pattern, '', code_text, flags=re.MULTILINE).strip()

    # Find the first function header. NOTE: '.' does not match newlines, so
    # only single-line signatures are detected — multi-line signatures fall
    # through to the unchanged-code return below.
    match = re.search(r'(def\s+\w+\s*\(.*?\):)', code_without_imports)

    if match:
        insert_at = match.end()
        # Indent each relocated import to the function body level (4 spaces).
        imports_code = '\n    ' + '\n    '.join(imports)
        return (
            code_without_imports[:insert_at]
            + imports_code
            + code_without_imports[insert_at:]
        )

    # No function found: return the original code untouched.
    return code_text
43
+
44
def add_comments_to_top(code_text, agent_name="data_wrangler"):
    """
    Prepend AI-generated metadata comments to a block of Python code.

    Parameters
    ----------
    code_text : str
        The Python code to be commented.
    agent_name : str, optional
        The agent name to include in the comments, by default "data_wrangler".

    Returns
    -------
    str
        The Python code with a disclaimer, agent name, and creation
        timestamp prepended as comment lines.
    """
    # Timestamp marking when the header is generated.
    time_created = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Build the header directly; the trailing newline separates the last
    # comment line from the first line of the original code.
    header = (
        "# Disclaimer: This function was generated by AI. Please review before using.\n"
        f"# Agent Name: {agent_name}\n"
        f"# Time Created: {time_created}\n"
    )
    return header + code_text
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.1
2
+ Name: ai-data-science-team
3
+ Version: 0.0.0.9005
4
+ Summary: Build and run an AI-powered data science team.
5
+ Home-page: https://github.com/business-science/ai-data-science-team
6
+ Author: Matt Dancho
7
+ Author-email: mdancho@business-science.io
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: openpyxl
12
+ Requires-Dist: langchain
13
+ Requires-Dist: langchain_community
14
+ Requires-Dist: langchain_openai
15
+ Requires-Dist: langchain_experimental
16
+ Requires-Dist: langgraph>=0.2.57
17
+ Requires-Dist: openai
18
+ Requires-Dist: pandas
19
+ Requires-Dist: numpy
20
+ Requires-Dist: plotly
21
+ Requires-Dist: streamlit
22
+ Requires-Dist: scikit-learn
23
+ Requires-Dist: xgboost
24
+
25
+ # Your AI Data Science Team (An Army Of Copilots)
26
+
27
+ **An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.
28
+
29
+ Star ⭐ This GitHub (Takes 2 seconds and means a lot).
30
+
31
+ ---
32
+
33
+ The AI Data Science Team of Copilots includes Agents that specialize in data cleaning, preparation, feature engineering, modeling (machine learning), and interpretation of various business problems like:
34
+
35
+ - Churn Modeling
36
+ - Employee Attrition
37
+ - Lead Scoring
38
+ - Insurance Risk
39
+ - Credit Card Risk
40
+ - And more
41
+
42
+ ## Companies That Want An AI Data Science Team Copilot
43
+
44
+ If you are interested in having your own custom enterprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
45
+
46
+ ## Free Generative AI For Data Scientists Workshop
47
+
48
+ If you want to learn how to build AI Agents for your company that perform Data Science, Business Intelligence, Churn Modeling, Time Series Forecasting, and more, [register for my next Generative AI for Data Scientists workshop here.](https://learn.business-science.io/ai-register)
49
+
50
+ ## Data Science Agents
51
+
52
+ This project is a work in progress. New data science agents will be released soon.
53
+
54
+ ![Data Science Team](/img/ai_data_science_team.jpg)
55
+
56
+ ### Agents Available Now
57
+
58
+ 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
59
+ 2. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
60
+ 3. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
61
+
62
+ ### Agents Coming Soon
63
+
64
+ 1. **Data Analyst:** Analyzes data structure, creates exploratory visualizations, and performs correlation analysis to identify relationships.
65
+ 2. **Machine Learning Agent:** Builds and logs the machine learning models.
66
+ 3. **Interpretability Agent:** Performs Interpretable ML to explain why the model returned predictions including which features were the most important to the model.
67
+ 4. **Supervisor:** Forms task list. Moderates sub-agents. Returns completed assignment.
68
+
69
+ ## Disclaimer
70
+
71
+ **This project is for educational purposes only.**
72
+
73
+ - It is not intended to replace your company's data science team
74
+ - No warranties or guarantees provided
75
+ - Creator assumes no liability for financial loss
76
+ - Consult an experienced Generative AI Data Scientist for building your own custom AI Data Science Team
77
+ - If you want a custom enterprise-grade AI Data Science Team, [send inquiries here](https://www.business-science.io/contact.html).
78
+
79
+ By using this software, you agree to use it solely for learning purposes.
80
+
81
+ ## Table of Contents
82
+
83
+ - [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
84
+ - [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
85
+ - [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
86
+ - [Data Science Agents](#data-science-agents)
87
+ - [Agents Available Now](#agents-available-now)
88
+ - [Agents Coming Soon](#agents-coming-soon)
89
+ - [Disclaimer](#disclaimer)
90
+ - [Table of Contents](#table-of-contents)
91
+ - [Installation](#installation)
92
+ - [Usage](#usage)
93
+ - [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
94
+ - [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
95
+ - [Contributing](#contributing)
96
+ - [License](#license)
97
+
98
+ ## Installation
99
+
100
+ ``` bash
101
+ pip install git+https://github.com/business-science/ai-data-science-team.git --upgrade
102
+ ```
103
+
104
+ ## Usage
105
+
106
+ ### Example 1: Feature Engineering with the Feature Engineering Agent
107
+
108
+ [See the full example here.](/examples/feature_engineering_agent.ipynb)
109
+
110
+ ``` python
111
+ feature_engineering_agent = make_feature_engineering_agent(model = llm)
112
+
113
+ response = feature_engineering_agent.invoke({
114
+ "user_instructions": "Make sure to scale and center numeric features",
115
+ "target_variable": "Churn",
116
+ "data_raw": df.to_dict(),
117
+ "max_retries":3,
118
+ "retry_count":0
119
+ })
120
+ ```
121
+
122
+ ``` bash
123
+ ---FEATURE ENGINEERING AGENT----
124
+ * CREATE FEATURE ENGINEER CODE
125
+ * EXECUTING AGENT CODE
126
+ * EXPLAIN AGENT CODE
127
+ ```
128
+
129
+ ### Example 2: Cleaning Data with the Data Cleaning Agent
130
+
131
+ [See the full example here.](/examples/data_cleaning_agent.ipynb)
132
+
133
+ ``` python
134
+ data_cleaning_agent = make_data_cleaning_agent(model = llm)
135
+
136
+ response = data_cleaning_agent.invoke({
137
+ "user_instructions": "Don't remove outliers when cleaning the data.",
138
+ "data_raw": df.to_dict(),
139
+ "max_retries":3,
140
+ "retry_count":0
141
+ })
142
+ ```
143
+
144
+ ``` bash
145
+ ---DATA CLEANING AGENT----
146
+ * CREATE DATA CLEANER CODE
147
+ * EXECUTING AGENT CODE
148
+ * EXPLAIN AGENT CODE
149
+ ```
150
+
151
+ ## Contributing
152
+
153
+ 1. Fork the repository
154
+ 2. Create a feature branch
155
+ 3. Commit your changes
156
+ 4. Push to the branch
157
+ 5. Create a Pull Request
158
+
159
+ ## License
160
+
161
+ This project is licensed under the MIT License. See LICENSE file for details.
162
+
@@ -0,0 +1,19 @@
1
+ ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ ai_data_science_team/_version.py,sha256=7tA8TocqCCzLkcB4ptV6bn3k5ni-0TGZvGnVBzmbeIc,26
3
+ ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
4
+ ai_data_science_team/agents/__init__.py,sha256=DtwQgyeG3Q4rQ-NrMbva-jshVQyULaWW1RrnETQGZOY,270
5
+ ai_data_science_team/agents/data_cleaning_agent.py,sha256=0K-CgngGjamRk_QzMqNkplrI-ddCbtruQ7kjGrsRIN8,14390
6
+ ai_data_science_team/agents/data_wrangling_agent.py,sha256=uQBJ8vQwrXubQgaI9_UoNZnVQjIEBUOh3dTmNdg326k,14581
7
+ ai_data_science_team/agents/feature_engineering_agent.py,sha256=QEqXTsfjllUj4Wgsw4nNGUT6r9Y6q629ZNgqGy3Dbbk,15921
8
+ ai_data_science_team/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ ai_data_science_team/templates/agent_templates.py,sha256=gT48Pq9KlrrrF0yigodGl_BdptmowTJ2rEWUqh7g5E0,15410
10
+ ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ ai_data_science_team/tools/data_analysis.py,sha256=V7e6_fZA01mosFf5VcLwBcpiMVf7fClZMjTrj-egK-o,3715
12
+ ai_data_science_team/tools/logging.py,sha256=EU5EMg4Y0-Yhqf1vAEFg0eRvSTx8uF0LTOAKss8-T2M,2073
13
+ ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
14
+ ai_data_science_team/tools/regex.py,sha256=KTH2SXPJT8Tzmj7CufyeET-FbA9BMhRzFlPKr4Tan3g,2320
15
+ ai_data_science_team-0.0.0.9005.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
16
+ ai_data_science_team-0.0.0.9005.dist-info/METADATA,sha256=PC6rJR965hPu02LtZrzHICkd3QeWzh2A35axTLjE9hM,5840
17
+ ai_data_science_team-0.0.0.9005.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
18
+ ai_data_science_team-0.0.0.9005.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
19
+ ai_data_science_team-0.0.0.9005.dist-info/RECORD,,
@@ -1,325 +0,0 @@
1
- # BUSINESS SCIENCE UNIVERSITY
2
- # AI DATA SCIENCE TEAM
3
- # ***
4
- # Agents
5
- # ai_data_science_team/agents.py
6
-
7
- # Libraries
8
- from typing import TypedDict, Annotated, Sequence
9
- import operator
10
-
11
- from langchain.prompts import PromptTemplate
12
- from langchain_core.messages import BaseMessage
13
- from langgraph.graph import END, StateGraph
14
-
15
- import os
16
- import io
17
- import pandas as pd
18
-
19
- from ai_data_science_team.templates.agent_templates import execute_agent_code_on_data, fix_agent_code, explain_agent_code
20
- from ai_data_science_team.tools.parsers import PythonOutputParser
21
-
22
- # Setup
23
-
24
- LOG_PATH = os.path.join(os.getcwd(), "logs/")
25
-
26
-
27
- # * Data Cleaning Agent
28
-
29
- def data_cleaning_agent(model, log=False, log_path=None):
30
- """
31
- Creates a data cleaning agent that can be run on a dataset. The agent can be used to clean a dataset in a variety of
32
- ways, such as removing columns with more than 40% missing values, imputing missing
33
- values with the mean of the column if the column is numeric, or imputing missing
34
- values with the mode of the column if the column is categorical.
35
- The agent takes in a dataset and some user instructions, and outputs a python
36
- function that can be used to clean the dataset. The agent also logs the code
37
- generated and any errors that occur.
38
-
39
- Parameters
40
- ----------
41
- model : langchain.llms.base.LLM
42
- The language model to use to generate code.
43
- log : bool, optional
44
- Whether or not to log the code generated and any errors that occur.
45
- Defaults to False.
46
- log_path : str, optional
47
- The path to the directory where the log files should be stored. Defaults to
48
- "logs/".
49
-
50
- Examples
51
- -------
52
- ``` python
53
- import pandas as pd
54
- from langchain_openai import ChatOpenAI
55
- from ai_data_science_team.agents import data_cleaning_agent
56
-
57
- llm = ChatOpenAI(model = "gpt-4o-mini")
58
-
59
- data_cleaning_agent = data_cleaning_agent(llm)
60
-
61
- df = pd.read_csv("https://raw.githubusercontent.com/business-science/ai-data-science-team/refs/heads/master/data/churn_data.csv")
62
-
63
- response = data_cleaning_agent.invoke({
64
- "user_instructions": "Don't remove outliers when cleaning the data.",
65
- "data_raw": df.to_dict(),
66
- "max_retries":3,
67
- "retry_count":0
68
- })
69
-
70
- pd.DataFrame(response['data_cleaned'])
71
- ```
72
-
73
- Returns
74
- -------
75
- app : langchain.graphs.StateGraph
76
- The data cleaning agent as a state graph.
77
- """
78
- llm = model
79
-
80
- # Setup Log Directory
81
- if log:
82
- if log_path is None:
83
- log_path = LOG_PATH
84
- if not os.path.exists(log_path):
85
- os.makedirs(log_path)
86
-
87
- # Define GraphState for the router
88
- class GraphState(TypedDict):
89
- messages: Annotated[Sequence[BaseMessage], operator.add]
90
- user_instructions: str
91
- data_raw: dict
92
- data_cleaner_function: str
93
- data_cleaner_error: str
94
- data_cleaned: dict
95
- max_retries: int
96
- retry_count: int
97
-
98
-
99
- def create_data_cleaner_code(state: GraphState):
100
- print("---DATA CLEANING AGENT----")
101
- print(" * CREATE DATA CLEANER CODE")
102
-
103
- data_cleaning_prompt = PromptTemplate(
104
- template="""
105
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function to that can be run on the data provided.
106
-
107
- Things that should be considered in the data summary function:
108
-
109
- * Removing columns if more than 40 percent of the data is missing
110
- * Imputing missing values with the mean of the column if the column is numeric
111
- * Imputing missing values with the mode of the column if the column is categorical
112
- * Converting columns to the correct data type
113
- * Removing duplicate rows
114
- * Removing rows with missing values
115
- * Removing rows with extreme outliers (3X the interquartile range)
116
-
117
- Make sure to take into account any additional user instructions that may negate some of these steps or add new steps. Include comments in your code to explain your reasoning for each step. Include comments if something is not done because a user requested. Include comments if something is done because a user requested.
118
-
119
- User instructions:
120
- {user_instructions}
121
-
122
- Return Python code in ```python ``` format with a single function definition, data_cleaner(data_raw), that incldues all imports inside the function.
123
-
124
- You can use Pandas, Numpy, and Scikit Learn libraries to clean the data.
125
-
126
- Sample Data (first 100 rows):
127
- {data_head}
128
-
129
- Data Description:
130
- {data_description}
131
-
132
- Data Info:
133
- {data_info}
134
-
135
- Return code to provide the data cleaning function:
136
-
137
- def data_cleaner(data_raw):
138
- import pandas as pd
139
- import numpy as np
140
- ...
141
- return data_cleaner
142
-
143
- Best Practices and Error Preventions:
144
-
145
- Always ensure that when assigning the output of fit_transform() from SimpleImputer to a Pandas DataFrame column, you call .ravel() or flatten the array, because fit_transform() returns a 2D array while a DataFrame column is 1D.
146
-
147
- """,
148
- input_variables=["user_instructions","data_head", "data_description", "data_info"]
149
- )
150
-
151
- data_cleaning_agent = data_cleaning_prompt | llm | PythonOutputParser()
152
-
153
- data_raw = state.get("data_raw")
154
-
155
- df = pd.DataFrame.from_dict(data_raw)
156
-
157
- buffer = io.StringIO()
158
- df.info(buf=buffer)
159
- info_text = buffer.getvalue()
160
-
161
- response = data_cleaning_agent.invoke({
162
- "user_instructions": state.get("user_instructions"),
163
- "data_head": df.head().to_string(),
164
- "data_description": df.describe().to_string(),
165
- "data_info": info_text
166
- })
167
-
168
- # For logging: store the code generated:
169
- if log:
170
- with open(log_path + 'data_cleaner.py', 'w') as file:
171
- file.write(response)
172
-
173
- return {"data_cleaner_function" : response}
174
-
175
- def execute_data_cleaner_code(state):
176
- return execute_agent_code_on_data(
177
- state=state,
178
- data_key="data_raw",
179
- result_key="data_cleaned",
180
- error_key="data_cleaner_error",
181
- code_snippet_key="data_cleaner_function",
182
- agent_function_name="data_cleaner",
183
- pre_processing=lambda data: pd.DataFrame.from_dict(data),
184
- post_processing=lambda df: df.to_dict(),
185
- error_message_prefix="An error occurred during data cleaning: "
186
- )
187
-
188
- def fix_data_cleaner_code(state: GraphState):
189
- data_cleaner_prompt = """
190
- You are a Data Cleaning Agent. Your job is to create a data_cleaner() function that can be run on the data provided. The function is currently broken and needs to be fixed.
191
-
192
- Make sure to only return the function definition for data_cleaner().
193
-
194
- Return Python code in ```python``` format with a single function definition, data_cleaner(data_raw), that includes all imports inside the function.
195
-
196
- This is the broken code (please fix):
197
- {code_snippet}
198
-
199
- Last Known Error:
200
- {error}
201
- """
202
-
203
- return fix_agent_code(
204
- state=state,
205
- code_snippet_key="data_cleaner_function",
206
- error_key="data_cleaner_error",
207
- llm=llm,
208
- prompt_template=data_cleaner_prompt,
209
- log=True,
210
- log_path="logs/",
211
- log_file_name="data_cleaner.py"
212
- )
213
-
214
- def explain_data_cleaner_code(state: GraphState):
215
- return explain_agent_code(
216
- state=state,
217
- code_snippet_key="data_cleaner_function",
218
- result_key="messages",
219
- error_key="data_cleaner_error",
220
- llm=llm,
221
- explanation_prompt_template="""
222
- Explain the data cleaning steps that the data cleaning agent performed in this function.
223
- Keep the summary succinct and to the point.\n\n# Data Cleaning Agent:\n\n{code}
224
- """,
225
- success_prefix="# Data Cleaning Agent:\n\n ",
226
- error_message="The Data Cleaning Agent encountered an error during data cleaning. Data could not be explained."
227
- )
228
-
229
-
230
- workflow = StateGraph(GraphState)
231
-
232
- workflow.add_node("create_data_cleaner_code", create_data_cleaner_code)
233
- workflow.add_node("execute_data_cleaner_code", execute_data_cleaner_code)
234
- workflow.add_node("fix_data_cleaner_code", fix_data_cleaner_code)
235
- workflow.add_node("explain_data_cleaner_code", explain_data_cleaner_code)
236
-
237
- workflow.set_entry_point("create_data_cleaner_code")
238
- workflow.add_edge("create_data_cleaner_code", "execute_data_cleaner_code")
239
-
240
- workflow.add_conditional_edges(
241
- "execute_data_cleaner_code",
242
- lambda state: "fix_code"
243
- if (state.get("data_cleaner_error") is not None
244
- and state.get("retry_count") is not None
245
- and state.get("max_retries") is not None
246
- and state.get("retry_count") < state.get("max_retries"))
247
- else "explain_code",
248
- {"fix_code": "fix_data_cleaner_code", "explain_code": "explain_data_cleaner_code"},
249
- )
250
-
251
- workflow.add_edge("fix_data_cleaner_code", "execute_data_cleaner_code")
252
- workflow.add_edge("explain_data_cleaner_code", END)
253
-
254
- app = workflow.compile()
255
-
256
- return app
257
-
258
- # # * Data Summary Agent
259
-
260
- # def data_summary_agent(model, log=True, log_path=None):
261
-
262
- # # Setup Log Directory
263
- # if log:
264
- # if log_path is None:
265
- # log_path = LOG_PATH
266
- # if not os.path.exists(log_path):
267
- # os.makedirs(log_path)
268
-
269
- # llm = model
270
-
271
- # data_summary_prompt = PromptTemplate(
272
- # template="""
273
- # You are a Data Summary Agent. Your job is to summarize a dataset.
274
-
275
- # Things that should be considered in the data summary function:
276
-
277
- # * How many missing values
278
- # * How many unique values
279
- # * How many rows
280
- # * How many columns
281
- # * What data types are present
282
- # * What the data looks like
283
- # * What column types are present
284
- # * What is the distribution of the data
285
- # * What is the correlation between the data
286
-
287
- # Make sure to take into account any additional user instructions that may negate some of these steps or add new steps.
288
-
289
- # User instructions:
290
- # {user_instructions}
291
-
292
- # Return Python code in ```python ``` format with a single function definition, data_sumary(data), that incldues all imports inside the function.
293
-
294
- # You can use Pandas, Numpy, and Scikit Learn libraries to summarize the data.
295
-
296
- # Sample Data (first 100 rows):
297
- # {data_head}
298
-
299
- # Data Description:
300
- # {data_description}
301
-
302
- # Data Info:
303
- # {data_info}
304
-
305
- # Return code to provide the data cleaning function:
306
-
307
- # def data_summary(data):
308
- # import pandas as pd
309
- # import numpy as np
310
- # ...
311
- # return {
312
- # 'data_summary': ...,
313
- # 'data_correlation': ...
314
- # [INSERT MORE KEYS HERE],
315
- # }
316
-
317
- # """,
318
- # input_variables=["user_instructions","data_head", "data_description", "data_info"]
319
- # )
320
-
321
- # data_summary_agent = data_summary_prompt | llm | PythonOutputParser()
322
-
323
-
324
-
325
- # return 1