ai-data-science-team 0.0.0.9005__py3-none-any.whl → 0.0.0.9007__py3-none-any.whl
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +3 -1
- ai_data_science_team/agents/data_cleaning_agent.py +213 -20
- ai_data_science_team/agents/data_visualization_agent.py +331 -0
- ai_data_science_team/agents/data_wrangling_agent.py +66 -24
- ai_data_science_team/agents/feature_engineering_agent.py +50 -13
- ai_data_science_team/agents/sql_database_agent.py +397 -0
- ai_data_science_team/templates/__init__.py +8 -0
- ai_data_science_team/templates/agent_templates.py +154 -37
- ai_data_science_team/tools/logging.py +1 -1
- ai_data_science_team/tools/metadata.py +230 -0
- ai_data_science_team/tools/regex.py +7 -1
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA +43 -22
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +21 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/WHEEL +1 -1
- ai_data_science_team/tools/data_analysis.py +0 -116
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +0 -19
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/top_level.txt +0 -0
ai_data_science_team/templates/agent_templates.py
CHANGED
@@ -3,6 +3,7 @@ from langgraph.graph import StateGraph, END
 from langgraph.types import interrupt, Command
 
 import pandas as pd
+import sqlalchemy as sql
 
 from typing import Any, Callable, Dict, Type, Optional
 
@@ -22,7 +23,9 @@ def create_coding_agent_graph(
     retry_count_key: str = "retry_count",
     human_in_the_loop: bool = False,
     human_review_node_name: str = "human_review",
-    checkpointer: Optional[Callable] = None
+    checkpointer: Optional[Callable] = None,
+    bypass_recommended_steps: bool = False,
+    bypass_explain_code: bool = False,
 ):
     """
     Creates a generic agent graph using the provided node functions and node names.
@@ -63,7 +66,11 @@ def create_coding_agent_graph(
         The node name for human review if human_in_the_loop is True.
     checkpointer : callable, optional
        A checkpointer callable if desired.
-
+    bypass_recommended_steps : bool, optional
+        Whether to skip the recommended steps node.
+    bypass_explain_code : bool, optional
+        Whether to skip the final explain code node.
+
     Returns
     -------
     app : langchain.graphs.StateGraph
@@ -72,50 +79,76 @@ def create_coding_agent_graph(
 
     workflow = StateGraph(GraphState)
 
-    #
-
+    # Conditionally add the recommended-steps node
+    if not bypass_recommended_steps:
+        workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])
 
-    #
-    if human_in_the_loop:
-        workflow.add_node(human_review_node_name, node_functions[human_review_node_name])
-
-    # Add main nodes
+    # Always add create, execute, and fix nodes
     workflow.add_node(create_code_node_name, node_functions[create_code_node_name])
     workflow.add_node(execute_code_node_name, node_functions[execute_code_node_name])
     workflow.add_node(fix_code_node_name, node_functions[fix_code_node_name])
-
+
+    # Conditionally add the explanation node
+    if not bypass_explain_code:
+        workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])
 
     # Set the entry point
-
+    entry_point = create_code_node_name if bypass_recommended_steps else recommended_steps_node_name
+    workflow.set_entry_point(entry_point)
 
-    # Add edges
-    if
-
-
-
+    # Add edges for recommended steps
+    if not bypass_recommended_steps:
+        if human_in_the_loop:
+            workflow.add_edge(recommended_steps_node_name, human_review_node_name)
+        else:
+            workflow.add_edge(recommended_steps_node_name, create_code_node_name)
+    elif human_in_the_loop:
+        # Skip recommended steps but still include human review
+        workflow.add_edge(create_code_node_name, human_review_node_name)
 
-    #
+    # Create -> Execute
     workflow.add_edge(create_code_node_name, execute_code_node_name)
 
-    #
-
-
-
-        state.get(
-        state.get(
-        state
-
-
-
-
-
-
-
-
-
+    # Define a helper to check if we have an error & can still retry
+    def error_and_can_retry(state):
+        return (
+            state.get(error_key) is not None
+            and state.get(retry_count_key) is not None
+            and state.get(max_retries_key) is not None
+            and state[retry_count_key] < state[max_retries_key]
+        )
+
+    # ---- Split into two branches for bypass_explain_code ----
+    if not bypass_explain_code:
+        # If we are NOT bypassing explain, the next node is fix_code if error,
+        # else explain_code. Then we wire explain_code -> END afterward.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "explain_code",
+            {
+                "fix_code": fix_code_node_name,
+                "explain_code": explain_code_node_name,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+        # explain_code -> END
+        workflow.add_edge(explain_code_node_name, END)
+    else:
+        # If we ARE bypassing explain_code, the next node is fix_code if error,
+        # else straight to END.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "END",
+            {
+                "fix_code": fix_code_node_name,
+                "END": END,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+
+    # Finally, compile
     if human_in_the_loop and checkpointer is not None:
         app = workflow.compile(checkpointer=checkpointer)
     else:
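The retry gate added in this hunk is self-contained and easy to sanity-check outside of LangGraph. A minimal sketch, assuming a plain-dict state and illustrative key names (`data_cleaner_error`, `retry_count`, `max_retries` — the real keys are supplied by each agent):

``` python
# Hedged sketch of the error_and_can_retry gate from the hunk above,
# exercised against plain dicts. The key names here are illustrative only.
error_key, retry_count_key, max_retries_key = "data_cleaner_error", "retry_count", "max_retries"

def error_and_can_retry(state):
    # Route to fix_code only when an error exists and retries remain.
    return (
        state.get(error_key) is not None
        and state.get(retry_count_key) is not None
        and state.get(max_retries_key) is not None
        and state[retry_count_key] < state[max_retries_key]
    )

print(error_and_can_retry({"data_cleaner_error": "KeyError: 'id'", "retry_count": 1, "max_retries": 3}))  # True  -> fix_code
print(error_and_can_retry({"data_cleaner_error": "KeyError: 'id'", "retry_count": 3, "max_retries": 3}))  # False -> explain_code / END
print(error_and_can_retry({"data_cleaner_error": None, "retry_count": 0, "max_retries": 3}))              # False -> explain_code / END
```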
@@ -124,6 +157,7 @@ def create_coding_agent_graph(
     return app
 
 
+
 def node_func_human_review(
     state: Any,
     prompt_text: str,
@@ -256,6 +290,88 @@ def node_func_execute_agent_code_on_data(
         # if state.get("retry_count") == 0:
         #     10/0
 
+        # Apply post-processing if provided
+        if post_processing is not None:
+            result = post_processing(result)
+        else:
+            if isinstance(result, pd.DataFrame):
+                result = result.to_dict()
+
+    except Exception as e:
+        print(e)
+        agent_error = f"{error_message_prefix}{str(e)}"
+
+    # Return results
+    output = {result_key: result, error_key: agent_error}
+    return output
+
+def node_func_execute_agent_from_sql_connection(
+    state: Any,
+    connection: Any,
+    code_snippet_key: str,
+    result_key: str,
+    error_key: str,
+    agent_function_name: str,
+    post_processing: Optional[Callable[[Any], Any]] = None,
+    error_message_prefix: str = "An error occurred during agent execution: "
+) -> Dict[str, Any]:
+    """
+    Execute a generic agent code defined in a code snippet retrieved from the state on a SQLAlchemy connection object
+    and return the result.
+
+    Parameters
+    ----------
+    state : Any
+        A state object that supports `get(key: str)` method to retrieve values.
+    connection : str
+        The SQLAlchemy connection object to use for executing the agent function.
+    code_snippet_key : str
+        The key in the state used to retrieve the Python code snippet defining the agent function.
+    result_key : str
+        The key in the state used to store the result of the agent function.
+    error_key : str
+        The key in the state used to store the error message if any.
+    agent_function_name : str
+        The name of the function (e.g., 'sql_database_agent') expected to be defined in the code snippet.
+    post_processing : Callable[[Any], Any], optional
+        A function to postprocess the output of the agent function before returning it.
+    error_message_prefix : str, optional
+        A prefix or full message to use in the error output if an exception occurs.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing the result and/or error messages. Keys are arbitrary,
+        but typically include something like "result" or "error".
+    """
+
+    print(" * EXECUTING AGENT CODE ON SQL CONNECTION")
+
+    # Retrieve SQLAlchemy connection and code snippet from the state
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+    agent_code = state.get(code_snippet_key)
+
+    # Ensure the connection object is provided
+    if connection is None:
+        raise ValueError(f"Connection object not found.")
+
+    # Execute the code snippet to define the agent function
+    local_vars = {}
+    global_vars = {}
+    exec(agent_code, global_vars, local_vars)
+
+    # Retrieve the agent function from the executed code
+    agent_function = local_vars.get(agent_function_name, None)
+    if agent_function is None or not callable(agent_function):
+        raise ValueError(f"Agent function '{agent_function_name}' not found or not callable in the provided code.")
+
+    # Execute the agent function
+    agent_error = None
+    result = None
+    try:
+        result = agent_function(connection)
+
         # Apply post-processing if provided
         if post_processing is not None:
             result = post_processing(result)
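The exec-then-call pattern above is the core of the new SQL node. A hedged sketch of how it consumes state, with a stand-in state key and snippet (the agent's real generated code will differ); note the import lives inside the generated function, which is why `relocate_imports_inside_function` matters when `exec` is given separate globals and locals:

``` python
# Hedged sketch of the exec()/local_vars.get() steps in
# node_func_execute_agent_from_sql_connection. The "sql_database_function"
# key and the snippet body are stand-ins, not the agent's actual output.
import sqlalchemy as sql

engine = sql.create_engine("sqlite:///:memory:")

state = {
    "sql_database_function": (
        "def sql_database_agent(connection):\n"
        "    import pandas as pd  # imports relocated inside the function\n"
        "    return pd.read_sql('SELECT 1 AS x', connection)\n"
    )
}

# Define the generated function in an isolated namespace, then invoke it.
local_vars = {}
exec(state["sql_database_function"], {}, local_vars)
agent_function = local_vars["sql_database_agent"]

with engine.connect() as conn:
    result = agent_function(conn)

print(result.to_dict())  # {'x': {0: 1}}
```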
@@ -267,6 +383,7 @@ def node_func_execute_agent_code_on_data(
     output = {result_key: result, error_key: agent_error}
     return output
 
+
 def node_func_fix_agent_code(
     state: Any,
     code_snippet_key: str,
@@ -326,7 +443,7 @@ def node_func_fix_agent_code(
     response = (llm | PythonOutputParser()).invoke(prompt)
 
     response = relocate_imports_inside_function(response)
-    response = add_comments_to_top(response, agent_name=
+    response = add_comments_to_top(response, agent_name=agent_name)
 
     # Log the response if requested
     if log:
ai_data_science_team/tools/metadata.py
ADDED
@@ -0,0 +1,230 @@
+import io
+import pandas as pd
+import sqlalchemy as sql
+from typing import Union, List, Dict
+
+def get_dataframe_summary(
+    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+    n_sample: int = 30,
+    skip_stats: bool = False,
+) -> List[str]:
+    """
+    Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
+    or a dictionary mapping names to DataFrames.
+
+    Parameters
+    ----------
+    dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+        - Single DataFrame: produce a single summary (returned within a one-element list).
+        - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+        - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+    n_sample : int, default 30
+        Number of rows to display in the "Data (first 30 rows)" section.
+    skip_stats : bool, default False
+        If True, skip the descriptive statistics and DataFrame info sections.
+
+    Example:
+    --------
+    ``` python
+    import pandas as pd
+    from sklearn.datasets import load_iris
+    data = load_iris(as_frame=True)
+    dataframes = {
+        "iris": data.frame,
+        "iris_target": data.target,
+    }
+    summaries = get_dataframe_summary(dataframes)
+    print(summaries[0])
+    ```
+
+    Returns
+    -------
+    list of str
+        A list of summaries, one for each provided DataFrame. Each summary includes:
+        - Shape of the DataFrame (rows, columns)
+        - Column data types
+        - Missing value percentage
+        - Unique value counts
+        - First 30 rows
+        - Descriptive statistics
+        - DataFrame info output
+    """
+
+    summaries = []
+
+    # --- Dictionary Case ---
+    if isinstance(dataframes, dict):
+        for dataset_name, df in dataframes.items():
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    # --- Single DataFrame Case ---
+    elif isinstance(dataframes, pd.DataFrame):
+        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+    # --- List of DataFrames Case ---
+    elif isinstance(dataframes, list):
+        for idx, df in enumerate(dataframes):
+            dataset_name = f"Dataset_{idx}"
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    else:
+        raise TypeError(
+            "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+        )
+
+    return summaries
+
+
+def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+    """Generate a summary string for a single DataFrame."""
+    # 1. Convert dictionary-type cells to strings
+    #    This prevents unhashable dict errors during df.nunique().
+    df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+    # 2. Capture df.info() output
+    buffer = io.StringIO()
+    df.info(buf=buffer)
+    info_text = buffer.getvalue()
+
+    # 3. Calculate missing value stats
+    missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+    missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+    # 4. Get column data types
+    column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+    # 5. Get unique value counts
+    unique_counts = df.nunique()  # Will no longer fail on unhashable dict
+    unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+    # 6. Generate the summary text
+    if not skip_stats:
+        summary_text = f"""
+        Dataset Name: {dataset_name}
+        ----------------------------
+        Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+        Column Data Types:
+        {column_types}
+
+        Missing Value Percentage:
+        {missing_summary}
+
+        Unique Value Counts:
+        {unique_counts_summary}
+
+        Data (first {n_sample} rows):
+        {df.head(n_sample).to_string()}
+
+        Data Description:
+        {df.describe().to_string()}
+
+        Data Info:
+        {info_text}
+        """
+    else:
+        summary_text = f"""
+        Dataset Name: {dataset_name}
+        ----------------------------
+        Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+        Column Data Types:
+        {column_types}
+
+        Data (first {n_sample} rows):
+        {df.head(n_sample).to_string()}
+        """
+
+    return summary_text.strip()
+
+
+
+def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
+                          n_samples: int = 10) -> str:
+    """
+    Collects metadata and sample data from a database, with safe identifier quoting and
+    basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
+
+    Parameters
+    ----------
+    connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
+        An active SQLAlchemy connection or engine.
+    n_samples : int
+        Number of sample values to retrieve for each column.
+
+    Returns
+    -------
+    str
+        A formatted string with database metadata, including some sample data from each column.
+    """
+
+    # If a connection is passed, use it; if an engine is passed, connect to it
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+
+    output = []
+    try:
+        # Grab the engine off the connection
+        sql_engine = conn.engine
+        dialect_name = sql_engine.dialect.name.lower()
+
+        output.append(f"Database Dialect: {sql_engine.dialect.name}")
+        output.append(f"Driver: {sql_engine.driver}")
+        output.append(f"Connection URL: {sql_engine.url}")
+
+        # Inspect the database
+        inspector = sql.inspect(sql_engine)
+        tables = inspector.get_table_names()
+        output.append(f"Tables: {tables}")
+        output.append(f"Schemas: {inspector.get_schema_names()}")
+
+        # Helper to build a dialect-specific limit clause
+        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
+            """
+            Returns a SQL query string to select N rows from the given column/table
+            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
+            """
+            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
+                # Common dialects supporting LIMIT
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+            elif "mssql" in dialect_name:
+                # Microsoft SQL Server syntax
+                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
+            elif "oracle" in dialect_name:
+                # Oracle syntax
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
+            else:
+                # Fallback
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+
+        # Prepare for quoting
+        preparer = inspector.bind.dialect.identifier_preparer
+
+        # For each table, get columns and sample data
+        for table_name in tables:
+            output.append(f"\nTable: {table_name}")
+            # Properly quote the table name
+            table_name_quoted = preparer.quote_identifier(table_name)
+
+            for column in inspector.get_columns(table_name):
+                col_name = column["name"]
+                col_type = column["type"]
+                output.append(f"  Column: {col_name} Type: {col_type}")
+
+                # Properly quote the column name
+                col_name_quoted = preparer.quote_identifier(col_name)
+
+                # Build a dialect-aware query with safe quoting
+                query = build_query(col_name_quoted, table_name_quoted, n_samples)
+
+                # Read a few sample values
+                df = pd.read_sql(sql.text(query), conn)
+                first_values = df[col_name].tolist()
+                output.append(f"  First {n_samples} Values: {first_values}")
+
+    finally:
+        # Close connection if created inside the function
+        if is_engine:
+            conn.close()
+
+    return "\n".join(output)
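For a quick smoke test of the new metadata helpers, a hedged sketch against an in-memory SQLite database; the import path is an assumption based on the wheel layout shown in the RECORD hunk below:

``` python
# Hedged smoke test for the new tools/metadata.py module. The import path
# (ai_data_science_team.tools.metadata) is assumed from the RECORD listing.
import pandas as pd
import sqlalchemy as sql

from ai_data_science_team.tools.metadata import get_database_metadata, get_dataframe_summary

engine = sql.create_engine("sqlite:///:memory:")  # SQLAlchemy reuses one connection for :memory:
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
df.to_sql("users", engine, index=False)

print(get_database_metadata(engine, n_samples=3))  # dialect, tables, columns, sample values
print(get_dataframe_summary(df, n_sample=3)[0])    # one summary string per DataFrame
```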
ai_data_science_team/tools/regex.py
CHANGED
@@ -64,10 +64,16 @@ def add_comments_to_top(code_text, agent_name="data_wrangler"):
     header_comments = [
         "# Disclaimer: This function was generated by AI. Please review before using.",
         f"# Agent Name: {agent_name}",
-        f"# Time Created: {time_created}",
+        f"# Time Created: {time_created}\n",
         ""
     ]
 
     # Join the header with newlines, then prepend to the existing code_text
     header_block = "\n".join(header_comments)
     return header_block + code_text
+
+def format_agent_name(agent_name: str) -> str:
+
+    formatted_name = agent_name.strip().replace("_", " ").upper()
+
+    return f"---{formatted_name}----"
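The new helper is trivial but worth seeing once; note the dashes in the template string are asymmetric (three before, four after). The import path assumes the module location given in the file list above:

``` python
# Assuming format_agent_name ships in ai_data_science_team/tools/regex.py.
from ai_data_science_team.tools.regex import format_agent_name

print(format_agent_name("data_cleaning_agent"))  # ---DATA CLEANING AGENT----
```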
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: ai-data-science-team
-Version: 0.0.0.9005
+Version: 0.0.0.9007
 Summary: Build and run an AI-powered data science team.
 Home-page: https://github.com/business-science/ai-data-science-team
 Author: Matt Dancho
@@ -21,12 +21,22 @@ Requires-Dist: plotly
 Requires-Dist: streamlit
 Requires-Dist: scikit-learn
 Requires-Dist: xgboost
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # Your AI Data Science Team (An Army Of Copilots)
 
 **An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.
 
-Star ⭐ This GitHub (Takes 2 seconds and means a lot)
+**Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
+
+*Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*
 
 ---
 
@@ -39,6 +49,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
 - Credit Card Risk
 - And more
 
+## Table of Contents
+
+- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
+- [Table of Contents](#table-of-contents)
+- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
+- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
+- [Data Science Agents](#data-science-agents)
+- [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
+- [Agents Available Now](#agents-available-now)
+- [Agents Coming Soon](#agents-coming-soon)
+- [Disclaimer](#disclaimer)
+- [Installation](#installation)
+- [Usage](#usage)
+- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
+- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
+- [Contributing](#contributing)
+- [License](#license)
+
 ## Companies That Want An AI Data Science Team Copilot
 
 If you are interested in having your own custom enteprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
@@ -53,11 +81,19 @@ This project is a work in progress. New data science agents will be released soo
 
 
 
+### Coming Soon: Multi-Agents
+
+This is the internals of the Business Intelligence SQL Agent I'm working on:
+
+
+
 ### Agents Available Now
 
 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
-2. **Data
-3. **
+2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
+3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
+4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
 
 ### Agents Coming Soon
 
@@ -78,23 +114,6 @@ This project is a work in progress. New data science agents will be released soo
 
 By using this software, you agree to use it solely for learning purposes.
 
-## Table of Contents
-
-- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
-- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
-- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
-- [Data Science Agents](#data-science-agents)
-- [Agents Available Now](#agents-available-now)
-- [Agents Coming Soon](#agents-coming-soon)
-- [Disclaimer](#disclaimer)
-- [Table of Contents](#table-of-contents)
-- [Installation](#installation)
-- [Usage](#usage)
-- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
-- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
-- [Contributing](#contributing)
-- [License](#license)
-
 ## Installation
 
 ``` bash
@@ -103,6 +122,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
 
 ## Usage
 
+[See all examples here.](/examples)
+
 ### Example 1: Feature Engineering with the Feature Engineering Agent
 
 [See the full example here.](/examples/feature_engineering_agent.ipynb)
ai_data_science_team-0.0.0.9007.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ai_data_science_team/_version.py,sha256=VJYpfOaKsXjGzPOsT6kYyVW6T9bFBqxt6Ph3qF8t-A8,26
+ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
+ai_data_science_team/agents/__init__.py,sha256=rcF18rBsOuPJqJKvoffh6lwr4Nwm24MErM2u4H4Th9s,467
+ai_data_science_team/agents/data_cleaning_agent.py,sha256=gixYY4wGehKK_ROgU7CVOzijghmVQGD4hyK9uKhc8Hw,20890
+ai_data_science_team/agents/data_visualization_agent.py,sha256=wePFZbdB4kBah8m_iy6f4IDyjl6L6zBWzIgigJEXdk8,12933
+ai_data_science_team/agents/data_wrangling_agent.py,sha256=5w1kytoWLE4p3hj0YHVuXcgCd304eNQac-Zrrgmnr2s,16735
+ai_data_science_team/agents/feature_engineering_agent.py,sha256=UaaU3VkPhjOV0NbrYXedRb6eHOcOWWiGYhB_srrYWvg,17571
+ai_data_science_team/agents/sql_database_agent.py,sha256=mRbEAPHP6NlwQac2_VL9RuyIfCCtrmXTrzu5RLzOoeU,16031
+ai_data_science_team/templates/__init__.py,sha256=bNrKGmWXQG7GRczln_zVfUQLzxzp7hSwlLyNtLxleu4,278
+ai_data_science_team/templates/agent_templates.py,sha256=xohVgEfxPcVukPLpPfV7mZ0cpFgp-oJVLZRWCv2V-WU,19948
+ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ai_data_science_team/tools/logging.py,sha256=7wFOv6GGhXR_RPbh-8p0GyrS608XOnZtiaGK2IbDl_s,2081
+ai_data_science_team/tools/metadata.py,sha256=tbnca_tDp67oBA6qD29AKVooJG10VqGr4vwzj4rPUas,8348
+ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
+ai_data_science_team/tools/regex.py,sha256=vkfdvi9pDe582p-fh_7cB07Wb0dOR2CsiVq-wUO3mas,2491
+ai_data_science_team-0.0.0.9007.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
+ai_data_science_team-0.0.0.9007.dist-info/METADATA,sha256=KcMFR2V9_wbepdKsrlFdfc7UB7t-Hf7i75x67LPXw3Q,6783
+ai_data_science_team-0.0.0.9007.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ai_data_science_team-0.0.0.9007.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
+ai_data_science_team-0.0.0.9007.dist-info/RECORD,,
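The `sha256=` values in RECORD follow the wheel spec (PEP 376/427): the raw SHA-256 digest, urlsafe-base64 encoded with trailing padding stripped. A short check against the empty `__init__.py` entries above:

``` python
# Recompute a RECORD-style digest; the empty-file hash below matches the
# zero-byte __init__.py entries in the RECORD hunk above.
import base64
import hashlib

def record_digest(data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

print(record_digest(b""))  # sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
```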