ai-data-science-team 0.0.0.9005__py3-none-any.whl → 0.0.0.9007__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +3 -1
- ai_data_science_team/agents/data_cleaning_agent.py +213 -20
- ai_data_science_team/agents/data_visualization_agent.py +331 -0
- ai_data_science_team/agents/data_wrangling_agent.py +66 -24
- ai_data_science_team/agents/feature_engineering_agent.py +50 -13
- ai_data_science_team/agents/sql_database_agent.py +397 -0
- ai_data_science_team/templates/__init__.py +8 -0
- ai_data_science_team/templates/agent_templates.py +154 -37
- ai_data_science_team/tools/logging.py +1 -1
- ai_data_science_team/tools/metadata.py +230 -0
- ai_data_science_team/tools/regex.py +7 -1
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA +43 -22
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +21 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/WHEEL +1 -1
- ai_data_science_team/tools/data_analysis.py +0 -116
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +0 -19
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/top_level.txt +0 -0
ai_data_science_team/templates/agent_templates.py
CHANGED
@@ -3,6 +3,7 @@ from langgraph.graph import StateGraph, END
 from langgraph.types import interrupt, Command
 
 import pandas as pd
+import sqlalchemy as sql
 
 from typing import Any, Callable, Dict, Type, Optional
 
@@ -22,7 +23,9 @@ def create_coding_agent_graph(
     retry_count_key: str = "retry_count",
     human_in_the_loop: bool = False,
     human_review_node_name: str = "human_review",
-    checkpointer: Optional[Callable] = None
+    checkpointer: Optional[Callable] = None,
+    bypass_recommended_steps: bool = False,
+    bypass_explain_code: bool = False,
 ):
     """
     Creates a generic agent graph using the provided node functions and node names.
@@ -63,7 +66,11 @@ def create_coding_agent_graph(
         The node name for human review if human_in_the_loop is True.
     checkpointer : callable, optional
        A checkpointer callable if desired.
-
+    bypass_recommended_steps : bool, optional
+        Whether to skip the recommended steps node.
+    bypass_explain_code : bool, optional
+        Whether to skip the final explain code node.
+
     Returns
     -------
     app : langchain.graphs.StateGraph
@@ -72,50 +79,76 @@ def create_coding_agent_graph(
 
     workflow = StateGraph(GraphState)
 
-    #
-
+    # Conditionally add the recommended-steps node
+    if not bypass_recommended_steps:
+        workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])
 
-    #
-    if human_in_the_loop:
-        workflow.add_node(human_review_node_name, node_functions[human_review_node_name])
-
-    # Add main nodes
+    # Always add create, execute, and fix nodes
     workflow.add_node(create_code_node_name, node_functions[create_code_node_name])
     workflow.add_node(execute_code_node_name, node_functions[execute_code_node_name])
     workflow.add_node(fix_code_node_name, node_functions[fix_code_node_name])
-
+
+    # Conditionally add the explanation node
+    if not bypass_explain_code:
+        workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])
 
     # Set the entry point
-
+    entry_point = create_code_node_name if bypass_recommended_steps else recommended_steps_node_name
+    workflow.set_entry_point(entry_point)
 
-    # Add edges
-    if
-
-
-
+    # Add edges for recommended steps
+    if not bypass_recommended_steps:
+        if human_in_the_loop:
+            workflow.add_edge(recommended_steps_node_name, human_review_node_name)
+        else:
+            workflow.add_edge(recommended_steps_node_name, create_code_node_name)
+    elif human_in_the_loop:
+        # Skip recommended steps but still include human review
+        workflow.add_edge(create_code_node_name, human_review_node_name)
 
-    #
+    # Create -> Execute
     workflow.add_edge(create_code_node_name, execute_code_node_name)
 
-    #
-
-
-
-    state.get(
-    state.get(
-    state
-
-
-
-
-
-
-
-
-
-
-
-
+    # Define a helper to check if we have an error & can still retry
+    def error_and_can_retry(state):
+        return (
+            state.get(error_key) is not None
+            and state.get(retry_count_key) is not None
+            and state.get(max_retries_key) is not None
+            and state[retry_count_key] < state[max_retries_key]
+        )
+
+    # ---- Split into two branches for bypass_explain_code ----
+    if not bypass_explain_code:
+        # If we are NOT bypassing explain, the next node is fix_code if error,
+        # else explain_code. Then we wire explain_code -> END afterward.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "explain_code",
+            {
+                "fix_code": fix_code_node_name,
+                "explain_code": explain_code_node_name,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+        # explain_code -> END
+        workflow.add_edge(explain_code_node_name, END)
+    else:
+        # If we ARE bypassing explain_code, the next node is fix_code if error,
+        # else straight to END.
+        workflow.add_conditional_edges(
+            execute_code_node_name,
+            lambda s: "fix_code" if error_and_can_retry(s) else "END",
+            {
+                "fix_code": fix_code_node_name,
+                "END": END,
+            },
+        )
+        # Fix code -> Execute again
+        workflow.add_edge(fix_code_node_name, execute_code_node_name)
+
+    # Finally, compile
     if human_in_the_loop and checkpointer is not None:
         app = workflow.compile(checkpointer=checkpointer)
     else:
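The routing above is easier to follow in isolation. Below is a minimal, self-contained sketch of the same pattern; the stub node functions, node names, and state schema are invented for illustration and are not the package's. It shows how `add_conditional_edges` sends the execute node to the fix node while retries remain, and how `bypass_explain_code` decides whether the success branch is an explain node or `END`.

```python
from typing import Optional, TypedDict

from langgraph.graph import StateGraph, END


class State(TypedDict, total=False):
    code: str
    error: Optional[str]
    retry_count: int
    max_retries: int


# Stub node functions standing in for the package's create/execute/fix/explain nodes.
def create_code(state: State) -> dict:
    return {"code": "print('hello')", "retry_count": 0}

def execute_code(state: State) -> dict:
    return {"error": None}  # pretend execution succeeded

def fix_code(state: State) -> dict:
    return {"retry_count": state.get("retry_count", 0) + 1}

def explain_code(state: State) -> dict:
    return {}


def build(bypass_explain_code: bool = False):
    wf = StateGraph(State)
    wf.add_node("create_code", create_code)
    wf.add_node("execute_code", execute_code)
    wf.add_node("fix_code", fix_code)
    if not bypass_explain_code:
        wf.add_node("explain_code", explain_code)

    wf.set_entry_point("create_code")
    wf.add_edge("create_code", "execute_code")

    # Mirrors error_and_can_retry above: only loop back while retries remain.
    def error_and_can_retry(s: State) -> bool:
        return s.get("error") is not None and s.get("retry_count", 0) < s.get("max_retries", 0)

    if not bypass_explain_code:
        wf.add_conditional_edges(
            "execute_code",
            lambda s: "fix_code" if error_and_can_retry(s) else "explain_code",
            {"fix_code": "fix_code", "explain_code": "explain_code"},
        )
        wf.add_edge("explain_code", END)
    else:
        wf.add_conditional_edges(
            "execute_code",
            lambda s: "fix_code" if error_and_can_retry(s) else "END",
            {"fix_code": "fix_code", "END": END},
        )
    # Routing fix back into execute is what creates the retry loop.
    wf.add_edge("fix_code", "execute_code")
    return wf.compile()


app = build(bypass_explain_code=True)
print(app.invoke({"max_retries": 3}))
```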
@@ -124,6 +157,7 @@ def create_coding_agent_graph(
     return app
 
 
+
 def node_func_human_review(
     state: Any,
     prompt_text: str,
@@ -256,6 +290,88 @@ def node_func_execute_agent_code_on_data(
         # if state.get("retry_count") == 0:
         #     10/0
 
+        # Apply post-processing if provided
+        if post_processing is not None:
+            result = post_processing(result)
+        else:
+            if isinstance(result, pd.DataFrame):
+                result = result.to_dict()
+
+    except Exception as e:
+        print(e)
+        agent_error = f"{error_message_prefix}{str(e)}"
+
+    # Return results
+    output = {result_key: result, error_key: agent_error}
+    return output
+
+def node_func_execute_agent_from_sql_connection(
+    state: Any,
+    connection: Any,
+    code_snippet_key: str,
+    result_key: str,
+    error_key: str,
+    agent_function_name: str,
+    post_processing: Optional[Callable[[Any], Any]] = None,
+    error_message_prefix: str = "An error occurred during agent execution: "
+) -> Dict[str, Any]:
+    """
+    Execute a generic agent code defined in a code snippet retrieved from the state on a SQLAlchemy connection object
+    and return the result.
+
+    Parameters
+    ----------
+    state : Any
+        A state object that supports `get(key: str)` method to retrieve values.
+    connection : str
+        The SQLAlchemy connection object to use for executing the agent function.
+    code_snippet_key : str
+        The key in the state used to retrieve the Python code snippet defining the agent function.
+    result_key : str
+        The key in the state used to store the result of the agent function.
+    error_key : str
+        The key in the state used to store the error message if any.
+    agent_function_name : str
+        The name of the function (e.g., 'sql_database_agent') expected to be defined in the code snippet.
+    post_processing : Callable[[Any], Any], optional
+        A function to postprocess the output of the agent function before returning it.
+    error_message_prefix : str, optional
+        A prefix or full message to use in the error output if an exception occurs.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing the result and/or error messages. Keys are arbitrary,
+        but typically include something like "result" or "error".
+    """
+
+    print(" * EXECUTING AGENT CODE ON SQL CONNECTION")
+
+    # Retrieve SQLAlchemy connection and code snippet from the state
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+    agent_code = state.get(code_snippet_key)
+
+    # Ensure the connection object is provided
+    if connection is None:
+        raise ValueError(f"Connection object not found.")
+
+    # Execute the code snippet to define the agent function
+    local_vars = {}
+    global_vars = {}
+    exec(agent_code, global_vars, local_vars)
+
+    # Retrieve the agent function from the executed code
+    agent_function = local_vars.get(agent_function_name, None)
+    if agent_function is None or not callable(agent_function):
+        raise ValueError(f"Agent function '{agent_function_name}' not found or not callable in the provided code.")
+
+    # Execute the agent function
+    agent_error = None
+    result = None
+    try:
+        result = agent_function(connection)
+
         # Apply post-processing if provided
         if post_processing is not None:
             result = post_processing(result)
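The heart of this new node is a three-step round trip: `exec()` the generated snippet into a scratch namespace, fetch the target function by name, then call it with the live connection. A runnable sketch under made-up inputs follows (the snippet, table, and in-memory SQLite engine are invented; in the real node the snippet comes from `state[code_snippet_key]`). Note that the import sits inside the generated function, which is plausibly why the `relocate_imports_inside_function` helper seen in the next hunk exists: a module-level import in the snippet would land in the scratch locals, out of reach of the function's globals.

```python
import sqlalchemy as sql

# Stand-in for the generated code the node would read from graph state.
agent_code = '''
def sql_database_agent(connection):
    import pandas as pd  # imported inside the function so exec() scoping is a non-issue
    return pd.read_sql("SELECT 1 AS answer", connection)
'''

engine = sql.create_engine("sqlite:///:memory:")
conn = engine.connect()

# Step 1: exec() the snippet into a scratch namespace.
local_vars: dict = {}
exec(agent_code, {}, local_vars)

# Step 2: fetch the function by its expected name.
agent_function = local_vars.get("sql_database_agent")
if agent_function is None or not callable(agent_function):
    raise ValueError("Agent function not found in the generated code.")

# Step 3: call it with the live connection.
result = agent_function(conn)  # a pandas DataFrame
print(result.to_dict())        # {'answer': {0: 1}}
conn.close()
```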
@@ -267,6 +383,7 @@ def node_func_execute_agent_code_on_data(
     output = {result_key: result, error_key: agent_error}
     return output
 
+
 def node_func_fix_agent_code(
     state: Any,
     code_snippet_key: str,
@@ -326,7 +443,7 @@ def node_func_fix_agent_code(
     response = (llm | PythonOutputParser()).invoke(prompt)
 
     response = relocate_imports_inside_function(response)
-    response = add_comments_to_top(response, agent_name=
+    response = add_comments_to_top(response, agent_name=agent_name)
 
     # Log the response if requested
     if log:
ai_data_science_team/tools/metadata.py
ADDED
@@ -0,0 +1,230 @@
+import io
+import pandas as pd
+import sqlalchemy as sql
+from typing import Union, List, Dict
+
+def get_dataframe_summary(
+    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]],
+    n_sample: int = 30,
+    skip_stats: bool = False,
+) -> List[str]:
+    """
+    Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
+    or a dictionary mapping names to DataFrames.
+
+    Parameters
+    ----------
+    dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
+        - Single DataFrame: produce a single summary (returned within a one-element list).
+        - List of DataFrames: produce a summary for each DataFrame, using index-based names.
+        - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.
+    n_sample : int, default 30
+        Number of rows to display in the "Data (first 30 rows)" section.
+    skip_stats : bool, default False
+        If True, skip the descriptive statistics and DataFrame info sections.
+
+    Example:
+    --------
+    ``` python
+    import pandas as pd
+    from sklearn.datasets import load_iris
+    data = load_iris(as_frame=True)
+    dataframes = {
+        "iris": data.frame,
+        "iris_target": data.target,
+    }
+    summaries = get_dataframe_summary(dataframes)
+    print(summaries[0])
+    ```
+
+    Returns
+    -------
+    list of str
+        A list of summaries, one for each provided DataFrame. Each summary includes:
+        - Shape of the DataFrame (rows, columns)
+        - Column data types
+        - Missing value percentage
+        - Unique value counts
+        - First 30 rows
+        - Descriptive statistics
+        - DataFrame info output
+    """
+
+    summaries = []
+
+    # --- Dictionary Case ---
+    if isinstance(dataframes, dict):
+        for dataset_name, df in dataframes.items():
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    # --- Single DataFrame Case ---
+    elif isinstance(dataframes, pd.DataFrame):
+        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset", n_sample, skip_stats))
+
+    # --- List of DataFrames Case ---
+    elif isinstance(dataframes, list):
+        for idx, df in enumerate(dataframes):
+            dataset_name = f"Dataset_{idx}"
+            summaries.append(_summarize_dataframe(df, dataset_name, n_sample, skip_stats))
+
+    else:
+        raise TypeError(
+            "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
+        )
+
+    return summaries
+
+
+def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
+    """Generate a summary string for a single DataFrame."""
+    # 1. Convert dictionary-type cells to strings
+    #    This prevents unhashable dict errors during df.nunique().
+    df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))
+
+    # 2. Capture df.info() output
+    buffer = io.StringIO()
+    df.info(buf=buffer)
+    info_text = buffer.getvalue()
+
+    # 3. Calculate missing value stats
+    missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
+    missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])
+
+    # 4. Get column data types
+    column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])
+
+    # 5. Get unique value counts
+    unique_counts = df.nunique()  # Will no longer fail on unhashable dict
+    unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])
+
+    # 6. Generate the summary text
+    if not skip_stats:
+        summary_text = f"""
+Dataset Name: {dataset_name}
+----------------------------
+Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+Column Data Types:
+{column_types}
+
+Missing Value Percentage:
+{missing_summary}
+
+Unique Value Counts:
+{unique_counts_summary}
+
+Data (first {n_sample} rows):
+{df.head(n_sample).to_string()}
+
+Data Description:
+{df.describe().to_string()}
+
+Data Info:
+{info_text}
+"""
+    else:
+        summary_text = f"""
+Dataset Name: {dataset_name}
+----------------------------
+Shape: {df.shape[0]} rows x {df.shape[1]} columns
+
+Column Data Types:
+{column_types}
+
+Data (first {n_sample} rows):
+{df.head(n_sample).to_string()}
+"""
+
+    return summary_text.strip()
+
+
+
+def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
+                          n_samples: int = 10) -> str:
+    """
+    Collects metadata and sample data from a database, with safe identifier quoting and
+    basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
+
+    Parameters
+    ----------
+    connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
+        An active SQLAlchemy connection or engine.
+    n_samples : int
+        Number of sample values to retrieve for each column.
+
+    Returns
+    -------
+    str
+        A formatted string with database metadata, including some sample data from each column.
+    """
+
+    # If a connection is passed, use it; if an engine is passed, connect to it
+    is_engine = isinstance(connection, sql.engine.base.Engine)
+    conn = connection.connect() if is_engine else connection
+
+    output = []
+    try:
+        # Grab the engine off the connection
+        sql_engine = conn.engine
+        dialect_name = sql_engine.dialect.name.lower()
+
+        output.append(f"Database Dialect: {sql_engine.dialect.name}")
+        output.append(f"Driver: {sql_engine.driver}")
+        output.append(f"Connection URL: {sql_engine.url}")
+
+        # Inspect the database
+        inspector = sql.inspect(sql_engine)
+        tables = inspector.get_table_names()
+        output.append(f"Tables: {tables}")
+        output.append(f"Schemas: {inspector.get_schema_names()}")
+
+        # Helper to build a dialect-specific limit clause
+        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
+            """
+            Returns a SQL query string to select N rows from the given column/table
+            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
+            """
+            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
+                # Common dialects supporting LIMIT
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+            elif "mssql" in dialect_name:
+                # Microsoft SQL Server syntax
+                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
+            elif "oracle" in dialect_name:
+                # Oracle syntax
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
+            else:
+                # Fallback
+                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
+
+        # Prepare for quoting
+        preparer = inspector.bind.dialect.identifier_preparer
+
+        # For each table, get columns and sample data
+        for table_name in tables:
+            output.append(f"\nTable: {table_name}")
+            # Properly quote the table name
+            table_name_quoted = preparer.quote_identifier(table_name)
+
+            for column in inspector.get_columns(table_name):
+                col_name = column["name"]
+                col_type = column["type"]
+                output.append(f"  Column: {col_name} Type: {col_type}")
+
+                # Properly quote the column name
+                col_name_quoted = preparer.quote_identifier(col_name)
+
+                # Build a dialect-aware query with safe quoting
+                query = build_query(col_name_quoted, table_name_quoted, n_samples)
+
+                # Read a few sample values
+                df = pd.read_sql(sql.text(query), conn)
+                first_values = df[col_name].tolist()
+                output.append(f"    First {n_samples} Values: {first_values}")
+
+    finally:
+        # Close connection if created inside the function
+        if is_engine:
+            conn.close()
+
+    return "\n".join(output)
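A quick smoke test of the new module; the import path follows the package layout in the file list above, while the DataFrame and SQLite fixture are invented for illustration:

```python
import pandas as pd
import sqlalchemy as sql

from ai_data_science_team.tools.metadata import get_dataframe_summary, get_database_metadata

# DataFrame summaries: accepts a single frame, a list, or a dict of frames.
df = pd.DataFrame({"id": [1, 2, 3], "city": ["NYC", "LA", None]})
print(get_dataframe_summary({"customers": df})[0])

# Database metadata: pass an engine and the function opens/closes its own connection.
engine = sql.create_engine("sqlite:///:memory:")
df.to_sql("customers", engine, index=False)
print(get_database_metadata(engine, n_samples=3))
```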
ai_data_science_team/tools/regex.py
CHANGED
@@ -64,10 +64,16 @@ def add_comments_to_top(code_text, agent_name="data_wrangler"):
     header_comments = [
         "# Disclaimer: This function was generated by AI. Please review before using.",
         f"# Agent Name: {agent_name}",
-        f"# Time Created: {time_created}",
+        f"# Time Created: {time_created}\n",
         ""
     ]
 
     # Join the header with newlines, then prepend to the existing code_text
     header_block = "\n".join(header_comments)
     return header_block + code_text
+
+def format_agent_name(agent_name: str) -> str:
+
+    formatted_name = agent_name.strip().replace("_", " ").upper()
+
+    return f"---{formatted_name}----"
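Both helpers are plain string transforms, so a short check makes the behavior concrete. The import assumes the `ai_data_science_team.tools.regex` module path from the file list; the timestamp in the output will vary, and the `\n` added to the Time Created line in this hunk is what separates the header from the generated code. Note also the asymmetric dashes in `format_agent_name` (three before, four after), exactly as written above:

```python
from ai_data_science_team.tools.regex import add_comments_to_top, format_agent_name

print(format_agent_name("data_wrangling_agent"))
# ---DATA WRANGLING AGENT----

code = "def clean(df):\n    return df.dropna()\n"
print(add_comments_to_top(code, agent_name="data_cleaning_agent"))
# # Disclaimer: This function was generated by AI. Please review before using.
# # Agent Name: data_cleaning_agent
# # Time Created: <timestamp>
#
# def clean(df):
#     return df.dropna()
```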
{ai_data_science_team-0.0.0.9005.dist-info → ai_data_science_team-0.0.0.9007.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: ai-data-science-team
-Version: 0.0.0.9005
+Version: 0.0.0.9007
 Summary: Build and run an AI-powered data science team.
 Home-page: https://github.com/business-science/ai-data-science-team
 Author: Matt Dancho
@@ -21,12 +21,22 @@ Requires-Dist: plotly
 Requires-Dist: streamlit
 Requires-Dist: scikit-learn
 Requires-Dist: xgboost
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # Your AI Data Science Team (An Army Of Copilots)
 
 **An AI-powered data science team of copilots that uses agents to help you perform common data science tasks 10X faster**.
 
-Star ⭐ This GitHub (Takes 2 seconds and means a lot)
+**Star ⭐ This GitHub (Takes 2 seconds and means a lot).**
+
+*Beta - This Python library is under active development. There may be breaking changes that occur until release of 0.1.0.*
 
 ---
 
@@ -39,6 +49,24 @@ The AI Data Science Team of Copilots includes Agents that specialize data cleani
 - Credit Card Risk
 - And more
 
+## Table of Contents
+
+- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
+- [Table of Contents](#table-of-contents)
+- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
+- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
+- [Data Science Agents](#data-science-agents)
+- [Coming Soon: Multi-Agents](#coming-soon-multi-agents)
+- [Agents Available Now](#agents-available-now)
+- [Agents Coming Soon](#agents-coming-soon)
+- [Disclaimer](#disclaimer)
+- [Installation](#installation)
+- [Usage](#usage)
+- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
+- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
+- [Contributing](#contributing)
+- [License](#license)
+
 ## Companies That Want An AI Data Science Team Copilot
 
 If you are interested in having your own custom enteprise-grade AI Data Science Team Copilot, send inquiries here: [https://www.business-science.io/contact.html](https://www.business-science.io/contact.html)
@@ -53,11 +81,19 @@ This project is a work in progress. New data science agents will be released soo
 
 
 
+### Coming Soon: Multi-Agents
+
+This is the internals of the Business Intelligence SQL Agent I'm working on:
+
+
+
 ### Agents Available Now
 
 1. **Data Wrangling Agent:** Merges, Joins, Preps and Wrangles data into a format that is ready for data analysis.
-2. **Data
-3. **
+2. **Data Visualization Agent:** Creates visualizations to help you understand your data. Returns JSON serializable plotly visualizations.
+3. **Data Cleaning Agent:** Performs Data Preparation steps including handling missing values, outliers, and data type conversions.
+4. **Feature Engineering Agent:** Converts the prepared data into ML-ready data. Adds features to increase predictive accuracy of ML models.
+5. **SQL Database Agent:** Connects to SQL databases to pull data into the data science environment. Creates pipelines to automate data extraction. Performs Joins, Aggregations, and other SQL Query operations.
 
 ### Agents Coming Soon
 
@@ -78,23 +114,6 @@ This project is a work in progress. New data science agents will be released soo
 
 By using this software, you agree to use it solely for learning purposes.
 
-## Table of Contents
-
-- [Your AI Data Science Team (An Army Of Copilots)](#your-ai-data-science-team-an-army-of-copilots)
-- [Companies That Want An AI Data Science Team Copilot](#companies-that-want-an-ai-data-science-team-copilot)
-- [Free Generative AI For Data Scientists Workshop](#free-generative-ai-for-data-scientists-workshop)
-- [Data Science Agents](#data-science-agents)
-- [Agents Available Now](#agents-available-now)
-- [Agents Coming Soon](#agents-coming-soon)
-- [Disclaimer](#disclaimer)
-- [Table of Contents](#table-of-contents)
-- [Installation](#installation)
-- [Usage](#usage)
-- [Example 1: Feature Engineering with the Feature Engineering Agent](#example-1-feature-engineering-with-the-feature-engineering-agent)
-- [Example 2: Cleaning Data with the Data Cleaning Agent](#example-2-cleaning-data-with-the-data-cleaning-agent)
-- [Contributing](#contributing)
-- [License](#license)
-
 ## Installation
 
 ``` bash
@@ -103,6 +122,8 @@ pip install git+https://github.com/business-science/ai-data-science-team.git --u
 
 ## Usage
 
+[See all examples here.](/examples)
+
 ### Example 1: Feature Engineering with the Feature Engineering Agent
 
 [See the full example here.](/examples/feature_engineering_agent.ipynb)
ai_data_science_team-0.0.0.9007.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ai_data_science_team/_version.py,sha256=VJYpfOaKsXjGzPOsT6kYyVW6T9bFBqxt6Ph3qF8t-A8,26
+ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
+ai_data_science_team/agents/__init__.py,sha256=rcF18rBsOuPJqJKvoffh6lwr4Nwm24MErM2u4H4Th9s,467
+ai_data_science_team/agents/data_cleaning_agent.py,sha256=gixYY4wGehKK_ROgU7CVOzijghmVQGD4hyK9uKhc8Hw,20890
+ai_data_science_team/agents/data_visualization_agent.py,sha256=wePFZbdB4kBah8m_iy6f4IDyjl6L6zBWzIgigJEXdk8,12933
+ai_data_science_team/agents/data_wrangling_agent.py,sha256=5w1kytoWLE4p3hj0YHVuXcgCd304eNQac-Zrrgmnr2s,16735
+ai_data_science_team/agents/feature_engineering_agent.py,sha256=UaaU3VkPhjOV0NbrYXedRb6eHOcOWWiGYhB_srrYWvg,17571
+ai_data_science_team/agents/sql_database_agent.py,sha256=mRbEAPHP6NlwQac2_VL9RuyIfCCtrmXTrzu5RLzOoeU,16031
+ai_data_science_team/templates/__init__.py,sha256=bNrKGmWXQG7GRczln_zVfUQLzxzp7hSwlLyNtLxleu4,278
+ai_data_science_team/templates/agent_templates.py,sha256=xohVgEfxPcVukPLpPfV7mZ0cpFgp-oJVLZRWCv2V-WU,19948
+ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ai_data_science_team/tools/logging.py,sha256=7wFOv6GGhXR_RPbh-8p0GyrS608XOnZtiaGK2IbDl_s,2081
+ai_data_science_team/tools/metadata.py,sha256=tbnca_tDp67oBA6qD29AKVooJG10VqGr4vwzj4rPUas,8348
+ai_data_science_team/tools/parsers.py,sha256=BAi-fJT7BBt9nRS3w5n9LDTsu7JAJsH8CAI9-Qf7jCs,2086
+ai_data_science_team/tools/regex.py,sha256=vkfdvi9pDe582p-fh_7cB07Wb0dOR2CsiVq-wUO3mas,2491
+ai_data_science_team-0.0.0.9007.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
+ai_data_science_team-0.0.0.9007.dist-info/METADATA,sha256=KcMFR2V9_wbepdKsrlFdfc7UB7t-Hf7i75x67LPXw3Q,6783
+ai_data_science_team-0.0.0.9007.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ai_data_science_team-0.0.0.9007.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
+ai_data_science_team-0.0.0.9007.dist-info/RECORD,,