ai-data-science-team 0.0.0.9012__py3-none-any.whl → 0.0.0.9013__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- __version__ = "0.0.0.9012"
1
+ __version__ = "0.0.0.9013"
@@ -25,6 +25,7 @@ from ai_data_science_team.tools.data_loader import (
25
25
  get_file_info,
26
26
  search_files_by_pattern,
27
27
  )
28
+ from ai_data_science_team.utils.messages import get_tool_call_names
28
29
 
29
30
  AGENT_NAME = "data_loader_tools_agent"
30
31
 
@@ -174,6 +175,12 @@ class DataLoaderToolsAgent(BaseAgent):
174
175
  return Markdown(self.response["messages"][0].content)
175
176
  else:
176
177
  return self.response["messages"][0].content
178
+
179
+ def get_tool_calls(self):
180
+ """
181
+ Returns the tool calls made by the agent.
182
+ """
183
+ return self.response["tool_calls"]
177
184
 
178
185
 
179
186
 
@@ -204,6 +211,7 @@ def make_data_loader_tools_agent(
204
211
  internal_messages: Annotated[Sequence[BaseMessage], operator.add]
205
212
  user_instructions: str
206
213
  data_loader_artifacts: dict
214
+ tool_calls: List[str]
207
215
 
208
216
  def data_loader_agent(state):
209
217
 
@@ -253,10 +261,13 @@ def make_data_loader_tools_agent(
253
261
  elif isinstance(last_message, dict) and "artifact" in last_message:
254
262
  last_tool_artifact = last_message["artifact"]
255
263
 
264
+ tool_calls = get_tool_call_names(internal_messages)
265
+
256
266
  return {
257
267
  "messages": [last_ai_message],
258
268
  "internal_messages": internal_messages,
259
269
  "data_loader_artifacts": last_tool_artifact,
270
+ "tool_calls": tool_calls,
260
271
  }
261
272
 
262
273
  workflow = StateGraph(GraphState)
@@ -19,17 +19,20 @@ from ai_data_science_team.templates import BaseAgent
19
19
  from ai_data_science_team.utils.regex import format_agent_name
20
20
 
21
21
  from ai_data_science_team.tools.eda import (
22
+ explain_data,
22
23
  describe_dataset,
23
24
  visualize_missing,
24
25
  correlation_funnel,
25
26
  generate_sweetviz_report,
26
27
  )
28
+ from ai_data_science_team.utils.messages import get_tool_call_names
27
29
 
28
30
 
29
31
  AGENT_NAME = "exploratory_data_analyst_agent"
30
32
 
31
33
  # Updated tool list for EDA
32
34
  EDA_TOOLS = [
35
+ explain_data,
33
36
  describe_dataset,
34
37
  visualize_missing,
35
38
  correlation_funnel,
@@ -162,6 +165,12 @@ class EDAToolsAgent(BaseAgent):
162
165
  return Markdown(self.response["messages"][0].content)
163
166
  else:
164
167
  return self.response["messages"][0].content
168
+
169
+ def get_tool_calls(self):
170
+ """
171
+ Returns the tool calls made by the agent.
172
+ """
173
+ return self.response["tool_calls"]
165
174
 
166
175
  def make_eda_tools_agent(
167
176
  model: Any,
@@ -191,6 +200,7 @@ def make_eda_tools_agent(
191
200
  user_instructions: str
192
201
  data_raw: dict
193
202
  eda_artifacts: dict
203
+ tool_calls: list
194
204
 
195
205
  def exploratory_agent(state):
196
206
  print(format_agent_name(AGENT_NAME))
@@ -229,11 +239,14 @@ def make_eda_tools_agent(
229
239
  last_tool_artifact = last_message.artifact
230
240
  elif isinstance(last_message, dict) and "artifact" in last_message:
231
241
  last_tool_artifact = last_message["artifact"]
242
+
243
+ tool_calls = get_tool_call_names(internal_messages)
232
244
 
233
245
  return {
234
246
  "messages": [last_ai_message],
235
247
  "internal_messages": internal_messages,
236
248
  "eda_artifacts": last_tool_artifact,
249
+ "tool_calls": tool_calls,
237
250
  }
238
251
 
239
252
  workflow = StateGraph(GraphState)
@@ -27,6 +27,7 @@ from ai_data_science_team.tools.mlflow import (
27
27
  mlflow_search_registered_models,
28
28
  mlflow_get_model_version_details,
29
29
  )
30
+ from ai_data_science_team.utils.messages import get_tool_call_names
30
31
 
31
32
  AGENT_NAME = "mlflow_tools_agent"
32
33
 
@@ -228,6 +229,12 @@ class MLflowToolsAgent(BaseAgent):
228
229
  return Markdown(self.response["messages"][0].content)
229
230
  else:
230
231
  return self.response["messages"][0].content
232
+
233
+ def get_tool_calls(self):
234
+ """
235
+ Returns the tool calls made by the agent.
236
+ """
237
+ return self.response["tool_calls"]
231
238
 
232
239
 
233
240
 
@@ -330,10 +337,13 @@ def make_mlflow_tools_agent(
330
337
  elif isinstance(last_message, dict) and "artifact" in last_message:
331
338
  last_tool_artifact = last_message["artifact"]
332
339
 
340
+ tool_calls = get_tool_call_names(internal_messages)
341
+
333
342
  return {
334
343
  "messages": [last_ai_message],
335
344
  "internal_messages": internal_messages,
336
345
  "mlflow_artifacts": last_tool_artifact,
346
+ "tool_calls": tool_calls,
337
347
  }
338
348
 
339
349
 
@@ -74,7 +74,12 @@ def get_dataframe_summary(
74
74
  return summaries
75
75
 
76
76
 
77
- def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_stats=False) -> str:
77
+ def _summarize_dataframe(
78
+ df: pd.DataFrame,
79
+ dataset_name: str,
80
+ n_sample=30,
81
+ skip_stats=False
82
+ ) -> str:
78
83
  """Generate a summary string for a single DataFrame."""
79
84
  # 1. Convert dictionary-type cells to strings
80
85
  # This prevents unhashable dict errors during df.nunique().
@@ -2,11 +2,44 @@
2
2
  from typing import Annotated, Dict, Tuple, Union
3
3
 
4
4
  import os
5
+ import tempfile
5
6
 
6
7
  from langchain.tools import tool
7
8
 
8
9
  from langgraph.prebuilt import InjectedState
9
10
 
11
+ from ai_data_science_team.tools.dataframe import get_dataframe_summary
12
+
13
+
14
+ @tool(response_format='content')
15
+ def explain_data(
16
+ data_raw: Annotated[dict, InjectedState("data_raw")],
17
+ n_sample: int = 30,
18
+ skip_stats: bool = False,
19
+ ):
20
+ """
21
+ Tool: explain_data
22
+ Description:
23
+ Provides an extensive, narrative summary of a DataFrame including its shape, column types,
24
+ missing value percentages, unique counts, sample rows, and (if not skipped) descriptive stats/info.
25
+
26
+ Parameters:
27
+ data_raw (dict): Raw data.
28
+ n_sample (int, default=30): Number of rows to display.
29
+ skip_stats (bool, default=False): If True, omit descriptive stats/info.
30
+
31
+ LLM Guidance:
32
+ Use when a detailed, human-readable explanation is needed—i.e., a full overview is preferred over a concise numerical summary.
33
+
34
+ Returns:
35
+ str: Detailed DataFrame summary.
36
+ """
37
+ print(" * Tool: explain_data")
38
+ import pandas as pd
39
+
40
+ result = get_dataframe_summary(pd.DataFrame(data_raw), n_sample=n_sample, skip_stats=skip_stats)
41
+
42
+ return result
10
43
 
11
44
  @tool(response_format='content_and_artifact')
12
45
  def describe_dataset(
@@ -15,21 +48,33 @@ def describe_dataset(
15
48
  """
16
49
  Tool: describe_dataset
17
50
  Description:
18
- Describe the dataset by computing summary
19
- statistics using the DataFrame's describe() method.
20
-
51
+ Compute and return summary statistics for the dataset using pandas' describe() method.
52
+ The tool provides both a textual summary and a structured artifact (a dictionary) for further processing.
53
+
54
+ Parameters:
55
+ -----------
56
+ data_raw : dict
57
+ The raw data in dictionary format.
58
+
59
+ LLM Selection Guidance:
60
+ ------------------------
61
+ Use this tool when:
62
+ - The request emphasizes numerical descriptive statistics (e.g., count, mean, std, min, quartiles, max).
63
+ - The user needs a concise statistical snapshot rather than a detailed narrative.
64
+ - Both a brief text explanation and a structured data artifact (for downstream tasks) are required.
65
+
21
66
  Returns:
22
67
  -------
23
68
  Tuple[str, Dict]:
24
- content: A textual summary of the DataFrame's descriptive statistics.
25
- artifact: A dictionary (from DataFrame.describe()) for further inspection.
69
+ - content: A textual summary indicating that summary statistics have been computed.
70
+ - artifact: A dictionary (derived from DataFrame.describe()) containing detailed statistical measures.
26
71
  """
27
72
  print(" * Tool: describe_dataset")
28
73
  import pandas as pd
29
74
  df = pd.DataFrame(data_raw)
30
75
  description_df = df.describe(include='all')
31
76
  content = "Summary statistics computed using pandas describe()."
32
- artifact = description_df.to_dict()
77
+ artifact = {'describe_df': description_df.to_dict()}
33
78
  return content, artifact
34
79
 
35
80
 
@@ -226,8 +271,8 @@ def generate_sweetviz_report(
226
271
  data_raw: Annotated[dict, InjectedState("data_raw")],
227
272
  target: str = None,
228
273
  report_name: str = "sweetviz_report.html",
229
- report_directory: str = os.path.join(os.getcwd(), "reports"),
230
- open_browser: bool = True,
274
+ report_directory: str = None, # <-- Default to None
275
+ open_browser: bool = False,
231
276
  ) -> Tuple[str, Dict]:
232
277
  """
233
278
  Tool: generate_sweetviz_report
@@ -243,9 +288,10 @@ def generate_sweetviz_report(
243
288
  report_name : str, optional
244
289
  The file name to save the Sweetviz HTML report. Default is "sweetviz_report.html".
245
290
  report_directory : str, optional
246
- The directory where the report should be saved. Defaults to a 'reports' directory in the current working directory.
291
+ The directory where the report should be saved.
292
+ If None, a temporary directory is created and used.
247
293
  open_browser : bool, optional
248
- Whether to open the report in a web browser. Default is True.
294
+ Whether to open the report in a web browser. Default is False.
249
295
 
250
296
  Returns:
251
297
  --------
@@ -254,28 +300,37 @@ def generate_sweetviz_report(
254
300
  artifact: A dictionary with the report file path and optionally the report's HTML content.
255
301
  """
256
302
  print(" * Tool: generate_sweetviz_report")
303
+
304
+ # Import sweetviz
257
305
  try:
258
306
  import sweetviz as sv
259
307
  except ImportError:
260
308
  raise ImportError("Please install the 'sweetviz' package to use this tool. Run: pip install sweetviz")
309
+
261
310
  import pandas as pd
311
+
262
312
  # Convert injected raw data to a DataFrame.
263
313
  df = pd.DataFrame(data_raw)
264
314
 
315
+ # If no directory is specified, use a temporary directory.
316
+ if not report_directory:
317
+ report_directory = tempfile.mkdtemp()
318
+ print(f" * Using temporary directory: {report_directory}")
319
+ else:
320
+ # Ensure user-specified directory exists.
321
+ if not os.path.exists(report_directory):
322
+ os.makedirs(report_directory)
323
+
265
324
  # Create the Sweetviz report.
266
325
  report = sv.analyze(df, target_feat=target)
267
326
 
268
- # Ensure the directory exists; default is os.getcwd()/reports
269
- if not os.path.exists(report_directory):
270
- os.makedirs(report_directory)
271
-
272
327
  # Determine the full path for the report.
273
328
  full_report_path = os.path.join(report_directory, report_name)
274
329
 
275
330
  # Save the report to the specified HTML file.
276
331
  report.show_html(
277
332
  filepath=full_report_path,
278
- open_browser=True,
333
+ open_browser=open_browser,
279
334
  )
280
335
 
281
336
  # Optionally, read the HTML content (if desired to pass along in the artifact).
@@ -285,9 +340,13 @@ def generate_sweetviz_report(
285
340
  except Exception:
286
341
  html_content = None
287
342
 
288
- content = f"Sweetviz EDA report generated and saved as '{os.path.abspath(full_report_path)}'."
343
+ content = (
344
+ f"Sweetviz EDA report generated and saved as '{os.path.abspath(full_report_path)}'. "
345
+ f"{'This was saved in a temporary directory.' if 'tmp' in report_directory else ''}"
346
+ )
289
347
  artifact = {
290
348
  "report_file": os.path.abspath(full_report_path),
291
349
  "report_html": html_content,
292
350
  }
293
351
  return content, artifact
352
+
@@ -0,0 +1,27 @@
1
+
2
+
3
+
4
+ def get_tool_call_names(messages):
5
+ """
6
+ Method to extract the tool call names from a list of LangChain messages.
7
+
8
+ Parameters:
9
+ ----------
10
+ messages : list
11
+ A list of LangChain messages.
12
+
13
+ Returns:
14
+ -------
15
+ tool_calls : list
16
+ A list of tool call names.
17
+
18
+ """
19
+ tool_calls = []
20
+ for message in messages:
21
+ try:
22
+ if "tool_call_id" in list(dict(message).keys()):
23
+ tool_calls.append(message.name)
24
+ except:
25
+ pass
26
+ return tool_calls
27
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: ai-data-science-team
3
- Version: 0.0.0.9012
3
+ Version: 0.0.0.9013
4
4
  Summary: Build and run an AI-powered data science team.
5
5
  Home-page: https://github.com/business-science/ai-data-science-team
6
6
  Author: Matt Dancho
@@ -152,7 +152,11 @@ This is a top secret project I'm working on. It's a multi-agent data science app
152
152
 
153
153
  #### 🔥 Agentic Applications
154
154
 
155
- 1. **SQL Database Agent App:** Connects any SQL Database, generates SQL queries from natural language, and returns data as a downloadable table. [See Application](/apps/sql-database-agent-app/)
155
+ 1. **NEW Exploratory Data Copilot**: An AI-powered data science app that performs automated exploratory data analysis (EDA) with EDA Reporting, Missing Data Analysis, Correlation Analysis, and more. [See Application](/apps/exploratory-copilot-app/)
156
+
157
+ ![Exploratory Data Copilot](/img/apps/ai_exploratory_copilot.jpg)
158
+
159
+ 2. **SQL Database Agent App:** Connects any SQL Database, generates SQL queries from natural language, and returns data as a downloadable table. [See Application](/apps/sql-database-agent-app/)
156
160
 
157
161
  ### Agents Available Now
158
162
 
@@ -1,20 +1,20 @@
1
1
  ai_data_science_team/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- ai_data_science_team/_version.py,sha256=BybGt-zGNDZsdJxDMV3xmjghiRF8jmwG3ov_dt_rM7E,26
2
+ ai_data_science_team/_version.py,sha256=8mQbNYWB914j3xlCMQYaR14g26vq-2SV31Xf8uer_L0,26
3
3
  ai_data_science_team/orchestration.py,sha256=xiIFOsrLwPdkSmtme7wNCCGv8XopnMTNElNzlZokL-4,303
4
4
  ai_data_science_team/agents/__init__.py,sha256=Gnotza9SKr_0IxuaX8k1nsZK48wXkkeZcGcrR1EqNks,668
5
5
  ai_data_science_team/agents/data_cleaning_agent.py,sha256=V5tJMwGJK0JwrF_H-7r3S0E8UkAY6ci4BGxqjhZiGBI,27352
6
- ai_data_science_team/agents/data_loader_tools_agent.py,sha256=fnkOvmrXzvTTt1mnAyTlsF_7ZGrkp3P97YU_LgeffMg,8445
6
+ ai_data_science_team/agents/data_loader_tools_agent.py,sha256=23Uuqt-oaJfj3CFRKT7NErNkodXpraXl0HOWvXjMcJs,8802
7
7
  ai_data_science_team/agents/data_visualization_agent.py,sha256=tJy9Ehnh9mvAu6H--TXI8esSHmK1RW_L1RDAdn7Xek4,28821
8
8
  ai_data_science_team/agents/data_wrangling_agent.py,sha256=LxzphH-TmrFG0GjejGOjulhPq4SsWFo5Y9tk4WEuN4M,32347
9
9
  ai_data_science_team/agents/feature_engineering_agent.py,sha256=KmPBkj7WUBz6LFUlDDfQHMi7ujXwsH5P9LWRS-F4tdM,31026
10
10
  ai_data_science_team/agents/sql_database_agent.py,sha256=1K2o3NiuKgGKdbMz_Tq9IeQ8xhXjpfGOxx9lArZh1yE,31173
11
11
  ai_data_science_team/ds_agents/__init__.py,sha256=dnuagUTebTDHhGXbCt-hZIilzXMSUwyHaEI7sOxhvoE,95
12
- ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=y65lsBXhQNOGwWealEho6uFxGSTW7FNfvTUZnW8_XNY,7609
12
+ ai_data_science_team/ds_agents/eda_tools_agent.py,sha256=VJkqyQCNxoV0kvUTpUZh8SXTTZ0K1tUlg3jq6LDnpPQ,8009
13
13
  ai_data_science_team/ds_agents/modeling_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  ai_data_science_team/ml_agents/__init__.py,sha256=qq3UlDCRV_z4FHQ1jj3YR6zPbA6kuCvYCisj_bHYfO4,190
15
15
  ai_data_science_team/ml_agents/h2o_ml_agent.py,sha256=DamR72agrTKfdcdhablmP2mpbj0CqtMonP-QU8p7o9w,33394
16
16
  ai_data_science_team/ml_agents/h2o_ml_tools_agent.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- ai_data_science_team/ml_agents/mlflow_tools_agent.py,sha256=zbT0KIsmQp_sEyxzXRguhqx5913Q2yPYyKGU6TUWEM8,11067
17
+ ai_data_science_team/ml_agents/mlflow_tools_agent.py,sha256=bRTT53_pHV0qAYl07iZcwUEYffGH_ZfJICdrLeOUPn4,11394
18
18
  ai_data_science_team/multiagents/__init__.py,sha256=aI4GztEwmkexZKT5XHcH3cAjO-xYUhncb3yfPJQDqTA,99
19
19
  ai_data_science_team/multiagents/sql_data_analyst.py,sha256=kmmED3gLf5STWWY6ZVJYd7_Pt8NMl6SHyBocuQzRDGk,14193
20
20
  ai_data_science_team/multiagents/supervised_data_analyst.py,sha256=uduCYpicga-UCf9nPQktQggW96-HDlqvioYmEdWejtI,158
@@ -24,8 +24,8 @@ ai_data_science_team/templates/__init__.py,sha256=_IcyFUu_mM8dFtttz95h0csJZ-XWDP
24
24
  ai_data_science_team/templates/agent_templates.py,sha256=Lezp0ugtIP3m5WUOmjLwghNnjjyQVQecysONeIHWwi0,29133
25
25
  ai_data_science_team/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  ai_data_science_team/tools/data_loader.py,sha256=ITs_6UAJ0m9h68R9_LruiaJSElv9l7SxTQYryI7YZPY,14702
27
- ai_data_science_team/tools/dataframe.py,sha256=qSflGDByqqCXv4TjuvOFvGPZmegzeOesb0Y4i4Y0gdQ,4551
28
- ai_data_science_team/tools/eda.py,sha256=UGD6PC12RsB_UmStvR4TmSqv0noxjM4DkzY-kHjI0-E,10591
27
+ ai_data_science_team/tools/dataframe.py,sha256=cckplDWu9SsA_PRo89pYsyVCmBE0PoDIwMv6tuLunT4,4572
28
+ ai_data_science_team/tools/eda.py,sha256=KoryXso_5zOPDq7jwcUAMEXV-AIzpWb62zzbUHVtgtM,12687
29
29
  ai_data_science_team/tools/h2o.py,sha256=gSK0f2FULfAfipFTTjDMUS6DjHwFFvvl4jxshr6QpS0,38997
30
30
  ai_data_science_team/tools/mlflow.py,sha256=8NTkSOvbTk01GOmwFaMkLBRse80w9Kk7Ypi6Fv4kTII,29475
31
31
  ai_data_science_team/tools/sql.py,sha256=vvz_CiOg6GqXo2_mlF4kq5IS6if79dpaizAgLR9sRyg,4784
@@ -33,10 +33,11 @@ ai_data_science_team/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
33
33
  ai_data_science_team/utils/html.py,sha256=1MBcjNyATi3FPOyVdqf6-_QYCJmDVQWmVPIInUr50dk,628
34
34
  ai_data_science_team/utils/logging.py,sha256=7wFOv6GGhXR_RPbh-8p0GyrS608XOnZtiaGK2IbDl_s,2081
35
35
  ai_data_science_team/utils/matplotlib.py,sha256=d6DZfCXvZ5Kocxtsp92etIymKW2cRBcUG9GmCOMtgJo,1145
36
+ ai_data_science_team/utils/messages.py,sha256=feWIPGsv8ly9jpNnS97SoPsn1feaY1Km0VCbHTbRpI8,549
36
37
  ai_data_science_team/utils/plotly.py,sha256=nST-NG0oizKVHhH6HsjHUpTUumq9bCccBdxjuaJWnVQ,504
37
38
  ai_data_science_team/utils/regex.py,sha256=lwarbLqTA2VfNQSyqKCl-PBlH_0WH3zXZvYGBYGUiu4,5144
38
- ai_data_science_team-0.0.0.9012.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
39
- ai_data_science_team-0.0.0.9012.dist-info/METADATA,sha256=geRCFLG3YO9uprp_CGKiqCTSThg06L2U6WxVqYKzyM8,12704
40
- ai_data_science_team-0.0.0.9012.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
41
- ai_data_science_team-0.0.0.9012.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
42
- ai_data_science_team-0.0.0.9012.dist-info/RECORD,,
39
+ ai_data_science_team-0.0.0.9013.dist-info/LICENSE,sha256=Xif0IRLdd2HGLATxV2EVp91aSY6KOuacRr_6BorKGzA,1084
40
+ ai_data_science_team-0.0.0.9013.dist-info/METADATA,sha256=z18MmCwNdEgovskYmYmd4CS1I4WKTvh_mSnmzKOaHZs,13021
41
+ ai_data_science_team-0.0.0.9013.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
42
+ ai_data_science_team-0.0.0.9013.dist-info/top_level.txt,sha256=CnoMgOphCoAdGTLueWdCVByVyjwOubaGiTB1lchdy4M,21
43
+ ai_data_science_team-0.0.0.9013.dist-info/RECORD,,