ai-data-science-team 0.0.0.9000__py3-none-any.whl → 0.0.0.9005__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -0
- ai_data_science_team/agents/data_cleaning_agent.py +347 -0
- ai_data_science_team/agents/data_wrangling_agent.py +365 -0
- ai_data_science_team/agents/feature_engineering_agent.py +368 -0
- ai_data_science_team/templates/__init__.py +0 -0
- ai_data_science_team/templates/agent_templates.py +409 -0
- ai_data_science_team/tools/__init__.py +0 -0
- ai_data_science_team/tools/data_analysis.py +116 -0
- ai_data_science_team/tools/logging.py +61 -0
- ai_data_science_team/tools/parsers.py +57 -0
- ai_data_science_team/tools/regex.py +73 -0
- ai_data_science_team-0.0.0.9005.dist-info/METADATA +162 -0
- ai_data_science_team-0.0.0.9005.dist-info/RECORD +19 -0
- ai_data_science_team/agents.py +0 -325
- ai_data_science_team-0.0.0.9000.dist-info/METADATA +0 -131
- ai_data_science_team-0.0.0.9000.dist-info/RECORD +0 -9
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9000.dist-info → ai_data_science_team-0.0.0.9005.dist-info}/top_level.txt +0 -0
ai_data_science_team/templates/agent_templates.py
@@ -0,0 +1,409 @@
from langchain_core.messages import AIMessage
from langgraph.graph import StateGraph, END
from langgraph.types import interrupt, Command

import pandas as pd

from typing import Any, Callable, Dict, Type, Optional

from ai_data_science_team.tools.parsers import PythonOutputParser
from ai_data_science_team.tools.regex import relocate_imports_inside_function, add_comments_to_top

def create_coding_agent_graph(
    GraphState: Type,
    node_functions: Dict[str, Callable],
    recommended_steps_node_name: str,
    create_code_node_name: str,
    execute_code_node_name: str,
    fix_code_node_name: str,
    explain_code_node_name: str,
    error_key: str,
    max_retries_key: str = "max_retries",
    retry_count_key: str = "retry_count",
    human_in_the_loop: bool = False,
    human_review_node_name: str = "human_review",
    checkpointer: Optional[Callable] = None
):
    """
    Creates a generic agent graph using the provided node functions and node names.

    Parameters
    ----------
    GraphState : Type
        The TypedDict or class used as state for the workflow.
    node_functions : dict
        A dictionary mapping node names to their respective functions.
        Example: {
            "recommend_cleaning_steps": recommend_cleaning_steps,
            "human_review": human_review,
            "create_data_cleaner_code": create_data_cleaner_code,
            "execute_data_cleaner_code": execute_data_cleaner_code,
            "fix_data_cleaner_code": fix_data_cleaner_code,
            "explain_data_cleaner_code": explain_data_cleaner_code
        }
    recommended_steps_node_name : str
        The node name that recommends steps.
    create_code_node_name : str
        The node name that creates the code.
    execute_code_node_name : str
        The node name that executes the generated code.
    fix_code_node_name : str
        The node name that fixes code if errors occur.
    explain_code_node_name : str
        The node name that explains the final code.
    error_key : str
        The state key used to check for errors.
    max_retries_key : str, optional
        The state key used for the maximum number of retries.
    retry_count_key : str, optional
        The state key for the current retry count.
    human_in_the_loop : bool, optional
        Whether to include a human review step.
    human_review_node_name : str, optional
        The node name for human review if human_in_the_loop is True.
    checkpointer : callable, optional
        A checkpointer callable if desired.

    Returns
    -------
    app : langchain.graphs.StateGraph
        The compiled workflow application.
    """

    workflow = StateGraph(GraphState)

    # Add the recommended steps node
    workflow.add_node(recommended_steps_node_name, node_functions[recommended_steps_node_name])

    # Optionally add the human review node
    if human_in_the_loop:
        workflow.add_node(human_review_node_name, node_functions[human_review_node_name])

    # Add main nodes
    workflow.add_node(create_code_node_name, node_functions[create_code_node_name])
    workflow.add_node(execute_code_node_name, node_functions[execute_code_node_name])
    workflow.add_node(fix_code_node_name, node_functions[fix_code_node_name])
    workflow.add_node(explain_code_node_name, node_functions[explain_code_node_name])

    # Set the entry point
    workflow.set_entry_point(recommended_steps_node_name)

    # Add edges depending on human_in_the_loop
    if human_in_the_loop:
        workflow.add_edge(recommended_steps_node_name, human_review_node_name)
    else:
        workflow.add_edge(recommended_steps_node_name, create_code_node_name)

    # Connect create_code_node to execution node
    workflow.add_edge(create_code_node_name, execute_code_node_name)

    # Add conditional edges for error handling
    workflow.add_conditional_edges(
        execute_code_node_name,
        lambda state: "fix_code" if (
            state.get(error_key) is not None and
            state.get(retry_count_key) is not None and
            state.get(max_retries_key) is not None and
            state.get(retry_count_key) < state.get(max_retries_key)
        ) else "explain_code",
        {"fix_code": fix_code_node_name, "explain_code": explain_code_node_name},
    )

    # From fix_code_node_name back to execution node
    workflow.add_edge(fix_code_node_name, execute_code_node_name)

    # explain_code_node_name leads to end
    workflow.add_edge(explain_code_node_name, END)

    # Compile workflow, optionally with checkpointer
    if human_in_the_loop and checkpointer is not None:
        app = workflow.compile(checkpointer=checkpointer)
    else:
        app = workflow.compile()

    return app

def node_func_human_review(
    state: Any,
    prompt_text: str,
    yes_goto: str,
    no_goto: str,
    user_instructions_key: str = "user_instructions",
    recommended_steps_key: str = "recommended_steps",
) -> Command[str]:
    """
    A generic function to handle human review steps.

    Parameters
    ----------
    state : GraphState
        The current GraphState.
    prompt_text : str
        The text to display to the user before their input.
    yes_goto : str
        The node to go to if the user confirms (answers "yes").
    no_goto : str
        The node to go to if the user suggests modifications.
    user_instructions_key : str, optional
        The key in the state to store user instructions.
    recommended_steps_key : str, optional
        The key in the state to store recommended steps.

    Returns
    -------
    Command[str]
        A Command object directing the next state and updates to the state.
    """
    print(" * HUMAN REVIEW")

    # Display instructions and get user response
    user_input = interrupt(value=prompt_text.format(steps=state.get(recommended_steps_key, '')))

    # Decide next steps based on user input
    if user_input.strip().lower() == "yes":
        goto = yes_goto
        update = {}
    else:
        goto = no_goto
        modifications = "Modifications: \n" + user_input
        if state.get(user_instructions_key) is None:
            update = {user_instructions_key: modifications}
        else:
            update = {user_instructions_key: state.get(user_instructions_key) + modifications}

    return Command(goto=goto, update=update)

def node_func_execute_agent_code_on_data(
    state: Any,
    data_key: str,
    code_snippet_key: str,
    result_key: str,
    error_key: str,
    agent_function_name: str,
    pre_processing: Optional[Callable[[Any], Any]] = None,
    post_processing: Optional[Callable[[Any], Any]] = None,
    error_message_prefix: str = "An error occurred during agent execution: "
) -> Dict[str, Any]:
    """
    Execute generic agent code, defined in a code snippet retrieved from the state, on the input data and return the result.

    Parameters
    ----------
    state : Any
        A state object that supports a `get(key: str)` method to retrieve values.
    data_key : str
        The key in the state used to retrieve the input data.
    code_snippet_key : str
        The key in the state used to retrieve the Python code snippet defining the agent function.
    result_key : str
        The key in the state used to store the result of the agent function.
    error_key : str
        The key in the state used to store the error message if any.
    agent_function_name : str
        The name of the function (e.g., 'data_cleaner') expected to be defined in the code snippet.
    pre_processing : Callable[[Any], Any], optional
        A function to preprocess the data before passing it to the agent function.
        This might be used to convert raw data into a DataFrame or otherwise transform it.
        If not provided, a default approach will be used if data is a dict.
    post_processing : Callable[[Any], Any], optional
        A function to postprocess the output of the agent function before returning it.
    error_message_prefix : str, optional
        A prefix or full message to use in the error output if an exception occurs.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing the result and/or error messages. Keys are arbitrary,
        but typically include something like "result" or "error".
    """

    print(" * EXECUTING AGENT CODE")

    # Retrieve raw data and code snippet from the state
    data = state.get(data_key)
    agent_code = state.get(code_snippet_key)

    # Preprocessing: if no pre-processing function is given, attempt a default handling
    if pre_processing is None:
        if isinstance(data, dict):
            df = pd.DataFrame.from_dict(data)
        elif isinstance(data, list):
            df = [pd.DataFrame.from_dict(item) for item in data]
        else:
            raise ValueError("Data is not a dictionary or list and no pre_processing function was provided.")
    else:
        df = pre_processing(data)

    # Execute the code snippet to define the agent function
    local_vars = {}
    global_vars = {}
    exec(agent_code, global_vars, local_vars)

    # Retrieve the agent function from the executed code
    agent_function = local_vars.get(agent_function_name, None)
    if agent_function is None or not callable(agent_function):
        raise ValueError(f"Agent function '{agent_function_name}' not found or not callable in the provided code.")

    # Execute the agent function
    agent_error = None
    result = None
    try:
        result = agent_function(df)

        # Test an error
        # if state.get("retry_count") == 0:
        #     10/0

        # Apply post-processing if provided
        if post_processing is not None:
            result = post_processing(result)
    except Exception as e:
        print(e)
        agent_error = f"{error_message_prefix}{str(e)}"

    # Return results
    output = {result_key: result, error_key: agent_error}
    return output

def node_func_fix_agent_code(
    state: Any,
    code_snippet_key: str,
    error_key: str,
    llm: Any,
    prompt_template: str,
    agent_name: str,
    retry_count_key: str = "retry_count",
    log: bool = False,
    file_path: str = "logs/agent_function.py",
) -> dict:
    """
    Generic function to fix a given piece of agent code using an LLM and a prompt template.

    Parameters
    ----------
    state : Any
        A state object that supports a `get(key: str)` method to retrieve values.
    code_snippet_key : str
        The key in the state used to retrieve the broken code snippet.
    error_key : str
        The key in the state used to retrieve the related error message.
    llm : Any
        The language model or pipeline capable of receiving prompts and returning responses.
        It should support a call like `(llm | PythonOutputParser()).invoke(prompt)`.
    prompt_template : str
        A string template for the prompt that will be sent to the LLM. It should contain
        placeholders `{code_snippet}` and `{error}` which will be formatted with the actual values.
    agent_name : str
        The name of the agent being fixed. This is used to add comments to the top of the code.
    retry_count_key : str, optional
        The key in the state that tracks how many times we've retried fixing the code.
    log : bool, optional
        Whether to log the returned code to a file.
    file_path : str, optional
        The path to the file where the code will be logged.

    Returns
    -------
    dict
        A dictionary containing updated code, cleared error, and incremented retry count.
    """
    print(" * FIX AGENT CODE")
    print(" retry_count:" + str(state.get(retry_count_key)))

    # Retrieve the code snippet and the error from the state
    code_snippet = state.get(code_snippet_key)
    error_message = state.get(error_key)

    # Format the prompt with the code snippet and the error
    prompt = prompt_template.format(
        code_snippet=code_snippet,
        error=error_message
    )

    # Execute the prompt with the LLM
    response = (llm | PythonOutputParser()).invoke(prompt)

    response = relocate_imports_inside_function(response)
    response = add_comments_to_top(response, agent_name=agent_name)

    # Log the response if requested
    if log:
        with open(file_path, 'w') as file:
            file.write(response)
        print(f" File saved to: {file_path}")

    # Return updated results
    return {
        code_snippet_key: response,
        error_key: None,
        retry_count_key: state.get(retry_count_key) + 1
    }

def node_func_explain_agent_code(
    state: Any,
    code_snippet_key: str,
    result_key: str,
    error_key: str,
    llm: Any,
    role: str,
    explanation_prompt_template: str,
    success_prefix: str = "# Agent Explanation:\n\n",
    error_message: str = "The agent encountered an error during execution and cannot be explained."
) -> Dict[str, Any]:
    """
    Generic function to explain what a given agent code snippet does.

    Parameters
    ----------
    state : Any
        A state object that supports `get(key: str)` to retrieve values.
    code_snippet_key : str
        The key in `state` where the agent code snippet is stored.
    result_key : str
        The key in `state` where the LLM's explanation is stored. Typically this is "messages".
    error_key : str
        The key in `state` where any error messages related to the code snippet are stored.
    llm : Any
        The language model used to explain the code. Should support `.invoke(prompt)`.
    role : str
        The role of the agent explaining the code snippet. Examples: "Data Scientist", "Data Engineer", etc.
    explanation_prompt_template : str
        A prompt template that can be used to explain the code. It should contain a placeholder
        for inserting the agent code snippet. For example:

        "Explain the steps performed by this agent code in a succinct manner:\n\n{code}"

    success_prefix : str, optional
        A prefix to add before the LLM's explanation, helping format the final message.
    error_message : str, optional
        Message to return if the agent code snippet cannot be explained due to an error.

    Returns
    -------
    Dict[str, Any]
        A dictionary keyed by `result_key` (typically "messages") containing a list of messages
        (e.g., AIMessage) describing the explanation or the error.
    """
    print(" * EXPLAIN AGENT CODE")

    # Check if there's an error associated with the code
    agent_error = state.get(error_key)
    if agent_error is None:
        # Retrieve the code snippet
        code_snippet = state.get(code_snippet_key)

        # Format the prompt by inserting the code snippet
        prompt = explanation_prompt_template.format(code=code_snippet)

        # Invoke the LLM to get an explanation
        response = llm.invoke(prompt)

        # Prepare the success message
        message = AIMessage(content=f"{success_prefix}{response.content}", role=role)
        return {result_key: [message]}
    else:
        # Return an error message if there was a problem with the code
        message = AIMessage(content=error_message)
        return {result_key: [message]}
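For orientation, the sketch below (not taken from the package) shows how `create_coding_agent_graph` and the node helpers above might be wired into a tiny agent. The `GraphState` schema, the node names, and the stand-in node functions are illustrative assumptions; the real agents in this release (e.g. `data_cleaning_agent.py`) supply LLM-backed implementations and their own state keys.

```python
# Illustrative wiring only -- assumes the module layout added in this release.
from typing import Optional, TypedDict

from ai_data_science_team.templates.agent_templates import (
    create_coding_agent_graph,
    node_func_execute_agent_code_on_data,
)

class GraphState(TypedDict, total=False):
    messages: list
    data_raw: dict
    recommended_steps: str
    code_snippet: str
    result: dict
    error: Optional[str]
    max_retries: int
    retry_count: int

# Stand-in nodes: a real agent would call an LLM in each of these.
def recommend_steps(state):
    return {"recommended_steps": "1. Drop rows with missing values."}

def create_code(state):
    return {"code_snippet": "def data_cleaner(df):\n    return df.dropna()"}

def execute_code(state):
    return node_func_execute_agent_code_on_data(
        state,
        data_key="data_raw",
        code_snippet_key="code_snippet",
        result_key="result",
        error_key="error",
        agent_function_name="data_cleaner",
        post_processing=lambda df: df.to_dict(),
    )

def fix_code(state):
    return {"code_snippet": state["code_snippet"], "error": None,
            "retry_count": state.get("retry_count", 0) + 1}

def explain_code(state):
    return {"messages": ["(an LLM-generated explanation would go here)"]}

app = create_coding_agent_graph(
    GraphState=GraphState,
    node_functions={
        "recommend_steps": recommend_steps,
        "create_code": create_code,
        "execute_code": execute_code,
        "fix_code": fix_code,
        "explain_code": explain_code,
    },
    recommended_steps_node_name="recommend_steps",
    create_code_node_name="create_code",
    execute_code_node_name="execute_code",
    fix_code_node_name="fix_code",
    explain_code_node_name="explain_code",
    error_key="error",
)

final_state = app.invoke({"data_raw": {"a": [1, None, 3]}, "max_retries": 3, "retry_count": 0})
print(final_state["result"])
```

On a clean run the graph goes recommend, create, execute, explain; only when `error` is set and `retry_count` is below `max_retries` does the conditional edge loop back through the fix node.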
ai_data_science_team/tools/data_analysis.py
@@ -0,0 +1,116 @@
import io
import pandas as pd
from typing import Union, List, Dict

def summarize_dataframes(
    dataframes: Union[pd.DataFrame, List[pd.DataFrame], Dict[str, pd.DataFrame]]
) -> List[str]:
    """
    Generate a summary for one or more DataFrames. Accepts a single DataFrame, a list of DataFrames,
    or a dictionary mapping names to DataFrames.

    Parameters
    ----------
    dataframes : pandas.DataFrame or list of pandas.DataFrame or dict of (str -> pandas.DataFrame)
        - Single DataFrame: produce a single summary (returned within a one-element list).
        - List of DataFrames: produce a summary for each DataFrame, using index-based names.
        - Dictionary of DataFrames: produce a summary for each DataFrame, using dictionary keys as names.

    Example:
    --------
    ``` python
    import pandas as pd
    from sklearn.datasets import load_iris
    data = load_iris(as_frame=True)
    dataframes = {
        "iris": data.frame,
        "iris_target": data.target,
    }
    summaries = summarize_dataframes(dataframes)
    print(summaries[0])
    ```

    Returns
    -------
    list of str
        A list of summaries, one for each provided DataFrame. Each summary includes:
        - Shape of the DataFrame (rows, columns)
        - Column data types
        - Missing value percentage
        - Unique value counts
        - First 30 rows
        - Descriptive statistics
        - DataFrame info output
    """

    summaries = []

    # --- Dictionary Case ---
    if isinstance(dataframes, dict):
        for dataset_name, df in dataframes.items():
            summaries.append(_summarize_dataframe(df, dataset_name))

    # --- Single DataFrame Case ---
    elif isinstance(dataframes, pd.DataFrame):
        summaries.append(_summarize_dataframe(dataframes, "Single_Dataset"))

    # --- List of DataFrames Case ---
    elif isinstance(dataframes, list):
        for idx, df in enumerate(dataframes):
            dataset_name = f"Dataset_{idx}"
            summaries.append(_summarize_dataframe(df, dataset_name))

    else:
        raise TypeError(
            "Input must be a single DataFrame, a list of DataFrames, or a dictionary of DataFrames."
        )

    return summaries


def _summarize_dataframe(df: pd.DataFrame, dataset_name: str) -> str:
    """Generate a summary string for a single DataFrame."""
    # 1. Convert dictionary-type cells to strings
    #    This prevents unhashable dict errors during df.nunique().
    df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))

    # 2. Capture df.info() output
    buffer = io.StringIO()
    df.info(buf=buffer)
    info_text = buffer.getvalue()

    # 3. Calculate missing value stats
    missing_stats = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
    missing_summary = "\n".join([f"{col}: {val:.2f}%" for col, val in missing_stats.items()])

    # 4. Get column data types
    column_types = "\n".join([f"{col}: {dtype}" for col, dtype in df.dtypes.items()])

    # 5. Get unique value counts
    unique_counts = df.nunique()  # will no longer fail on unhashable dicts
    unique_counts_summary = "\n".join([f"{col}: {count}" for col, count in unique_counts.items()])

    summary_text = f"""
Dataset Name: {dataset_name}
----------------------------
Shape: {df.shape[0]} rows x {df.shape[1]} columns

Column Data Types:
{column_types}

Missing Value Percentage:
{missing_summary}

Unique Value Counts:
{unique_counts_summary}

Data (first 30 rows):
{df.head(30).to_string()}

Data Description:
{df.describe().to_string()}

Data Info:
{info_text}
"""
    return summary_text.strip()
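As a quick complement to the dictionary example in the docstring, a hypothetical call with a list input falls through the list branch and names each summary by position:

```python
import pandas as pd
from ai_data_science_team.tools.data_analysis import summarize_dataframes

df_a = pd.DataFrame({"x": [1, 2, None], "y": ["a", "b", "b"]})
df_b = pd.DataFrame({"z": [0.1, 0.2, 0.3]})

summaries = summarize_dataframes([df_a, df_b])
print(summaries[1])  # summary labeled "Dataset_1"
```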
ai_data_science_team/tools/logging.py
@@ -0,0 +1,61 @@
import os

def log_ai_function(response: str, file_name: str, log: bool = True, log_path: str = './logs/', overwrite: bool = True):
    """
    Logs the response of an AI function to a file.

    Parameters
    ----------
    response : str
        The response of the AI function.
    file_name : str
        The name of the file to save the response to.
    log : bool, optional
        Whether to log the response or not. The default is True.
    log_path : str, optional
        The path to save the log file. The default is './logs/'.
    overwrite : bool, optional
        Whether to overwrite the file if it already exists. The default is True.
        - If True, the file will be overwritten.
        - If False, a unique file name will be created.

    Returns
    -------
    tuple or None
        The path and name of the log file, or None if logging is disabled.
    """

    if log:
        # Ensure the directory exists
        os.makedirs(log_path, exist_ok=True)

        # file_name = 'data_wrangler.py'
        file_path = os.path.join(log_path, file_name)

        if not overwrite:
            # If the file already exists and we're NOT overwriting, create a new name
            if os.path.exists(file_path):
                # Use an incremental suffix (e.g., data_wrangler_1.py, data_wrangler_2.py, etc.)
                # or a time-based suffix if you prefer.
                base_name, ext = os.path.splitext(file_name)
                i = 1
                while True:
                    new_file_name = f"{base_name}_{i}{ext}"
                    new_file_path = os.path.join(log_path, new_file_name)
                    if not os.path.exists(new_file_path):
                        file_path = new_file_path
                        file_name = new_file_name
                        break
                    i += 1

        # Write the file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response)

        print(f" File saved to: {file_path}")

        return (file_path, file_name)

    else:
        return None
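A small usage sketch (the code string and file name are placeholders, not from the package): with `overwrite=False`, repeated calls keep earlier logs and fall back to an incrementally suffixed name.

```python
from ai_data_science_team.tools.logging import log_ai_function

generated_code = "def data_cleaner(df):\n    return df.dropna()"

file_path, file_name = log_ai_function(
    response=generated_code,
    file_name="data_cleaner.py",
    log=True,
    log_path="./logs/",
    overwrite=False,  # existing files are kept; data_cleaner_1.py, _2.py, ... are created instead
)
print(file_path, file_name)
```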
ai_data_science_team/tools/parsers.py
@@ -0,0 +1,57 @@
# BUSINESS SCIENCE UNIVERSITY
# AI DATA SCIENCE TEAM
# ***
# Parsers

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import BaseOutputParser

import re

# Python Parser for output standardization
class PythonOutputParser(BaseOutputParser):
    def parse(self, text: str):
        def extract_python_code(text):
            python_code_match = re.search(r'```python(.*?)```', text, re.DOTALL)
            if python_code_match:
                python_code = python_code_match.group(1).strip()
                return python_code
            else:
                python_code_match = re.search(r"python(.*?)'", text, re.DOTALL)
                if python_code_match:
                    python_code = python_code_match.group(1).strip()
                    return python_code
                else:
                    return None
        python_code = extract_python_code(text)
        if python_code is not None:
            return python_code
        else:
            # Assume ```python wasn't used
            return text

# SQL Parser for output standardization
class SQLOutputParser(BaseOutputParser):
    def parse(self, text: str):
        def extract_sql_code(text):
            sql_code_match = re.search(r'```sql(.*?)```', text, re.DOTALL)
            sql_code_match_2 = re.search(r"SQLQuery:\s*(.*)", text)
            if sql_code_match:
                sql_code = sql_code_match.group(1).strip()
                return sql_code
            if sql_code_match_2:
                sql_code = sql_code_match_2.group(1).strip()
                return sql_code
            else:
                sql_code_match = re.search(r"sql(.*?)'", text, re.DOTALL)
                if sql_code_match:
                    sql_code = sql_code_match.group(1).strip()
                    return sql_code
                else:
                    return None
        sql_code = extract_sql_code(text)
        if sql_code is not None:
            return sql_code
        else:
            # Assume ```sql wasn't used
            return text
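These parsers are what the fix-code template above pipes an LLM through (`(llm | PythonOutputParser()).invoke(prompt)`). Used standalone on raw model text, their behavior looks roughly like this; the reply strings are made up for illustration:

```python
from ai_data_science_team.tools.parsers import PythonOutputParser, SQLOutputParser

llm_reply = "Sure, here you go:\n```python\ndef data_cleaner(df):\n    return df.dropna()\n```"
print(PythonOutputParser().parse(llm_reply))
# -> def data_cleaner(df):
#        return df.dropna()

print(SQLOutputParser().parse("SQLQuery: SELECT * FROM orders"))
# -> SELECT * FROM orders

# If no fenced block or SQLQuery marker is found, parse() returns the input text unchanged.
```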