ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +0 -1
- ai_data_science_team/agents/data_cleaning_agent.py +50 -39
- ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
- ai_data_science_team/agents/data_visualization_agent.py +45 -50
- ai_data_science_team/agents/data_wrangling_agent.py +50 -49
- ai_data_science_team/agents/feature_engineering_agent.py +48 -67
- ai_data_science_team/agents/sql_database_agent.py +130 -76
- ai_data_science_team/ml_agents/__init__.py +2 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
- ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
- ai_data_science_team/parsers/__init__.py +0 -0
- ai_data_science_team/{tools → parsers}/parsers.py +0 -1
- ai_data_science_team/templates/__init__.py +1 -0
- ai_data_science_team/templates/agent_templates.py +78 -7
- ai_data_science_team/tools/data_loader.py +378 -0
- ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
- ai_data_science_team/tools/h2o.py +643 -0
- ai_data_science_team/tools/mlflow.py +961 -0
- ai_data_science_team/tools/sql.py +126 -0
- ai_data_science_team/{tools → utils}/regex.py +59 -1
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
- ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
- ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
- /ai_data_science_team/{tools → utils}/logging.py +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,852 @@
|
|
1
|
+
# BUSINESS SCIENCE UNIVERSITY
|
2
|
+
# AI DATA SCIENCE TEAM
|
3
|
+
# ***
|
4
|
+
# * Agents: H2O Machine Learning Agent
|
5
|
+
|
6
|
+
import os
|
7
|
+
import json
|
8
|
+
from typing import TypedDict, Annotated, Sequence, Literal
|
9
|
+
import operator
|
10
|
+
|
11
|
+
import pandas as pd
|
12
|
+
from IPython.display import Markdown
|
13
|
+
|
14
|
+
from langchain.prompts import PromptTemplate
|
15
|
+
from langchain_core.messages import BaseMessage
|
16
|
+
|
17
|
+
from langgraph.types import Command
|
18
|
+
from langgraph.checkpoint.memory import MemorySaver
|
19
|
+
|
20
|
+
from ai_data_science_team.templates import(
|
21
|
+
node_func_execute_agent_code_on_data,
|
22
|
+
node_func_human_review,
|
23
|
+
node_func_fix_agent_code,
|
24
|
+
node_func_report_agent_outputs,
|
25
|
+
create_coding_agent_graph,
|
26
|
+
BaseAgent,
|
27
|
+
)
|
28
|
+
from ai_data_science_team.parsers.parsers import PythonOutputParser
|
29
|
+
from ai_data_science_team.utils.regex import (
|
30
|
+
relocate_imports_inside_function,
|
31
|
+
add_comments_to_top,
|
32
|
+
format_agent_name,
|
33
|
+
format_recommended_steps,
|
34
|
+
get_generic_summary,
|
35
|
+
)
|
36
|
+
from ai_data_science_team.tools.dataframe import get_dataframe_summary
|
37
|
+
from ai_data_science_team.utils.logging import log_ai_function
|
38
|
+
from ai_data_science_team.tools.h2o import H2O_AUTOML_DOCUMENTATION
|
39
|
+
|
40
|
+
AGENT_NAME = "h2o_ml_agent"
|
41
|
+
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
42
|
+
|
43
|
+
class H2OMLAgent(BaseAgent):
    """
    A Machine Learning agent that uses H2O's AutoML for training,
    allowing the user to specify a model directory for saving the best model.
    If neither model_directory nor log_path is provided, model saving is skipped.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the ML code.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the Python file for saving the generated code. Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the function that performs the AutoML training. Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the H2O Machine Learning model. If None, defaults to log_path (if available).
        If both are None, no model is saved. Defaults to None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of the code. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended steps prompt. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the code-explanation step. Defaults to False.
    enable_mlflow : bool, default False
        Whether to enable MLflow logging. If False, skip MLflow entirely.
    mlflow_tracking_uri : str or None
        If provided, sets MLflow tracking URI at runtime.
    mlflow_experiment_name : str
        Name of the MLflow experiment (created if doesn't exist).
    mlflow_run_name : str, default None
        A custom name for the MLflow run.


    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(data_raw, user_instructions, target_variable, ...)
        Asynchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
    invoke_agent(data_raw, user_instructions, target_variable, ...)
        Synchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
    get_leaderboard()
        Retrieves the H2O AutoML leaderboard from the agent's response.
    get_best_model_id()
        Retrieves the best model ID from the agent's response.
    get_model_path()
        Retrieves the saved model path from the agent's response (or None if not saved).
    get_data_raw()
        Retrieves the raw data as a DataFrame from the agent's response.
    get_h2o_train_function(markdown=False)
        Retrieves the H2O AutoML function code generated by the agent.
    get_recommended_ml_steps(markdown=False)
        Retrieves recommended ML steps from the agent's response.
    get_workflow_summary(markdown=False)
        Retrieves a summary of the agent's workflow.
    get_log_summary(markdown=False)
        Retrieves a summary of where the generated function and model were saved.
    get_response()
        Returns the entire response dictionary.
    show()
        Visualizes the compiled graph as a Mermaid diagram.

    Examples
    --------
    ```python
    from langchain_openai import ChatOpenAI
    import pandas as pd
    from ai_data_science_team.ml_agents import H2OMLAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    df = pd.read_csv("data/churn_data.csv")

    LOG_PATH = "logs/"
    MODEL_PATH = "h2o_models/"

    ml_agent = H2OMLAgent(
        model=llm,
        log=True,
        log_path=LOG_PATH,
        model_directory=MODEL_PATH,
    )

    ml_agent.invoke_agent(
        data_raw=df.drop(columns=["customerID"]),
        user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
        target_variable="Churn"
    )

    # Retrieve and display the leaderboard of models
    ml_agent.get_leaderboard()

    # Get the H2O training function in markdown format
    ml_agent.get_h2o_train_function(markdown=True)

    # Get the recommended machine learning steps in markdown format
    ml_agent.get_recommended_ml_steps(markdown=True)

    # Get a summary of the workflow in markdown format
    ml_agent.get_workflow_summary(markdown=True)

    # Get a summary of the logs in markdown format
    ml_agent.get_log_summary(markdown=True)

    # Get the path to the saved model
    model_path = ml_agent.get_model_path()
    model_path
    ```

    Returns
    -------
    H2OMLAgent
        An instance of the H2O ML agent. It wraps the graph object built by
        `make_h2o_ml_agent` and stores the latest run's output in `response`.
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="h2o_automl.py",
        function_name="h2o_automl",
        model_directory=None,
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False,
        enable_mlflow=False,
        mlflow_tracking_uri=None,
        mlflow_experiment_name="H2O AutoML",
        mlflow_run_name=None,
    ):
        # Every constructor argument is kept so the graph can be rebuilt later
        # by update_params(); the keys mirror make_h2o_ml_agent's parameters.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "model_directory": model_directory,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Creates the compiled graph for the agent.

        Clears any stored response first, because a response produced by a
        previous graph no longer corresponds to the current parameters.
        """
        self.response = None
        return make_h2o_ml_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        If rebuilding fails (for example because an unknown parameter name was
        supplied), the previous parameters and response are restored and the
        original exception is re-raised, so the agent remains usable.
        """
        previous_params = dict(self._params)
        previous_response = self.response
        # Mutate in place (as before) so any holder of the dict sees the change.
        self._params.update(kwargs)
        try:
            self._compiled_graph = self._make_compiled_graph()
        except Exception:
            # Roll back: the old compiled graph is still in place, so put the
            # parameters and response that belong to it back as well.
            self._params.clear()
            self._params.update(previous_params)
            self.response = previous_response
            raise

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset; it is passed to the graph as `data_raw.to_dict()`.
        user_instructions : str, optional
            Free-text instructions forwarded to the graph state.
        target_variable : str, optional
            Name of the target column forwarded to the graph state.
        max_retries : int, optional
            Value stored under the "max_retries" state key. Defaults to 3.
        retry_count : int, optional
            Value stored under the "retry_count" state key. Defaults to 0.
        **kwargs
            Extra keyword arguments passed through to the graph's `ainvoke`.

        Returns
        -------
        None
            The result is stored on `self.response`.
        """
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset; it is passed to the graph as `data_raw.to_dict()`.
        user_instructions : str, optional
            Free-text instructions forwarded to the graph state.
        target_variable : str, optional
            Name of the target column forwarded to the graph state.
        max_retries : int, optional
            Value stored under the "max_retries" state key. Defaults to 3.
        retry_count : int, optional
            Value stored under the "retry_count" state key. Defaults to 0.
        **kwargs
            Extra keyword arguments passed through to the graph's `invoke`.

        Returns
        -------
        None
            The result is stored on `self.response`.
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def get_leaderboard(self):
        """Returns the H2O AutoML leaderboard as a DataFrame, or None if unavailable."""
        if self.response and "leaderboard" in self.response:
            return pd.DataFrame(self.response["leaderboard"])
        return None

    def get_best_model_id(self):
        """Returns the best model id from the AutoML run, or None if unavailable."""
        if self.response and "best_model_id" in self.response:
            return self.response["best_model_id"]
        return None

    def get_model_path(self):
        """Returns the file path to the saved best model, or None if not saved."""
        if self.response and "model_path" in self.response:
            return self.response["model_path"]
        return None

    def get_data_raw(self):
        """Retrieves the raw data as a DataFrame from the response, or None if unavailable."""
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_h2o_train_function(self, markdown=False):
        """
        Retrieves the H2O AutoML function code generated by the agent.

        With markdown=True the code is wrapped in a fenced python block and
        returned as a Markdown object; otherwise the raw code is returned.
        Returns None when no response (or no generated function) is available.
        """
        if self.response and "h2o_train_function" in self.response:
            code = self.response["h2o_train_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_ml_steps(self, markdown=False):
        """
        Retrieves recommended ML steps from the agent's response.

        Returns a Markdown object when markdown=True, the stored value
        otherwise, or None when no recommended steps are available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Retrieves the agent's workflow summary.

        The summary is built from the JSON content of the last entry in the
        response's "messages". Returns a Markdown object when markdown=True,
        the summary otherwise, or None when the response has no messages.
        """
        if self.response and self.response.get("messages"):
            last_message = self.response.get("messages")[-1]
            summary = get_generic_summary(json.loads(last_message.content))
            if markdown:
                return Markdown(summary)
            return summary
        return None

    def get_log_summary(self, markdown=False):
        """
        Retrieves a summary of the logged function and saved model.

        A summary is only produced when the response carries a non-empty
        "h2o_train_function_path"; otherwise None is returned. Returns a
        Markdown object when markdown=True, the summary text otherwise.
        """
        if self.response:
            if self.response.get('h2o_train_function_path'):
                log_details = f"""
## H2O Machine Learning Agent Log Summary:

Function Path: {self.response.get('h2o_train_function_path')}

Function Name: {self.response.get('h2o_train_function_name')}

Best Model ID: {self.get_best_model_id()}

Model Path: {self.get_model_path()}
                """
                if markdown:
                    return Markdown(log_details)
                return log_details
        return None
|
335
|
+
|
336
|
+
|
337
|
+
def make_h2o_ml_agent(
    model,
    n_samples=30,
    log=False,
    log_path=None,
    file_name="h2o_automl.py",
    function_name="h2o_automl",
    model_directory=None,
    overwrite=True,
    human_in_the_loop=False,
    bypass_recommended_steps=False,
    bypass_explain_code=False,
    enable_mlflow=False,
    mlflow_tracking_uri=None,
    mlflow_experiment_name="H2O AutoML",
    mlflow_run_name=None,
):
    """
    Creates a machine learning agent that uses H2O for AutoML.
    The agent will:
      1. Optionally recommend ML steps,
      2. Creates Python code that sets up H2OAutoML,
      3. Executes that code (optionally saving the best model to disk),
      4. Fixes errors if needed,
      5. Optionally explains the code.

    Parameters
    ----------
    model : LLM
        Language model used for the recommendation, code-generation and
        code-fixing prompts.
    n_samples : int, optional
        Number of samples passed to the dataframe summary. Defaults to 30.
    log : bool, optional
        Whether to log the generated code. When True and log_path is None,
        "logs/" is used and created if needed. Defaults to False.
    log_path : str or None, optional
        Directory for log files. Defaults to None.
    file_name : str, optional
        File name used when logging the generated code.
    function_name : str, optional
        Name the generated training function must have.
    model_directory : str or None, optional
        Directory to save the model.
        If None, defaults to log_path.
        If both are None, skip saving.
    overwrite : bool, optional
        Forwarded to the code logger.
    human_in_the_loop : bool, optional
        Forwarded to the graph builder to enable the human review node.
    bypass_recommended_steps : bool, optional
        Forwarded to the graph builder; when True the code-generation node
        computes the data summary itself.
    bypass_explain_code : bool, optional
        Forwarded to the graph builder; also decides where an approved human
        review routes to.
    enable_mlflow, mlflow_tracking_uri, mlflow_experiment_name, mlflow_run_name
        MLflow settings inserted into the code-generation prompt.

    Returns
    -------
    app
        The object returned by `create_coding_agent_graph`.

    Raises
    ------
    ImportError
        If the `h2o` package is not installed.
    """

    llm = model

    # Handle logging directory
    if log:
        if log_path is None:
            log_path = "logs/"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(log_path, exist_ok=True)

    # Check if H2O is installed (imports are only used as an availability probe)
    try:
        import h2o  # noqa: F401
        from h2o.automl import H2OAutoML  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "The 'h2o' library is not installed. Please install it using pip:\n\n"
            "    pip install h2o\n\n"
            "Visit https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html for details."
        ) from e

    # Define GraphState
    class GraphState(TypedDict):
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        data_raw: dict
        leaderboard: dict
        best_model_id: str
        model_path: str
        model_results: dict
        target_variable: str
        all_datasets_summary: str
        h2o_train_function: str
        h2o_train_function_path: str
        h2o_train_file_name: str
        h2o_train_function_name: str
        h2o_train_error: str
        max_retries: int
        retry_count: int

    # 1) Recommend ML steps (optional)
    def recommend_ml_steps(state: GraphState):
        """Ask the LLM for a numbered list of recommended H2O AutoML steps."""
        print(format_agent_name(AGENT_NAME))
        print("    * RECOMMEND MACHINE LEARNING STEPS")

        recommend_steps_prompt = PromptTemplate(
            template="""
            You are an AutoML Expert using H2O.

            We have the following dataset summary, user instructions, and H2O AutoML documentation:

            User instructions:
            {user_instructions}

            Data Summary:
            {all_datasets_summary}

            H2O AutoML Documentation:
            {h2o_automl_documentation}

            Please recommend a short list of steps or considerations for performing H2OAutoML on this data. Specifically focus on maximizing model accuracy while remaining flexible to user instructions and the dataset.

            - Recommend any parameters and values that might improve performance (predictive accuracy).
            - Recommend the Loss Function, Stopping Criteria, and other advanced parameters.
            - Use the H2O AutoML documentation to your advantage.
            - Exclude deep learning algorithms since these are typically low performance.

            Avoid these:

            - Do not perform data cleaning or feature engineering here. We will handle that separately.
            - Do not limit memory size or CPU usage unless the user specifies it.

            Return as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The H2O AutoML code will be generated separately by a Coding Agent.
            """,
            input_variables=["user_instructions", "all_datasets_summary", "h2o_automl_documentation"]
        )

        data_raw = state.get("data_raw")
        df = pd.DataFrame.from_dict(data_raw)
        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
        all_datasets_summary_str = "\n\n".join(all_datasets_summary)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "all_datasets_summary": all_datasets_summary_str,
            "h2o_automl_documentation": H2O_AUTOML_DOCUMENTATION
        })

        # The summary is stored in state so create_h2o_code can reuse it.
        return {
            "recommended_steps": format_recommended_steps(
                recommended_steps.content.strip(),
                heading="# Recommended ML Steps:"
            ),
            "all_datasets_summary": all_datasets_summary_str
        }

    # 2) Create code
    def create_h2o_code(state: GraphState):
        """Ask the LLM to generate the H2O AutoML training function and log it."""
        if bypass_recommended_steps:
            # recommend_ml_steps did not run, so print the agent banner and
            # build the data summary here instead of reading it from state.
            print(format_agent_name(AGENT_NAME))

            data_raw = state.get("data_raw")
            df = pd.DataFrame.from_dict(data_raw)
            all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
            all_datasets_summary_str = "\n\n".join(all_datasets_summary)
        else:
            all_datasets_summary_str = state.get("all_datasets_summary")

        print("    * CREATE H2O AUTOML CODE")

        # NOTE: literal braces in the example code are doubled ({{ }}) so that
        # PromptTemplate does not treat them as input variables.
        code_prompt = PromptTemplate(
            template="""
            You are an H2O AutoML agent. Create a Python function named {function_name}(data_raw)
            that runs H2OAutoML on the provided data with a focus on maximizing model accuracy and
            incorporating user instructions for flexibility.

            Do not perform substantial data cleaning or feature engineering here. We will handle that separately.

            We have two variables for deciding where to save the model:
            model_directory = {model_directory}
            log_path = {log_path}

            IMPORTANT: MLflow Parameters if the user wants to enable MLflow with H2O AutoML:
                enable_mlflow: {enable_mlflow}
                mlflow_tracking_uri: {mlflow_tracking_uri}
                mlflow_experiment_name: {mlflow_experiment_name}
                mlflow_run_name: {mlflow_run_name}


            Additional Requirements:
            - Convert `data_raw` (pandas DataFrame) into an H2OFrame.
            - Identify the target variable from {target_variable} (if provided).
            - Start H2O if not already started.
            - Use Recommended Steps to guide any advanced parameters (e.g., cross-validation folds,
              balancing classes, extended training time, stacking) that might improve performance.
            - If the user does not specify anything special, use H2OAutoML defaults (including stacked ensembles).
            - Focus on maximizing accuracy (or the most relevant metric if it's not classification)
              while remaining flexible to user instructions.
            - Return a dict with keys: leaderboard, best_model_id, model_path, and model_results.
            - If enable_mlflow is True, log the top metrics and save the model as an artifact. (See example function)

            Initial User Instructions (Disregard any instructions that are unrelated to modeling):
                {user_instructions}

            Recommended Steps:
                {recommended_steps}

            Data summary for reference:
                {all_datasets_summary}

            Return only code in ```python``` with a single function definition. Use this as an example starting template:
            ```python
            def {function_name}(
                data_raw: List[Dict[str, Any]],
                target: str,
                max_runtime_secs: int,
                exclude_algos: List[str],
                balance_classes: bool,
                nfolds: int,
                seed: int,
                max_models: int,
                stopping_metric: str,
                stopping_tolerance: float,
                stopping_rounds: int,
                sort_metric: str ,
                model_directory: Optional[str] = None,
                log_path: Optional[str] = None,
                enable_mlflow: bool,
                mlflow_tracking_uri: Optional[str],
                mlflow_experiment_name: str,
                mlflow_run_name: str,
                **kwargs # Additional parameters for H2OAutoML (feel free to add these based on user instructions and recommended steps)
            ):

                import h2o
                from h2o.automl import H2OAutoML
                import pandas as pd
                import json

                # Optional MLflow usage
                if enable_mlflow:
                    import mlflow
                    if mlflow_tracking_uri:
                        mlflow.set_tracking_uri(mlflow_tracking_uri)
                    mlflow.set_experiment(mlflow_experiment_name)
                    run_context = mlflow.start_run(run_name=mlflow_run_name)
                else:
                    # Dummy context manager to skip MLflow if not enabled
                    from contextlib import nullcontext
                    run_context = nullcontext()

                exclude_algos = exclude_algos or ["DeepLearning"]  # default if not provided

                # Convert data to DataFrame
                df = pd.DataFrame(data_raw)

                with run_context as run:
                    # If using MLflow, track run ID
                    run_id = None
                    if enable_mlflow and run is not None:
                        run_id = run.info.run_id
                        import mlflow


                    # Initialize H2O
                    h2o.init()

                    # Create H2OFrame
                    data_h2o = h2o.H2OFrame(df)

                    # Setup AutoML
                    aml = H2OAutoML(
                        max_runtime_secs=max_runtime_secs,
                        exclude_algos=exclude_algos,
                        balance_classes=balance_classes,
                        nfolds=nfolds,
                        seed=seed,
                        max_models=max_models,
                        stopping_metric=stopping_metric,
                        stopping_tolerance=stopping_tolerance,
                        stopping_rounds=stopping_rounds,
                        sort_metric=sort_metric,
                        **kwargs
                    )

                    # Train
                    x = [col for col in data_h2o.columns if col != target]
                    aml.train(x=x, y=target, training_frame=data_h2o)

                    # Save model if we have a directory/log path
                    if model_directory is None and log_path is None:
                        model_path = None
                    else:
                        path_to_save = model_directory if model_directory else log_path
                        model_path = h2o.save_model(model=aml.leader, path=path_to_save, force=True)

                    # Leaderboard (DataFrame -> dict)
                    leaderboard_df = pd.DataFrame(aml.leaderboard)
                    leaderboard_dict = leaderboard_df.to_dict()

                    # Gather top-model metrics from the first row
                    top_metrics = leaderboard_df.iloc[0].to_dict()

                    # Construct model_results
                    model_results = dict(
                        model_flavor= "H2O AutoML",
                        model_path= model_path,
                        best_model_id= aml.leader.model_id,
                        metrics= top_metrics  # all metrics from the top row
                    )

                    # IMPORTANT: Log these to MLflow if enabled
                    if enable_mlflow and run is not None:

                        # Log the top metrics if numeric
                        numeric_metrics = {{k: v for k, v in top_metrics.items() if isinstance(v, (int, float))}}
                        mlflow.log_metrics(numeric_metrics)

                        # Log artifact if we saved the model
                        mlflow.h2o.log_model(aml.leader, artifact_path="model")

                        # Log the leaderboard
                        mlflow.log_table(leaderboard_dict, "leaderboard.json")

                        # Log these parameters (if specified)
                        mlflow.log_params(dict(
                            target= target,
                            max_runtime_secs= max_runtime_secs,
                            exclude_algos= str(exclude_algos),
                            balance_classes= balance_classes,
                            nfolds= nfolds,
                            seed= seed,
                            max_models= max_models,
                            stopping_metric= stopping_metric,
                            stopping_tolerance= stopping_tolerance,
                            stopping_rounds= stopping_rounds,
                            sort_metric= sort_metric,
                            model_directory= model_directory,
                            log_path= log_path
                        ))

                    # Build the output
                    output = dict(
                        leaderboard= leaderboard_dict,
                        best_model_id= aml.leader.model_id,
                        model_path= model_path,
                        model_results= model_results,
                        mlflow_run_id= run_id
                    )

                return output
            ```

            Avoid these errors:

            - WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.

            - 'list' object has no attribute 'tolist'

            - with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame() # Convert to pandas DataFrame using pd.DataFrame(h2o_df)

            - dtype is only supported for one column frames

            - h2o.is_running() module 'h2o' has no attribute 'is_running'. Solution: just do h2o.init() and it will check if H2O is running.


            """,
            input_variables=[
                "user_instructions",
                "function_name",
                "target_variable",
                "recommended_steps",
                "all_datasets_summary",
                "model_directory",
                "log_path",
                "enable_mlflow",
                "mlflow_tracking_uri",
                "mlflow_experiment_name",
                "mlflow_run_name",
            ]
        )

        recommended_steps = state.get("recommended_steps", "")
        h2o_code_agent = code_prompt | llm | PythonOutputParser()

        resp = h2o_code_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "function_name": function_name,
            "target_variable": state.get("target_variable"),
            "recommended_steps": recommended_steps,
            "all_datasets_summary": all_datasets_summary_str,
            "model_directory": model_directory,
            "log_path": log_path,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        })

        resp = relocate_imports_inside_function(resp)
        resp = add_comments_to_top(resp, agent_name=AGENT_NAME)

        # Log the code snippet if requested
        file_path, f_name = log_ai_function(
            response=resp,
            file_name=file_name,
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "h2o_train_function": resp,
            "h2o_train_function_path": file_path,
            "h2o_train_file_name": f_name,
            "h2o_train_function_name": function_name,
        }

    # Human Review
    prompt_text_human_review = "Are the following Machine Learning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"

    if not bypass_explain_code:
        # On approval, continue to the reporting node. The target must be a
        # name registered in node_functions below; the explain/report node is
        # registered as "report_agent_outputs".
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "report_agent_outputs"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto="report_agent_outputs",
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )
    else:
        # The explain/report step is bypassed, so approval ends the graph.
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "__end__"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto='__end__',
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )

    # 3) Execute code
    def execute_h2o_code(state: GraphState):
        """Run the generated function on the data and unpack its result dict."""
        result = node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            code_snippet_key="h2o_train_function",
            result_key="h2o_train_result",
            error_key="h2o_train_error",
            agent_function_name=state.get("h2o_train_function_name"),
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda x: x,
            error_message_prefix="Error occurred during H2O AutoML: "
        )

        # If no error, extract leaderboard, best_model_id, and model_path
        if not result["h2o_train_error"]:
            train_result = result["h2o_train_result"]
            if train_result and isinstance(train_result, dict):
                result["leaderboard"] = train_result.get("leaderboard", {})
                result["best_model_id"] = train_result.get("best_model_id", None)
                result["model_path"] = train_result.get("model_path", None)
                result["model_results"] = train_result.get("model_results", {})

        return result

    # 4) Fix code if there's an error
    def fix_h2o_code(state: GraphState):
        """Ask the LLM to repair the generated function using the last error."""
        fix_prompt = """
        You are an H2O AutoML agent. The function {function_name} currently has errors.
        Please fix it. Return only the corrected function in ```python``` format.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """
        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="h2o_train_function",
            error_key="h2o_train_error",
            llm=llm,
            prompt_template=fix_prompt,
            agent_name=AGENT_NAME,
            file_path=state.get("h2o_train_function_path"),
            function_name=state.get("h2o_train_function_name"),
            log=log
        )

    # 5) Final reporting node
    def report_agent_outputs(state: GraphState):
        """Collect the selected state keys into a report stored under "messages"."""
        return node_func_report_agent_outputs(
            state=state,
            keys_to_include=[
                "recommended_steps",
                "h2o_train_function",
                "h2o_train_function_path",
                "h2o_train_function_name",
                "h2o_train_error",
                "model_path",
                "best_model_id",
            ],
            result_key="messages",
            role=AGENT_NAME,
            custom_title="H2O Machine Learning Agent Outputs"
        )

    node_functions = {
        "recommend_ml_steps": recommend_ml_steps,
        "human_review": human_review,
        "create_h2o_code": create_h2o_code,
        "execute_h2o_code": execute_h2o_code,
        "fix_h2o_code": fix_h2o_code,
        "report_agent_outputs": report_agent_outputs,
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_ml_steps",
        create_code_node_name="create_h2o_code",
        execute_code_node_name="execute_h2o_code",
        fix_code_node_name="fix_h2o_code",
        explain_code_node_name="report_agent_outputs",
        error_key="h2o_train_error",
        max_retries_key="max_retries",
        retry_count_key="retry_count",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        checkpointer=MemorySaver(),
        bypass_recommended_steps=bypass_recommended_steps,
        bypass_explain_code=bypass_explain_code,
    )

    return app
|
852
|
+
|