ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +0 -1
- ai_data_science_team/agents/data_cleaning_agent.py +50 -39
- ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
- ai_data_science_team/agents/data_visualization_agent.py +45 -50
- ai_data_science_team/agents/data_wrangling_agent.py +50 -49
- ai_data_science_team/agents/feature_engineering_agent.py +48 -67
- ai_data_science_team/agents/sql_database_agent.py +130 -76
- ai_data_science_team/ml_agents/__init__.py +2 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
- ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
- ai_data_science_team/parsers/__init__.py +0 -0
- ai_data_science_team/{tools → parsers}/parsers.py +0 -1
- ai_data_science_team/templates/__init__.py +1 -0
- ai_data_science_team/templates/agent_templates.py +78 -7
- ai_data_science_team/tools/data_loader.py +378 -0
- ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
- ai_data_science_team/tools/h2o.py +643 -0
- ai_data_science_team/tools/mlflow.py +961 -0
- ai_data_science_team/tools/sql.py +126 -0
- ai_data_science_team/{tools → utils}/regex.py +59 -1
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
- ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
- ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
- /ai_data_science_team/{tools → utils}/logging.py +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
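Several helper modules move in this release (parsers.py leaves tools/, regex.py and logging.py move to utils/, and metadata.py becomes dataframe.py). If downstream code imported these helpers directly, the import paths change roughly as sketched below; the pre-0.0.0.9010 paths are inferred from the renames listed above, and the symbol names are taken from the new h2o_ml_agent.py shown later in this diff.

```python
# Module reorganization in 0.0.0.9010 (paths taken from the file moves listed above).
# Old paths are inferred from the pre-move locations; symbol names are the ones
# the new h2o_ml_agent.py imports.

# Before (0.0.0.9008) -- inferred:
# from ai_data_science_team.tools.parsers import PythonOutputParser
# from ai_data_science_team.tools.regex import format_agent_name
# from ai_data_science_team.tools.logging import log_ai_function
# from ai_data_science_team.tools.metadata import get_dataframe_summary

# After (0.0.0.9010):
from ai_data_science_team.parsers.parsers import PythonOutputParser
from ai_data_science_team.utils.regex import format_agent_name
from ai_data_science_team.utils.logging import log_ai_function
from ai_data_science_team.tools.dataframe import get_dataframe_summary
```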
ai_data_science_team/ml_agents/h2o_ml_agent.py (new file)
@@ -0,0 +1,852 @@
# BUSINESS SCIENCE UNIVERSITY
# AI DATA SCIENCE TEAM
# ***
# * Agents: H2O Machine Learning Agent

import os
import json
from typing import TypedDict, Annotated, Sequence, Literal
import operator

import pandas as pd
from IPython.display import Markdown

from langchain.prompts import PromptTemplate
from langchain_core.messages import BaseMessage

from langgraph.types import Command
from langgraph.checkpoint.memory import MemorySaver

from ai_data_science_team.templates import (
    node_func_execute_agent_code_on_data,
    node_func_human_review,
    node_func_fix_agent_code,
    node_func_report_agent_outputs,
    create_coding_agent_graph,
    BaseAgent,
)
from ai_data_science_team.parsers.parsers import PythonOutputParser
from ai_data_science_team.utils.regex import (
    relocate_imports_inside_function,
    add_comments_to_top,
    format_agent_name,
    format_recommended_steps,
    get_generic_summary,
)
from ai_data_science_team.tools.dataframe import get_dataframe_summary
from ai_data_science_team.utils.logging import log_ai_function
from ai_data_science_team.tools.h2o import H2O_AUTOML_DOCUMENTATION

AGENT_NAME = "h2o_ml_agent"
LOG_PATH = os.path.join(os.getcwd(), "logs/")

class H2OMLAgent(BaseAgent):
    """
    A Machine Learning agent that uses H2O's AutoML for training,
    allowing the user to specify a model directory for saving the best model.
    If neither model_directory nor log_path is provided, model saving is skipped.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the ML code.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the Python file for saving the generated code. Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the function that performs the AutoML training. Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the H2O Machine Learning model. If None, defaults to log_path (if available).
        If both are None, no model is saved. Defaults to None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of the code. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended steps prompt. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the code-explanation step. Defaults to False.
    enable_mlflow : bool, default False
        Whether to enable MLflow logging. If False, MLflow is skipped entirely.
    mlflow_tracking_uri : str or None
        If provided, sets the MLflow tracking URI at runtime.
    mlflow_experiment_name : str
        Name of the MLflow experiment (created if it doesn't exist).
    mlflow_run_name : str, default None
        A custom name for the MLflow run.


    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(user_instructions, data_raw, target_variable, ...)
        Asynchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
    invoke_agent(user_instructions, data_raw, target_variable, ...)
        Synchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
    get_leaderboard()
        Retrieves the H2O AutoML leaderboard from the agent's response.
    get_best_model_id()
        Retrieves the best model ID from the agent's response.
    get_model_path()
        Retrieves the saved model path from the agent's response (or None if not saved).
    get_data_raw()
        Retrieves the raw data as a DataFrame from the agent's response.
    get_h2o_train_function(markdown=False)
        Retrieves the H2O AutoML function code generated by the agent.
    get_recommended_ml_steps(markdown=False)
        Retrieves recommended ML steps from the agent's response.
    get_workflow_summary()
        Retrieves a summary of the agent's workflow.
    get_response()
        Returns the entire response dictionary.
    show()
        Visualizes the compiled graph as a Mermaid diagram.

    Examples
    --------
    ```python
    from langchain_openai import ChatOpenAI
    import pandas as pd
    from ai_data_science_team.ml_agents import H2OMLAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    df = pd.read_csv("data/churn_data.csv")

    ml_agent = H2OMLAgent(
        model=llm,
        log=True,
        log_path=LOG_PATH,
        model_directory=MODEL_PATH,
    )

    ml_agent.invoke_agent(
        data_raw=df.drop(columns=["customerID"]),
        user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
        target_variable="Churn"
    )

    # Retrieve and display the leaderboard of models
    ml_agent.get_leaderboard()

    # Get the H2O training function in markdown format
    ml_agent.get_h2o_train_function(markdown=True)

    # Get the recommended machine learning steps in markdown format
    ml_agent.get_recommended_ml_steps(markdown=True)

    # Get a summary of the workflow in markdown format
    ml_agent.get_workflow_summary(markdown=True)

    # Get a summary of the logs in markdown format
    ml_agent.get_log_summary(markdown=True)

    # Get the path to the saved model
    model_path = ml_agent.get_model_path()
    model_path
    ```

    Returns
    -------
    H2OMLAgent : langchain.graphs.CompiledStateGraph
        An instance of the H2O ML agent.

    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="h2o_automl.py",
        function_name="h2o_automl",
        model_directory=None,
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False,
        enable_mlflow=False,
        mlflow_tracking_uri=None,
        mlflow_experiment_name="H2O AutoML",
        mlflow_run_name=None,
    ):
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "model_directory": model_directory,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Creates the compiled graph for the agent.
        """
        self.response = None
        return make_h2o_ml_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.
        """
        for k, v in kwargs.items():
            self._params[k] = v
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.
        """
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def get_leaderboard(self):
        """Returns the H2O AutoML leaderboard as a DataFrame."""
        if self.response and "leaderboard" in self.response:
            return pd.DataFrame(self.response["leaderboard"])
        return None

    def get_best_model_id(self):
        """Returns the best model id from the AutoML run."""
        if self.response and "best_model_id" in self.response:
            return self.response["best_model_id"]
        return None

    def get_model_path(self):
        """Returns the file path to the saved best model, or None if not saved."""
        if self.response and "model_path" in self.response:
            return self.response["model_path"]
        return None

    def get_data_raw(self):
        """Retrieves the raw data as a DataFrame from the response."""
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_h2o_train_function(self, markdown=False):
        """Retrieves the H2O AutoML function code generated by the agent."""
        if self.response and "h2o_train_function" in self.response:
            code = self.response["h2o_train_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_ml_steps(self, markdown=False):
        """Retrieves recommended ML steps from the agent's response."""
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Retrieves the agent's workflow summary, if logging is enabled.
        """
        if self.response and self.response.get("messages"):
            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
            if markdown:
                return Markdown(summary)
            else:
                return summary

    def get_log_summary(self, markdown=False):
        """
        Logs a summary of the agent's operations, if logging is enabled.
        """
        if self.response:
            if self.response.get('h2o_train_function_path'):
                log_details = f"""
## H2O Machine Learning Agent Log Summary:

Function Path: {self.response.get('h2o_train_function_path')}

Function Name: {self.response.get('h2o_train_function_name')}

Best Model ID: {self.get_best_model_id()}

Model Path: {self.get_model_path()}
                """
                if markdown:
                    return Markdown(log_details)
                else:
                    return log_details


def make_h2o_ml_agent(
    model,
    n_samples=30,
    log=False,
    log_path=None,
    file_name="h2o_automl.py",
    function_name="h2o_automl",
    model_directory=None,
    overwrite=True,
    human_in_the_loop=False,
    bypass_recommended_steps=False,
    bypass_explain_code=False,
    enable_mlflow=False,
    mlflow_tracking_uri=None,
    mlflow_experiment_name="H2O AutoML",
    mlflow_run_name=None,
):
    """
    Creates a machine learning agent that uses H2O for AutoML.
    The agent will:
    1. Optionally recommend ML steps,
    2. Create Python code that sets up H2OAutoML,
    3. Execute that code (optionally saving the best model to disk),
    4. Fix errors if needed,
    5. Optionally explain the code.

    model_directory: Directory to save the model.
        If None, defaults to log_path.
        If both are None, skip saving.
    """

    llm = model

    # Handle logging directory
    if log:
        if log_path is None:
            log_path = "logs/"
        if not os.path.exists(log_path):
            os.makedirs(log_path)

    # Check if H2O is installed
    try:
        import h2o
        from h2o.automl import H2OAutoML
    except ImportError as e:
        raise ImportError(
            "The 'h2o' library is not installed. Please install it using pip:\n\n"
            "    pip install h2o\n\n"
            "Visit https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html for details."
        ) from e

    # Define GraphState
    class GraphState(TypedDict):
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        data_raw: dict
        leaderboard: dict
        best_model_id: str
        model_path: str
        model_results: dict
        target_variable: str
        all_datasets_summary: str
        h2o_train_function: str
        h2o_train_function_path: str
        h2o_train_file_name: str
        h2o_train_function_name: str
        h2o_train_error: str
        max_retries: int
        retry_count: int

    # 1) Recommend ML steps (optional)
    def recommend_ml_steps(state: GraphState):
        print(format_agent_name(AGENT_NAME))
        print("    * RECOMMEND MACHINE LEARNING STEPS")

        recommend_steps_prompt = PromptTemplate(
            template="""
            You are an AutoML Expert using H2O.

            We have the following dataset summary, user instructions, and H2O AutoML documentation:

            User instructions:
            {user_instructions}

            Data Summary:
            {all_datasets_summary}

            H2O AutoML Documentation:
            {h2o_automl_documentation}

            Please recommend a short list of steps or considerations for performing H2OAutoML on this data. Specifically focus on maximizing model accuracy while remaining flexible to user instructions and the dataset.

            - Recommend any parameters and values that might improve performance (predictive accuracy).
            - Recommend the Loss Function, Stopping Criteria, and other advanced parameters.
            - Use the H2O AutoML documentation to your advantage.
            - Exclude deep learning algorithms since these are typically low performance.

            Avoid these:

            - Do not perform data cleaning or feature engineering here. We will handle that separately.
            - Do not limit memory size or CPU usage unless the user specifies it.

            Return as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The H2O AutoML code will be generated separately by a Coding Agent.
            """,
            input_variables=["user_instructions", "all_datasets_summary", "h2o_automl_documentation"]
        )

        data_raw = state.get("data_raw")
        df = pd.DataFrame.from_dict(data_raw)
        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
        all_datasets_summary_str = "\n\n".join(all_datasets_summary)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "all_datasets_summary": all_datasets_summary_str,
            "h2o_automl_documentation": H2O_AUTOML_DOCUMENTATION
        })

        return {
            "recommended_steps": format_recommended_steps(
                recommended_steps.content.strip(),
                heading="# Recommended ML Steps:"
            ),
            "all_datasets_summary": all_datasets_summary_str
        }

    # 2) Create code
    def create_h2o_code(state: GraphState):
        if bypass_recommended_steps:
            print(format_agent_name(AGENT_NAME))

            data_raw = state.get("data_raw")
            df = pd.DataFrame.from_dict(data_raw)
            all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
            all_datasets_summary_str = "\n\n".join(all_datasets_summary)
        else:
            all_datasets_summary_str = state.get("all_datasets_summary")

        print("    * CREATE H2O AUTOML CODE")

        code_prompt = PromptTemplate(
            template="""
            You are an H2O AutoML agent. Create a Python function named {function_name}(data_raw)
            that runs H2OAutoML on the provided data with a focus on maximizing model accuracy and
            incorporating user instructions for flexibility.

            Do not perform substantial data cleaning or feature engineering here. We will handle that separately.

            We have two variables for deciding where to save the model:
            model_directory = {model_directory}
            log_path = {log_path}

            IMPORTANT: MLflow Parameters if the user wants to enable MLflow with H2O AutoML:
            enable_mlflow: {enable_mlflow}
            mlflow_tracking_uri: {mlflow_tracking_uri}
            mlflow_experiment_name: {mlflow_experiment_name}
            mlflow_run_name: {mlflow_run_name}


            Additional Requirements:
            - Convert `data_raw` (pandas DataFrame) into an H2OFrame.
            - Identify the target variable from {target_variable} (if provided).
            - Start H2O if not already started.
            - Use Recommended Steps to guide any advanced parameters (e.g., cross-validation folds,
              balancing classes, extended training time, stacking) that might improve performance.
            - If the user does not specify anything special, use H2OAutoML defaults (including stacked ensembles).
            - Focus on maximizing accuracy (or the most relevant metric if it's not classification)
              while remaining flexible to user instructions.
            - Return a dict with keys: leaderboard, best_model_id, model_path, and model_results.
            - If enable_mlflow is True, log the top metrics and save the model as an artifact. (See example function)

            Initial User Instructions (Disregard any instructions that are unrelated to modeling):
            {user_instructions}

            Recommended Steps:
            {recommended_steps}

            Data summary for reference:
            {all_datasets_summary}

            Return only code in ```python``` with a single function definition. Use this as an example starting template:
            ```python
            def {function_name}(
                data_raw: List[Dict[str, Any]],
                target: str,
                max_runtime_secs: int,
                exclude_algos: List[str],
                balance_classes: bool,
                nfolds: int,
                seed: int,
                max_models: int,
                stopping_metric: str,
                stopping_tolerance: float,
                stopping_rounds: int,
                sort_metric: str,
                model_directory: Optional[str] = None,
                log_path: Optional[str] = None,
                enable_mlflow: bool,
                mlflow_tracking_uri: Optional[str],
                mlflow_experiment_name: str,
                mlflow_run_name: str,
                **kwargs  # Additional parameters for H2OAutoML (feel free to add these based on user instructions and recommended steps)
            ):

                import h2o
                from h2o.automl import H2OAutoML
                import pandas as pd
                import json

                # Optional MLflow usage
                if enable_mlflow:
                    import mlflow
                    if mlflow_tracking_uri:
                        mlflow.set_tracking_uri(mlflow_tracking_uri)
                    mlflow.set_experiment(mlflow_experiment_name)
                    run_context = mlflow.start_run(run_name=mlflow_run_name)
                else:
                    # Dummy context manager to skip MLflow if not enabled
                    from contextlib import nullcontext
                    run_context = nullcontext()

                exclude_algos = exclude_algos or ["DeepLearning"]  # default if not provided

                # Convert data to DataFrame
                df = pd.DataFrame(data_raw)

                with run_context as run:
                    # If using MLflow, track run ID
                    run_id = None
                    if enable_mlflow and run is not None:
                        run_id = run.info.run_id
                        import mlflow


                    # Initialize H2O
                    h2o.init()

                    # Create H2OFrame
                    data_h2o = h2o.H2OFrame(df)

                    # Setup AutoML
                    aml = H2OAutoML(
                        max_runtime_secs=max_runtime_secs,
                        exclude_algos=exclude_algos,
                        balance_classes=balance_classes,
                        nfolds=nfolds,
                        seed=seed,
                        max_models=max_models,
                        stopping_metric=stopping_metric,
                        stopping_tolerance=stopping_tolerance,
                        stopping_rounds=stopping_rounds,
                        sort_metric=sort_metric,
                        **kwargs
                    )

                    # Train
                    x = [col for col in data_h2o.columns if col != target]
                    aml.train(x=x, y=target, training_frame=data_h2o)

                    # Save model if we have a directory/log path
                    if model_directory is None and log_path is None:
                        model_path = None
                    else:
                        path_to_save = model_directory if model_directory else log_path
                        model_path = h2o.save_model(model=aml.leader, path=path_to_save, force=True)

                    # Leaderboard (DataFrame -> dict)
                    leaderboard_df = pd.DataFrame(aml.leaderboard)
                    leaderboard_dict = leaderboard_df.to_dict()

                    # Gather top-model metrics from the first row
                    top_metrics = leaderboard_df.iloc[0].to_dict()

                    # Construct model_results
                    model_results = dict(
                        model_flavor="H2O AutoML",
                        model_path=model_path,
                        best_model_id=aml.leader.model_id,
                        metrics=top_metrics  # all metrics from the top row
                    )

                    # IMPORTANT: Log these to MLflow if enabled
                    if enable_mlflow and run is not None:

                        # Log the top metrics if numeric
                        numeric_metrics = {{k: v for k, v in top_metrics.items() if isinstance(v, (int, float))}}
                        mlflow.log_metrics(numeric_metrics)

                        # Log artifact if we saved the model
                        mlflow.h2o.log_model(aml.leader, artifact_path="model")

                        # Log the leaderboard
                        mlflow.log_table(leaderboard_dict, "leaderboard.json")

                        # Log these parameters (if specified)
                        mlflow.log_params(dict(
                            target=target,
                            max_runtime_secs=max_runtime_secs,
                            exclude_algos=str(exclude_algos),
                            balance_classes=balance_classes,
                            nfolds=nfolds,
                            seed=seed,
                            max_models=max_models,
                            stopping_metric=stopping_metric,
                            stopping_tolerance=stopping_tolerance,
                            stopping_rounds=stopping_rounds,
                            sort_metric=sort_metric,
                            model_directory=model_directory,
                            log_path=log_path
                        ))

                    # Build the output
                    output = dict(
                        leaderboard=leaderboard_dict,
                        best_model_id=aml.leader.model_id,
                        model_path=model_path,
                        model_results=model_results,
                        mlflow_run_id=run_id
                    )

                return output
            ```

            Avoid these errors:

            - WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.

            - 'list' object has no attribute 'tolist'

            - with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame()  # Convert to pandas DataFrame using pd.DataFrame(h2o_df)

            - dtype is only supported for one column frames

            - h2o.is_running() module 'h2o' has no attribute 'is_running'. Solution: just do h2o.init() and it will check if H2O is running.


            """,
            input_variables=[
                "user_instructions",
                "function_name",
                "target_variable",
                "recommended_steps",
                "all_datasets_summary",
                "model_directory",
                "log_path",
                "enable_mlflow",
                "mlflow_tracking_uri",
                "mlflow_experiment_name",
                "mlflow_run_name",
            ]
        )

        recommended_steps = state.get("recommended_steps", "")
        h2o_code_agent = code_prompt | llm | PythonOutputParser()

        resp = h2o_code_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "function_name": function_name,
            "target_variable": state.get("target_variable"),
            "recommended_steps": recommended_steps,
            "all_datasets_summary": all_datasets_summary_str,
            "model_directory": model_directory,
            "log_path": log_path,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        })

        resp = relocate_imports_inside_function(resp)
        resp = add_comments_to_top(resp, agent_name=AGENT_NAME)

        # Log the code snippet if requested
        file_path, f_name = log_ai_function(
            response=resp,
            file_name=file_name,
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "h2o_train_function": resp,
            "h2o_train_function_path": file_path,
            "h2o_train_file_name": f_name,
            "h2o_train_function_name": function_name,
        }

    # Human Review
    prompt_text_human_review = "Are the following Machine Learning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"

    if not bypass_explain_code:
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "explain_h2o_code"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto='explain_h2o_code',
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )
    else:
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "__end__"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto='__end__',
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )

    # 3) Execute code
    def execute_h2o_code(state):
        result = node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            code_snippet_key="h2o_train_function",
            result_key="h2o_train_result",
            error_key="h2o_train_error",
            agent_function_name=state.get("h2o_train_function_name"),
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda x: x,
            error_message_prefix="Error occurred during H2O AutoML: "
        )

        # If no error, extract leaderboard, best_model_id, and model_path
        if not result["h2o_train_error"]:
            if result["h2o_train_result"] and isinstance(result["h2o_train_result"], dict):
                lb = result["h2o_train_result"].get("leaderboard", {})
                best_id = result["h2o_train_result"].get("best_model_id", None)
                mpath = result["h2o_train_result"].get("model_path", None)
                model_results = result["h2o_train_result"].get("model_results", {})

                result["leaderboard"] = lb
                result["best_model_id"] = best_id
                result["model_path"] = mpath
                result["model_results"] = model_results

        return result

    # 4) Fix code if there's an error
    def fix_h2o_code(state: GraphState):
        fix_prompt = """
        You are an H2O AutoML agent. The function {function_name} currently has errors.
        Please fix it. Return only the corrected function in ```python``` format.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """
        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="h2o_train_function",
            error_key="h2o_train_error",
            llm=llm,
            prompt_template=fix_prompt,
            agent_name=AGENT_NAME,
            file_path=state.get("h2o_train_function_path"),
            function_name=state.get("h2o_train_function_name"),
            log=log
        )

    # 5) Final reporting node
    def report_agent_outputs(state: GraphState):
        return node_func_report_agent_outputs(
            state=state,
            keys_to_include=[
                "recommended_steps",
                "h2o_train_function",
                "h2o_train_function_path",
                "h2o_train_function_name",
                "h2o_train_error",
                "model_path",
                "best_model_id",
            ],
            result_key="messages",
            role=AGENT_NAME,
            custom_title="H2O Machine Learning Agent Outputs"
        )

    node_functions = {
        "recommend_ml_steps": recommend_ml_steps,
        "human_review": human_review,
        "create_h2o_code": create_h2o_code,
        "execute_h2o_code": execute_h2o_code,
        "fix_h2o_code": fix_h2o_code,
        "report_agent_outputs": report_agent_outputs,
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_ml_steps",
        create_code_node_name="create_h2o_code",
        execute_code_node_name="execute_h2o_code",
        fix_code_node_name="fix_h2o_code",
        explain_code_node_name="report_agent_outputs",
        error_key="h2o_train_error",
        max_retries_key="max_retries",
        retry_count_key="retry_count",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        checkpointer=MemorySaver(),
        bypass_recommended_steps=bypass_recommended_steps,
        bypass_explain_code=bypass_explain_code,
    )

    return app
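The class docstring example above does not exercise the new MLflow options, so here is a minimal, hedged sketch of what enabling them could look like. It uses only the constructor arguments and accessor methods defined in the file above; the tracking URI, experiment and run names, and dataset path are illustrative assumptions, not values from the package. Per the prompt template above, the generated h2o_automl() function is expected to wrap training in an mlflow.start_run() context and log the leaderboard metrics, parameters, and leader model.

```python
# Hedged usage sketch (not from the package docs): enabling MLflow logging on H2OMLAgent.
# Parameter names come from the __init__ signature above; the tracking URI, experiment
# name, run name, and CSV path are placeholder assumptions.
import pandas as pd
from langchain_openai import ChatOpenAI
from ai_data_science_team.ml_agents import H2OMLAgent

llm = ChatOpenAI(model="gpt-4o-mini")
df = pd.read_csv("data/churn_data.csv")  # placeholder dataset

ml_agent = H2OMLAgent(
    model=llm,
    model_directory="models/",                    # where h2o.save_model() writes the leader
    enable_mlflow=True,                           # log metrics, params, and the model to MLflow
    mlflow_tracking_uri="http://localhost:5000",  # assumption: local tracking server
    mlflow_experiment_name="H2O AutoML",          # default experiment name from __init__
    mlflow_run_name="churn-automl",               # assumption: illustrative run name
)

ml_agent.invoke_agent(
    data_raw=df.drop(columns=["customerID"]),
    user_instructions="Classification on 'Churn'. Max runtime 30 seconds.",
    target_variable="Churn",
)

print(ml_agent.get_best_model_id())  # leader model id from the AutoML run
print(ml_agent.get_model_path())     # saved model path, or None if saving was skipped
```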