ai-data-science-team 0.0.0.9007__py3-none-any.whl → 0.0.0.9009__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- ai_data_science_team/_version.py +1 -1
- ai_data_science_team/agents/__init__.py +4 -5
- ai_data_science_team/agents/data_cleaning_agent.py +268 -116
- ai_data_science_team/agents/data_visualization_agent.py +470 -41
- ai_data_science_team/agents/data_wrangling_agent.py +471 -31
- ai_data_science_team/agents/feature_engineering_agent.py +426 -41
- ai_data_science_team/agents/sql_database_agent.py +458 -58
- ai_data_science_team/ml_agents/__init__.py +1 -0
- ai_data_science_team/ml_agents/h2o_ml_agent.py +1032 -0
- ai_data_science_team/multiagents/__init__.py +1 -0
- ai_data_science_team/multiagents/sql_data_analyst.py +398 -0
- ai_data_science_team/multiagents/supervised_data_analyst.py +2 -0
- ai_data_science_team/templates/__init__.py +3 -1
- ai_data_science_team/templates/agent_templates.py +319 -43
- ai_data_science_team/tools/metadata.py +94 -62
- ai_data_science_team/tools/regex.py +86 -1
- ai_data_science_team/utils/__init__.py +0 -0
- ai_data_science_team/utils/plotly.py +24 -0
- ai_data_science_team-0.0.0.9009.dist-info/METADATA +245 -0
- ai_data_science_team-0.0.0.9009.dist-info/RECORD +28 -0
- ai_data_science_team-0.0.0.9007.dist-info/METADATA +0 -183
- ai_data_science_team-0.0.0.9007.dist-info/RECORD +0 -21
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/LICENSE +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/WHEEL +0 -0
- {ai_data_science_team-0.0.0.9007.dist-info → ai_data_science_team-0.0.0.9009.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1032 @@
|
|
1
|
+
# BUSINESS SCIENCE UNIVERSITY
|
2
|
+
# AI DATA SCIENCE TEAM
|
3
|
+
# ***
|
4
|
+
# * Agents: H2O Machine Learning Agent
|
5
|
+
|
6
|
+
import os
|
7
|
+
import json
|
8
|
+
from typing import TypedDict, Annotated, Sequence, Literal
|
9
|
+
import operator
|
10
|
+
|
11
|
+
import pandas as pd
|
12
|
+
from IPython.display import Markdown
|
13
|
+
|
14
|
+
from langchain.prompts import PromptTemplate
|
15
|
+
from langchain_core.messages import BaseMessage
|
16
|
+
|
17
|
+
from langgraph.types import Command
|
18
|
+
from langgraph.checkpoint.memory import MemorySaver
|
19
|
+
|
20
|
+
from ai_data_science_team.templates import(
|
21
|
+
node_func_execute_agent_code_on_data,
|
22
|
+
node_func_human_review,
|
23
|
+
node_func_fix_agent_code,
|
24
|
+
node_func_report_agent_outputs,
|
25
|
+
create_coding_agent_graph,
|
26
|
+
BaseAgent,
|
27
|
+
)
|
28
|
+
from ai_data_science_team.tools.parsers import PythonOutputParser
|
29
|
+
from ai_data_science_team.tools.regex import (
|
30
|
+
relocate_imports_inside_function,
|
31
|
+
add_comments_to_top,
|
32
|
+
format_agent_name,
|
33
|
+
format_recommended_steps,
|
34
|
+
get_generic_summary,
|
35
|
+
)
|
36
|
+
from ai_data_science_team.tools.metadata import get_dataframe_summary
|
37
|
+
from ai_data_science_team.tools.logging import log_ai_function
|
38
|
+
|
39
|
+
AGENT_NAME = "h2o_ml_agent"
|
40
|
+
LOG_PATH = os.path.join(os.getcwd(), "logs/")
|
41
|
+
|
42
|
+
class H2OMLAgent(BaseAgent):
    """
    A Machine Learning agent that uses H2O's AutoML for training,
    allowing the user to specify a model directory for saving the best model.
    If neither model_directory nor log_path is provided, model saving is skipped.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the ML code.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the Python file for saving the generated code. Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the function that performs the AutoML training. Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the model. If None, defaults to log_path (if available).
        If both are None, no model is saved. Defaults to None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of the code. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended steps prompt. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the code-explanation step. Defaults to False.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled state graph.
    ainvoke_agent(user_instructions, data_raw, target_variable, ...)
        Asynchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
        This is a coroutine and must be awaited.
    invoke_agent(user_instructions, data_raw, target_variable, ...)
        Synchronously runs the agent to produce an H2O AutoML model, optionally saving the model to disk.
    get_leaderboard()
        Retrieves the H2O AutoML leaderboard from the agent's response.
    get_best_model_id()
        Retrieves the best model ID from the agent's response.
    get_model_path()
        Retrieves the saved model path from the agent's response (or None if not saved).
    get_data_raw()
        Retrieves the raw data as a DataFrame from the agent's response.
    get_h2o_train_function(markdown=False)
        Retrieves the H2O AutoML function code generated by the agent.
    get_recommended_ml_steps(markdown=False)
        Retrieves recommended ML steps from the agent's response.
    get_workflow_summary(markdown=False)
        Retrieves a summary of the agent's workflow.
    get_log_summary(markdown=False)
        Retrieves a summary of where the generated function and model were stored.
    get_response()
        Returns the entire response dictionary.
    show()
        Visualizes the compiled graph as a Mermaid diagram.

    Examples
    --------
    ```python
    from langchain_openai import ChatOpenAI
    import pandas as pd
    from ai_data_science_team.ml_agents import H2OMLAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    df = pd.read_csv("data/churn_data.csv")

    # Any writable directories will do; these names are only illustrative.
    LOG_PATH = "logs/"
    MODEL_PATH = "h2o_models/"

    ml_agent = H2OMLAgent(
        model=llm,
        log=True,
        log_path=LOG_PATH,
        model_directory=MODEL_PATH,
    )

    ml_agent.invoke_agent(
        data_raw=df.drop(columns=["customerID"]),
        user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
        target_variable="Churn"
    )

    # Retrieve and display the leaderboard of models
    ml_agent.get_leaderboard()

    # Get the H2O training function in markdown format
    ml_agent.get_h2o_train_function(markdown=True)

    # Get the recommended machine learning steps in markdown format
    ml_agent.get_recommended_ml_steps(markdown=True)

    # Get a summary of the workflow in markdown format
    ml_agent.get_workflow_summary(markdown=True)

    # Get a summary of the logs in markdown format
    ml_agent.get_log_summary(markdown=True)

    # Get the path to the saved model
    model_path = ml_agent.get_model_path()
    model_path
    ```

    Returns
    -------
    H2OMLAgent : langchain.graphs.CompiledStateGraph
        An instance of the H2O ML agent.

    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="h2o_automl.py",
        function_name="h2o_automl",
        model_directory=None,
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False
    ):
        # The parameters are kept in a dict so that update_params() can patch
        # individual entries and rebuild the graph from the same source.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "model_directory": model_directory,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Creates the compiled graph for the H2O ML agent.

        Any previously stored response is discarded, because it was produced
        by a graph built from the old parameters.
        """
        self.response = None
        return make_h2o_ml_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.

        Parameters
        ----------
        **kwargs
            Parameter names and their new values; each one overwrites (or adds)
            the matching entry before the graph is rebuilt.
        """
        for k, v in kwargs.items():
            self._params[k] = v
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.

        This is a coroutine: call it as ``await agent.ainvoke_agent(...)``.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset to model. It is converted with ``to_dict()`` before
            being placed in the graph state.
        user_instructions : str, optional
            Free-text modelling instructions forwarded to the prompts.
        target_variable : str, optional
            Name of the column to predict.
        max_retries : int, optional
            Initial value of the ``max_retries`` state key. Defaults to 3.
        retry_count : int, optional
            Initial value of the ``retry_count`` state key. Defaults to 0.
        **kwargs
            Extra keyword arguments passed through to the graph's ``ainvoke``.

        Returns
        -------
        None
            The graph output is stored on ``self.response``.
        """
        # ainvoke() returns a coroutine; it has to be awaited so that
        # self.response holds the final state rather than a pending coroutine.
        response = await self._compiled_graph.ainvoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously trains an H2O AutoML model for the provided dataset,
        saving the best model to disk if model_directory or log_path is available.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset to model. It is converted with ``to_dict()`` before
            being placed in the graph state.
        user_instructions : str, optional
            Free-text modelling instructions forwarded to the prompts.
        target_variable : str, optional
            Name of the column to predict.
        max_retries : int, optional
            Initial value of the ``max_retries`` state key. Defaults to 3.
        retry_count : int, optional
            Initial value of the ``retry_count`` state key. Defaults to 0.
        **kwargs
            Extra keyword arguments passed through to the graph's ``invoke``.

        Returns
        -------
        None
            The graph output is stored on ``self.response``.
        """
        response = self._compiled_graph.invoke({
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }, **kwargs)
        self.response = response
        return None

    def get_leaderboard(self):
        """Returns the H2O AutoML leaderboard as a DataFrame, or None if unavailable."""
        if self.response and "leaderboard" in self.response:
            return pd.DataFrame(self.response["leaderboard"])
        return None

    def get_best_model_id(self):
        """Returns the best model id from the AutoML run, or None if unavailable."""
        if self.response and "best_model_id" in self.response:
            return self.response["best_model_id"]
        return None

    def get_model_path(self):
        """Returns the file path to the saved best model, or None if not saved."""
        if self.response and "model_path" in self.response:
            return self.response["model_path"]
        return None

    def get_data_raw(self):
        """Retrieves the raw data as a DataFrame from the response, or None if unavailable."""
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_h2o_train_function(self, markdown=False):
        """
        Retrieves the H2O AutoML function code generated by the agent.

        Parameters
        ----------
        markdown : bool, optional
            If True, the code is wrapped in a fenced python block and returned
            as a ``Markdown`` object. Otherwise the raw string is returned.

        Returns
        -------
        str, Markdown or None
            None when no response (or no generated function) is available.
        """
        if self.response and "h2o_train_function" in self.response:
            code = self.response["h2o_train_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_ml_steps(self, markdown=False):
        """
        Retrieves recommended ML steps from the agent's response.

        Parameters
        ----------
        markdown : bool, optional
            If True, the steps are returned as a ``Markdown`` object.

        Returns
        -------
        str, Markdown or None
            None when no response (or no recommended steps) is available.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Retrieves the agent's workflow summary.

        The summary is built by JSON-decoding the content of the last message
        in the response and passing it to ``get_generic_summary``.

        Parameters
        ----------
        markdown : bool, optional
            If True, the summary is returned as a ``Markdown`` object.

        Returns
        -------
        str, Markdown or None
            None when there is no response or it contains no messages.
        """
        if self.response and self.response.get("messages"):
            summary = get_generic_summary(json.loads(self.response.get("messages")[-1].content))
            if markdown:
                return Markdown(summary)
            else:
                return summary
        return None

    def get_log_summary(self, markdown=False):
        """
        Returns a summary of where the generated function and model were stored.

        A summary is only produced when the response contains a truthy
        ``h2o_train_function_path``.

        Parameters
        ----------
        markdown : bool, optional
            If True, the summary is returned as a ``Markdown`` object.

        Returns
        -------
        str, Markdown or None
            None when there is no response or no function path was recorded.
        """
        if self.response:
            if self.response.get('h2o_train_function_path'):
                # Kept flush-left: the text is meant to be rendered as Markdown.
                log_details = f"""
## H2O Machine Learning Agent Log Summary:

Function Path: {self.response.get('h2o_train_function_path')}

Function Name: {self.response.get('h2o_train_function_name')}

Best Model ID: {self.get_best_model_id()}

Model Path: {self.get_model_path()}
"""
                if markdown:
                    return Markdown(log_details)
                else:
                    return log_details
        return None
|
317
|
+
|
318
|
+
|
319
|
+
def make_h2o_ml_agent(
|
320
|
+
model,
|
321
|
+
n_samples=30,
|
322
|
+
log=False,
|
323
|
+
log_path=None,
|
324
|
+
file_name="h2o_automl.py",
|
325
|
+
function_name="h2o_automl",
|
326
|
+
model_directory=None,
|
327
|
+
overwrite=True,
|
328
|
+
human_in_the_loop=False,
|
329
|
+
bypass_recommended_steps=False,
|
330
|
+
bypass_explain_code=False,
|
331
|
+
):
|
332
|
+
"""
|
333
|
+
Creates a machine learning agent that uses H2O for AutoML.
|
334
|
+
The agent will:
|
335
|
+
1. Optionally recommend ML steps,
|
336
|
+
2. Creates Python code that sets up H2OAutoML,
|
337
|
+
3. Executes that code (optionally saving the best model to disk),
|
338
|
+
4. Fixes errors if needed,
|
339
|
+
5. Optionally explains the code.
|
340
|
+
|
341
|
+
model_directory: Directory to save the model.
|
342
|
+
If None, defaults to log_path.
|
343
|
+
If both are None, skip saving.
|
344
|
+
"""
|
345
|
+
|
346
|
+
llm = model
|
347
|
+
|
348
|
+
# Handle logging directory
|
349
|
+
if log:
|
350
|
+
if log_path is None:
|
351
|
+
log_path = "logs/"
|
352
|
+
if not os.path.exists(log_path):
|
353
|
+
os.makedirs(log_path)
|
354
|
+
|
355
|
+
# Check if H2O is installed
|
356
|
+
try:
|
357
|
+
import h2o
|
358
|
+
from h2o.automl import H2OAutoML
|
359
|
+
except ImportError as e:
|
360
|
+
raise ImportError(
|
361
|
+
"The 'h2o' library is not installed. Please install it using pip:\n\n"
|
362
|
+
" pip install h2o\n\n"
|
363
|
+
"Visit https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html for details."
|
364
|
+
) from e
|
365
|
+
|
366
|
+
# Define GraphState
|
367
|
+
class GraphState(TypedDict):
|
368
|
+
messages: Annotated[Sequence[BaseMessage], operator.add]
|
369
|
+
user_instructions: str
|
370
|
+
recommended_steps: str
|
371
|
+
data_raw: dict
|
372
|
+
leaderboard: dict
|
373
|
+
best_model_id: str
|
374
|
+
model_path: str
|
375
|
+
target_variable: str
|
376
|
+
all_datasets_summary: str
|
377
|
+
h2o_train_function: str
|
378
|
+
h2o_train_function_path: str
|
379
|
+
h2o_train_file_name: str
|
380
|
+
h2o_train_function_name: str
|
381
|
+
h2o_train_error: str
|
382
|
+
max_retries: int
|
383
|
+
retry_count: int
|
384
|
+
|
385
|
+
# 1) Recommend ML steps (optional)
|
386
|
+
def recommend_ml_steps(state: GraphState):
|
387
|
+
print(format_agent_name(AGENT_NAME))
|
388
|
+
print(" * RECOMMEND MACHINE LEARNING STEPS")
|
389
|
+
|
390
|
+
recommend_steps_prompt = PromptTemplate(
|
391
|
+
template="""
|
392
|
+
You are an AutoML Expert using H2O.
|
393
|
+
|
394
|
+
We have the following dataset summary, user instructions, and H2O AutoML documentation:
|
395
|
+
|
396
|
+
User instructions:
|
397
|
+
{user_instructions}
|
398
|
+
|
399
|
+
Data Summary:
|
400
|
+
{all_datasets_summary}
|
401
|
+
|
402
|
+
H2O AutoML Documentation:
|
403
|
+
{h2o_automl_documentation}
|
404
|
+
|
405
|
+
Please recommend a short list of steps or considerations for performing H2OAutoML on this data. Specifically focus on maximizing model accuracy while remaining flexible to user instructions and the dataset.
|
406
|
+
|
407
|
+
- Recommend any paramters and values that might improve performance (predictive accuracy).
|
408
|
+
- Recommend the Loss Function, Stopping Criteria, and other advanced parameters.
|
409
|
+
- Use the H2O AutoML documentation to your advantage.
|
410
|
+
- Exclude deep learning algorithms since these are typically low performance.
|
411
|
+
|
412
|
+
Avoid these:
|
413
|
+
|
414
|
+
- Do not perform data cleaning or feature engineering here. We will handle that separately.
|
415
|
+
- Do not limit memory size or CPU usage unless the user specifies it.
|
416
|
+
|
417
|
+
Return as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The H2O AutoML code will be generated separately by a Coding Agent.
|
418
|
+
""",
|
419
|
+
input_variables=["user_instructions", "all_datasets_summary", "h2o_automl_documentation"]
|
420
|
+
)
|
421
|
+
|
422
|
+
data_raw = state.get("data_raw")
|
423
|
+
df = pd.DataFrame.from_dict(data_raw)
|
424
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
425
|
+
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
426
|
+
|
427
|
+
steps_agent = recommend_steps_prompt | llm
|
428
|
+
recommended_steps = steps_agent.invoke({
|
429
|
+
"user_instructions": state.get("user_instructions"),
|
430
|
+
"all_datasets_summary": all_datasets_summary_str,
|
431
|
+
"h2o_automl_documentation": H2O_AUTOML_DOCUMENTATION
|
432
|
+
})
|
433
|
+
|
434
|
+
return {
|
435
|
+
"recommended_steps": format_recommended_steps(
|
436
|
+
recommended_steps.content.strip(),
|
437
|
+
heading="# Recommended ML Steps:"
|
438
|
+
),
|
439
|
+
"all_datasets_summary": all_datasets_summary_str
|
440
|
+
}
|
441
|
+
|
442
|
+
# 2) Create code
|
443
|
+
def create_h2o_code(state: GraphState):
|
444
|
+
if bypass_recommended_steps:
|
445
|
+
print(format_agent_name(AGENT_NAME))
|
446
|
+
|
447
|
+
data_raw = state.get("data_raw")
|
448
|
+
df = pd.DataFrame.from_dict(data_raw)
|
449
|
+
all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
|
450
|
+
all_datasets_summary_str = "\n\n".join(all_datasets_summary)
|
451
|
+
else:
|
452
|
+
all_datasets_summary_str = state.get("all_datasets_summary")
|
453
|
+
|
454
|
+
print(" * CREATE H2O AUTOML CODE")
|
455
|
+
|
456
|
+
code_prompt = PromptTemplate(
|
457
|
+
template="""
|
458
|
+
You are an H2O AutoML agent. Create a Python function named {function_name}(data_raw)
|
459
|
+
that runs H2OAutoML on the provided data with a focus on maximizing model accuracy and
|
460
|
+
incorporating user instructions for flexibility.
|
461
|
+
|
462
|
+
Do not perform substantial data cleaning or feature engineering here. We will handle that separately.
|
463
|
+
|
464
|
+
We have two variables for deciding where to save the model:
|
465
|
+
model_directory = {model_directory}
|
466
|
+
log_path = {log_path}
|
467
|
+
|
468
|
+
Logic:
|
469
|
+
1. If both model_directory and log_path are None, do NOT save the model (set model_path = None).
|
470
|
+
2. Otherwise, pick model_directory if it's not None, else pick log_path.
|
471
|
+
Then call `h2o.save_model(model=aml.leader, path=the_directory, force=True)` to save the model.
|
472
|
+
3. Return model_path as part of the final dictionary.
|
473
|
+
|
474
|
+
Additional Requirements:
|
475
|
+
- Convert `data_raw` (pandas DataFrame) into an H2OFrame.
|
476
|
+
- Identify the target variable from {target_variable} (if provided).
|
477
|
+
- Start H2O if not already started.
|
478
|
+
- Use Recommended Steps to guide any advanced parameters (e.g., cross-validation folds,
|
479
|
+
balancing classes, extended training time, stacking) that might improve performance.
|
480
|
+
- If the user does not specify anything special, use H2OAutoML defaults (including stacked ensembles).
|
481
|
+
- Focus on maximizing accuracy (or the most relevant metric if it's not classification)
|
482
|
+
while remaining flexible to user instructions.
|
483
|
+
- Return a dict with keys: leaderboard, best_model_id, and model_path.
|
484
|
+
|
485
|
+
Initial User Instructions (Disregard any instructions that are unrelated to modeling):
|
486
|
+
{user_instructions}
|
487
|
+
|
488
|
+
Recommended Steps:
|
489
|
+
{recommended_steps}
|
490
|
+
|
491
|
+
Data summary for reference:
|
492
|
+
{all_datasets_summary}
|
493
|
+
|
494
|
+
Return only code in ```python``` with a single function definition. Use this as an example starting template:
|
495
|
+
```python
|
496
|
+
def {function_name}(data_raw):
|
497
|
+
import h2o
|
498
|
+
from h2o.automl import H2OAutoML
|
499
|
+
import pandas as pd
|
500
|
+
|
501
|
+
# Initialize or connect to H2O if not already started
|
502
|
+
h2o.init()
|
503
|
+
|
504
|
+
# Convert the pandas DataFrame to an H2OFrame
|
505
|
+
data_h2o = h2o.H2OFrame(data_raw)
|
506
|
+
|
507
|
+
# Identify the target variable
|
508
|
+
target = {target_variable}
|
509
|
+
x = [col for col in data_h2o.columns if col != target]
|
510
|
+
|
511
|
+
# Example: Use advanced parameters if recommended (e.g., nfolds, max_runtime_secs, etc.)
|
512
|
+
# Adjust them based on user instructions and recommended steps:
|
513
|
+
aml = H2OAutoML(
|
514
|
+
max_runtime_secs=60, # default if no user instructions override
|
515
|
+
seed=42,
|
516
|
+
nfolds=5, # default if no user instructions override
|
517
|
+
# e.g., balance_classes=True, etc. if recommended
|
518
|
+
)
|
519
|
+
aml.train(x=x, y=target, training_frame=data_h2o)
|
520
|
+
|
521
|
+
# Determine model saving logic
|
522
|
+
if {model_directory} is None and {log_path} is None:
|
523
|
+
model_path = None
|
524
|
+
else:
|
525
|
+
path_to_save = {model_directory} if {model_directory} else {log_path}
|
526
|
+
model_path = h2o.save_model(model=aml.leader, path=path_to_save, force=True)
|
527
|
+
|
528
|
+
return dict(
|
529
|
+
leaderboard = h2o.automl.get_leaderboard(aml, extra_columns="ALL").as_data_frame().to_dict(),
|
530
|
+
best_model_id = aml.leader.model_id,
|
531
|
+
model_path = model_path,
|
532
|
+
)
|
533
|
+
```
|
534
|
+
""",
|
535
|
+
input_variables=[
|
536
|
+
"user_instructions",
|
537
|
+
"function_name",
|
538
|
+
"target_variable",
|
539
|
+
"recommended_steps",
|
540
|
+
"all_datasets_summary",
|
541
|
+
"model_directory",
|
542
|
+
"log_path"
|
543
|
+
]
|
544
|
+
)
|
545
|
+
|
546
|
+
recommended_steps = state.get("recommended_steps", "")
|
547
|
+
h2o_code_agent = code_prompt | llm | PythonOutputParser()
|
548
|
+
|
549
|
+
resp = h2o_code_agent.invoke({
|
550
|
+
"user_instructions": state.get("user_instructions"),
|
551
|
+
"function_name": function_name,
|
552
|
+
"target_variable": state.get("target_variable"),
|
553
|
+
"recommended_steps": recommended_steps,
|
554
|
+
"all_datasets_summary": all_datasets_summary_str,
|
555
|
+
"model_directory": model_directory,
|
556
|
+
"log_path": log_path
|
557
|
+
})
|
558
|
+
|
559
|
+
resp = relocate_imports_inside_function(resp)
|
560
|
+
resp = add_comments_to_top(resp, agent_name=AGENT_NAME)
|
561
|
+
|
562
|
+
# Log the code snippet if requested
|
563
|
+
file_path, f_name = log_ai_function(
|
564
|
+
response=resp,
|
565
|
+
file_name=file_name,
|
566
|
+
log=log,
|
567
|
+
log_path=log_path,
|
568
|
+
overwrite=overwrite
|
569
|
+
)
|
570
|
+
|
571
|
+
return {
|
572
|
+
"h2o_train_function": resp,
|
573
|
+
"h2o_train_function_path": file_path,
|
574
|
+
"h2o_train_file_name": f_name,
|
575
|
+
"h2o_train_function_name": function_name
|
576
|
+
}
|
577
|
+
|
578
|
+
# Human Review
|
579
|
+
prompt_text_human_review = "Are the following Machine Learning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"
|
580
|
+
|
581
|
+
if not bypass_explain_code:
|
582
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "explain_h2o_code"]]:
|
583
|
+
return node_func_human_review(
|
584
|
+
state=state,
|
585
|
+
prompt_text=prompt_text_human_review,
|
586
|
+
yes_goto= 'explain_h2o_code',
|
587
|
+
no_goto="recommend_ml_steps",
|
588
|
+
user_instructions_key="user_instructions",
|
589
|
+
recommended_steps_key="recommended_steps",
|
590
|
+
code_snippet_key="h2o_train_function",
|
591
|
+
)
|
592
|
+
else:
|
593
|
+
def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "__end__"]]:
|
594
|
+
return node_func_human_review(
|
595
|
+
state=state,
|
596
|
+
prompt_text=prompt_text_human_review,
|
597
|
+
yes_goto= '__end__',
|
598
|
+
no_goto="recommend_ml_steps",
|
599
|
+
user_instructions_key="user_instructions",
|
600
|
+
recommended_steps_key="recommended_steps",
|
601
|
+
code_snippet_key="h2o_train_function",
|
602
|
+
)
|
603
|
+
|
604
|
+
# 3) Execute code
|
605
|
+
def execute_h2o_code(state):
|
606
|
+
result = node_func_execute_agent_code_on_data(
|
607
|
+
state=state,
|
608
|
+
data_key="data_raw",
|
609
|
+
code_snippet_key="h2o_train_function",
|
610
|
+
result_key="h2o_train_result",
|
611
|
+
error_key="h2o_train_error",
|
612
|
+
agent_function_name=state.get("h2o_train_function_name"),
|
613
|
+
pre_processing=lambda data: pd.DataFrame.from_dict(data),
|
614
|
+
post_processing=lambda x: x,
|
615
|
+
error_message_prefix="Error occurred during H2O AutoML: "
|
616
|
+
)
|
617
|
+
|
618
|
+
# If no error, extract leaderboard, best_model_id, and model_path
|
619
|
+
if not result["h2o_train_error"]:
|
620
|
+
if result["h2o_train_result"] and isinstance(result["h2o_train_result"], dict):
|
621
|
+
lb = result["h2o_train_result"].get("leaderboard", {})
|
622
|
+
best_id = result["h2o_train_result"].get("best_model_id", None)
|
623
|
+
mpath = result["h2o_train_result"].get("model_path", None)
|
624
|
+
|
625
|
+
result["leaderboard"] = lb
|
626
|
+
result["best_model_id"] = best_id
|
627
|
+
result["model_path"] = mpath
|
628
|
+
|
629
|
+
return result
|
630
|
+
|
631
|
+
# 4) Fix code if there's an error
|
632
|
+
def fix_h2o_code(state: GraphState):
|
633
|
+
fix_prompt = """
|
634
|
+
You are an H2O AutoML agent. The function {function_name} currently has errors.
|
635
|
+
Please fix it. Return only the corrected function in ```python``` format.
|
636
|
+
|
637
|
+
Broken code:
|
638
|
+
{code_snippet}
|
639
|
+
|
640
|
+
Last Known Error:
|
641
|
+
{error}
|
642
|
+
"""
|
643
|
+
return node_func_fix_agent_code(
|
644
|
+
state=state,
|
645
|
+
code_snippet_key="h2o_train_function",
|
646
|
+
error_key="h2o_train_error",
|
647
|
+
llm=llm,
|
648
|
+
prompt_template=fix_prompt,
|
649
|
+
agent_name=AGENT_NAME,
|
650
|
+
file_path=state.get("h2o_train_function_path"),
|
651
|
+
function_name=state.get("h2o_train_function_name"),
|
652
|
+
log=log
|
653
|
+
)
|
654
|
+
|
655
|
+
# 5) Final reporting node
|
656
|
+
def report_agent_outputs(state: GraphState):
    """Build the final report of this agent's outputs.

    Delegates to ``node_func_report_agent_outputs``, asking it to include the
    state keys listed below and to store its result under ``messages``.
    ``AGENT_NAME`` comes from the surrounding scope and is used as the role.

    Args:
        state: Graph state passed through to the reporting helper.

    Returns:
        Whatever ``node_func_report_agent_outputs`` returns.
    """
    # State entries that should appear in the report, in this order.
    reported_keys = [
        "recommended_steps",
        "h2o_train_function",
        "h2o_train_function_path",
        "h2o_train_function_name",
        "h2o_train_error",
        "model_path",
        "best_model_id",
    ]
    return node_func_report_agent_outputs(
        state=state,
        keys_to_include=reported_keys,
        result_key="messages",
        role=AGENT_NAME,
        custom_title="H2O Machine Learning Agent Outputs"
    )
|
672
|
+
|
673
|
+
node_functions = {
|
674
|
+
"recommend_ml_steps": recommend_ml_steps,
|
675
|
+
"human_review": human_review,
|
676
|
+
"create_h2o_code": create_h2o_code,
|
677
|
+
"execute_h2o_code": execute_h2o_code,
|
678
|
+
"fix_h2o_code": fix_h2o_code,
|
679
|
+
"report_agent_outputs": report_agent_outputs,
|
680
|
+
}
|
681
|
+
|
682
|
+
app = create_coding_agent_graph(
|
683
|
+
GraphState=GraphState,
|
684
|
+
node_functions=node_functions,
|
685
|
+
recommended_steps_node_name="recommend_ml_steps",
|
686
|
+
create_code_node_name="create_h2o_code",
|
687
|
+
execute_code_node_name="execute_h2o_code",
|
688
|
+
fix_code_node_name="fix_h2o_code",
|
689
|
+
explain_code_node_name="report_agent_outputs",
|
690
|
+
error_key="h2o_train_error",
|
691
|
+
max_retries_key="max_retries",
|
692
|
+
retry_count_key="retry_count",
|
693
|
+
human_in_the_loop=human_in_the_loop,
|
694
|
+
human_review_node_name="human_review",
|
695
|
+
checkpointer=MemorySaver(),
|
696
|
+
bypass_recommended_steps=bypass_recommended_steps,
|
697
|
+
bypass_explain_code=bypass_explain_code,
|
698
|
+
)
|
699
|
+
|
700
|
+
return app
|
701
|
+
|
702
|
+
H2O_AUTOML_DOCUMENTATION = """
|
703
|
+
Title: H2O AutoML: Automatic Machine Learning
|
704
|
+
Source: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
|
705
|
+
|
706
|
+
AutoML Interface
|
707
|
+
The H2O AutoML interface is designed to have as few parameters as possible so that all the user needs to do is point to their dataset, identify the response column and optionally specify a time constraint or limit on the number of total models trained. Below are the parameters that can be set by the user in the R and Python interfaces. See the Web UI via H2O Wave section below for information on how to use the H2O Wave web interface for AutoML.
|
708
|
+
|
709
|
+
In both the R and Python API, AutoML uses the same data-related arguments, x, y, training_frame, validation_frame, as the other H2O algorithms. Most of the time, all you'll need to do is specify the data arguments. You can then configure values for max_runtime_secs and/or max_models to set explicit time or number-of-model limits on your run.
|
710
|
+
|
711
|
+
Required Parameters
|
712
|
+
Required Data Parameters
|
713
|
+
y: This argument is the name (or index) of the response column.
|
714
|
+
|
715
|
+
training_frame: Specifies the training set.
|
716
|
+
|
717
|
+
Required Stopping Parameters
|
718
|
+
One of the following stopping strategies (time or number-of-model based) must be specified. When both options are set, then the AutoML run will stop as soon as it hits either of these limits.
|
719
|
+
|
720
|
+
max_runtime_secs: This argument specifies the maximum time that the AutoML process will run for. The default is 0 (no limit), but dynamically sets to 1 hour if none of max_runtime_secs and max_models are specified by the user.
|
721
|
+
|
722
|
+
max_models: Specify the maximum number of models to build in an AutoML run, excluding the Stacked Ensemble models. Defaults to NULL/None. Always set this parameter to ensure AutoML reproducibility: all models are then trained until convergence and none is constrained by a time budget.
|
723
|
+
|
724
|
+
Optional Parameters
|
725
|
+
Optional Data Parameters
|
726
|
+
x: A list/vector of predictor column names or indexes. This argument only needs to be specified if the user wants to exclude columns from the set of predictors. If all columns (other than the response) should be used in prediction, then this does not need to be set.
|
727
|
+
|
728
|
+
validation_frame: This argument is ignored unless nfolds == 0, in which a validation frame can be specified and used for early stopping of individual models and early stopping of the grid searches (unless max_models or max_runtime_secs overrides metric-based early stopping). By default and when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
|
729
|
+
|
730
|
+
leaderboard_frame: This argument allows the user to specify a particular data frame to use to score and rank models on the leaderboard. This frame will not be used for anything besides leaderboard scoring. If a leaderboard frame is not specified by the user, then the leaderboard will use cross-validation metrics instead, or if cross-validation is turned off by setting nfolds = 0, then a leaderboard frame will be generated automatically from the training frame.
|
731
|
+
|
732
|
+
blending_frame: Specifies a frame to be used for computing the predictions that serve as the training frame for the Stacked Ensemble models metalearner. If provided, all Stacked Ensembles produced by AutoML will be trained using Blending (a.k.a. Holdout Stacking) instead of the default Stacking method based on cross-validation.
|
733
|
+
|
734
|
+
fold_column: Specifies a column with cross-validation fold index assignment per observation. This is used to override the default, randomized, 5-fold cross-validation scheme for individual models in the AutoML run.
|
735
|
+
|
736
|
+
weights_column: Specifies a column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative weights are not allowed.
|
737
|
+
|
738
|
+
Optional Miscellaneous Parameters
|
739
|
+
nfolds: Specify a value >= 2 for the number of folds for k-fold cross-validation of the models in the AutoML run or specify “-1” to let AutoML choose if k-fold cross-validation or blending mode should be used. Blending mode will use part of training_frame (if no blending_frame is provided) to train Stacked Ensembles. Use 0 to disable cross-validation; this will also disable Stacked Ensembles (thus decreasing the overall best model performance). This value defaults to “-1”.
|
740
|
+
|
741
|
+
balance_classes: Specify whether to oversample the minority classes to balance the class distribution. This option is not enabled by default and can increase the data frame size. This option is only applicable for classification. If the oversampled size of the dataset exceeds the maximum size calculated using the max_after_balance_size parameter, then the majority classes will be undersampled to satisfy the size limit.
|
742
|
+
|
743
|
+
class_sampling_factors: Specify the per-class (in lexicographical order) over/under-sampling ratios. By default, these ratios are automatically computed during training to obtain the class balance. Note that this requires balance_classes set to True.
|
744
|
+
|
745
|
+
max_after_balance_size: Specify the maximum relative size of the training data after balancing class counts (balance_classes must be enabled). Defaults to 5.0. (The value can be less than 1.0).
|
746
|
+
|
747
|
+
max_runtime_secs_per_model: Specify the max amount of time dedicated to the training of each individual model in the AutoML run. Defaults to 0 (disabled). Note that models constrained by a time budget are not guaranteed reproducible.
|
748
|
+
|
749
|
+
stopping_metric: Specify the metric to use for early stopping. Defaults to AUTO. The available options are:
|
750
|
+
|
751
|
+
- AUTO: This defaults to logloss for classification and deviance for regression.
|
752
|
+
- deviance (mean residual deviance)
|
753
|
+
- logloss
|
754
|
+
- MSE
|
755
|
+
- RMSE
|
756
|
+
- MAE
|
757
|
+
- RMSLE
|
758
|
+
- AUC (area under the ROC curve)
|
759
|
+
- AUCPR (area under the Precision-Recall curve)
|
760
|
+
- lift_top_group
|
761
|
+
- misclassification
|
762
|
+
- mean_per_class_error
|
763
|
+
|
764
|
+
stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping criterion to stop a grid search and the training of individual models within the AutoML run. This value defaults to 0.001 if the dataset is at least 1 million rows; otherwise it defaults to a bigger value determined by the size of the dataset and the non-NA-rate. In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
|
765
|
+
|
766
|
+
stopping_rounds: This argument is used to stop model training when the stopping metric (e.g. AUC) doesn't improve for this specified number of training rounds, based on a simple moving average. In the context of AutoML, this controls early stopping both within the random grid searches as well as the individual models. Defaults to 3 and must be a non-negative integer. To disable early stopping altogether, set this to 0.
|
767
|
+
|
768
|
+
sort_metric: Specifies the metric used to sort the Leaderboard by at the end of an AutoML run. Available options include:
|
769
|
+
|
770
|
+
- AUTO: This defaults to AUC for binary classification, mean_per_class_error for multinomial classification, and deviance for regression.
|
771
|
+
- deviance (mean residual deviance)
|
772
|
+
- logloss
|
773
|
+
- MSE
|
774
|
+
- RMSE
|
775
|
+
- MAE
|
776
|
+
- RMSLE
|
777
|
+
- AUC (area under the ROC curve)
|
778
|
+
- AUCPR (area under the Precision-Recall curve)
|
779
|
+
- mean_per_class_error
|
780
|
+
|
781
|
+
seed: Integer. Set a seed for reproducibility. AutoML can only guarantee reproducibility under certain conditions. H2O Deep Learning models are not reproducible by default for performance reasons, so if the user requires reproducibility, then exclude_algos must contain "DeepLearning". In addition max_models must be used because max_runtime_secs is resource limited, meaning that if the available compute resources are not the same between runs, AutoML may be able to train more models on one run vs another. Defaults to NULL/None.
|
782
|
+
|
783
|
+
project_name: Character string to identify an AutoML project. Defaults to NULL/None, which means a project name will be auto-generated based on the training frame ID. More models can be trained and added to an existing AutoML project by specifying the same project name in multiple calls to the AutoML function (as long as the same training frame is used in subsequent runs).
|
784
|
+
|
785
|
+
exclude_algos: A list/vector of character strings naming the algorithms to skip during the model-building phase. An example use is exclude_algos = ["GLM", "DeepLearning", "DRF"] in Python or exclude_algos = c("GLM", "DeepLearning", "DRF") in R. Defaults to None/NULL, which means that all appropriate H2O algorithms will be used if the search stopping criteria allows and if the include_algos option is not specified. This option is mutually exclusive with include_algos. See include_algos below for the list of available options.
|
786
|
+
|
787
|
+
include_algos: A list/vector of character strings naming the algorithms to include during the model-building phase. An example use is include_algos = ["GLM", "DeepLearning", "DRF"] in Python or include_algos = c("GLM", "DeepLearning", "DRF") in R. Defaults to None/NULL, which means that all appropriate H2O algorithms will be used if the search stopping criteria allows and if no algorithms are specified in exclude_algos. This option is mutually exclusive with exclude_algos. The available algorithms are:
|
788
|
+
|
789
|
+
- DRF (This includes both the Distributed Random Forest (DRF) and Extremely Randomized Trees (XRT) models. Refer to the Extremely Randomized Trees section in the DRF chapter and the histogram_type parameter description for more information.)
|
790
|
+
- GLM (Generalized Linear Model with regularization)
|
791
|
+
- XGBoost (XGBoost GBM)
|
792
|
+
- GBM (H2O GBM)
|
793
|
+
- DeepLearning (Fully-connected multi-layer artificial neural network)
|
794
|
+
- StackedEnsemble (Stacked Ensembles, includes an ensemble of all the base models and ensembles using subsets of the base models)
|
795
|
+
|
796
|
+
modeling_plan: The list of modeling steps to be used by the AutoML engine. (They may not all get executed, depending on other constraints.)
|
797
|
+
|
798
|
+
preprocessing: The list of preprocessing steps to run. Only ["target_encoding"] is currently supported. There is more information about how Target Encoding is automatically applied here. Experimental.
|
799
|
+
|
800
|
+
exploitation_ratio: Specify the budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase. By default, the exploitation phase is disabled (exploitation_ratio=0) as this is still experimental; to activate it, it is recommended to try a ratio around 0.1. Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration. Experimental.
|
801
|
+
|
802
|
+
monotone_constraints: A mapping that represents monotonic constraints. Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.
|
803
|
+
|
804
|
+
keep_cross_validation_predictions: Specify whether to keep the cross-validation predictions. This needs to be set to TRUE if running the same AutoML object for repeated runs because CV predictions are required to build additional Stacked Ensemble models in AutoML. This option defaults to FALSE.
|
805
|
+
|
806
|
+
keep_cross_validation_models: Specify whether to keep the cross-validated models. Keeping cross-validation models may consume significantly more memory in the H2O cluster. This option defaults to FALSE.
|
807
|
+
|
808
|
+
keep_cross_validation_fold_assignment: Enable this option to preserve the cross-validation fold assignment. Defaults to FALSE.
|
809
|
+
|
810
|
+
verbosity: (Optional: Python and R only) The verbosity of the backend messages printed during training. Must be one of "debug", "info", "warn". Defaults to NULL/None (client logging disabled).
|
811
|
+
|
812
|
+
export_checkpoints_dir: Specify a directory to which generated models will automatically be exported.
|
813
|
+
|
814
|
+
Notes
|
815
|
+
Validation Options
|
816
|
+
If the user turns off cross-validation by setting nfolds == 0, then cross-validation metrics will not be available to populate the leaderboard. In this case, we need to make sure there is a holdout frame (i.e. the “leaderboard frame”) to score the models on so that we can generate model performance metrics for the leaderboard. Without cross-validation, we will also require a validation frame to be used for early stopping on the models. Therefore, if either of these frames are not provided by the user, they will be automatically partitioned from the training data. If either frame is missing, 10% of the training data will be used to create a missing frame (if both are missing then a total of 20% of the training data will be used to create a 10% validation and 10% leaderboard frame).
|
817
|
+
|
818
|
+
XGBoost Memory Requirements
|
819
|
+
XGBoost, which is included in H2O as a third party library, requires its own memory outside the H2O (Java) cluster. When running AutoML with XGBoost (it is included by default), be sure you allow H2O no more than 2/3 of the total available RAM. Example: If you have 60G RAM, use h2o.init(max_mem_size = "40G"), leaving 20G for XGBoost.
|
820
|
+
|
821
|
+
Scikit-learn Compatibility
|
822
|
+
H2OAutoML can interact with the h2o.sklearn module. The h2o.sklearn module exposes 2 wrappers for H2OAutoML (H2OAutoMLClassifier and H2OAutoMLRegressor), which expose the standard API familiar to sklearn users: fit, predict, fit_predict, score, get_params, and set_params. It accepts various formats as input data (H2OFrame, numpy array, pandas Dataframe) which allows them to be combined with pure sklearn components in pipelines. For an example using H2OAutoML with the h2o.sklearn module, click here.
|
823
|
+
|
824
|
+
Explainability
|
825
|
+
AutoML objects are fully supported through the H2O Model Explainability interface. A large number of multi-model comparison and single model (AutoML leader) plots can be generated automatically with a single call to h2o.explain(). We invite you to learn more at the page linked above.
|
826
|
+
|
827
|
+
Code Examples
|
828
|
+
|
829
|
+
Training
|
830
|
+
Here’s an example showing basic usage of the h2o.automl() function in R and the H2OAutoML class in Python. For demonstration purposes only, we explicitly specify the x argument, even though on this dataset, that’s not required. With this dataset, the set of predictors is all columns other than the response. Like other H2O algorithms, the default value of x is “all columns, excluding y”, so that will produce the same result.
|
831
|
+
|
832
|
+
``` python
|
833
|
+
import h2o
|
834
|
+
from h2o.automl import H2OAutoML
|
835
|
+
|
836
|
+
# Start the H2O cluster (locally)
|
837
|
+
h2o.init()
|
838
|
+
|
839
|
+
# Import a sample binary outcome train/test set into H2O
|
840
|
+
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
|
841
|
+
test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
|
842
|
+
|
843
|
+
# Identify predictors and response
|
844
|
+
x = train.columns
|
845
|
+
y = "response"
|
846
|
+
x.remove(y)
|
847
|
+
|
848
|
+
# For binary classification, response should be a factor
|
849
|
+
train[y] = train[y].asfactor()
|
850
|
+
test[y] = test[y].asfactor()
|
851
|
+
|
852
|
+
# Run AutoML for 20 base models
|
853
|
+
aml = H2OAutoML(max_models=20, seed=1)
|
854
|
+
aml.train(x=x, y=y, training_frame=train)
|
855
|
+
|
856
|
+
# View the AutoML Leaderboard
|
857
|
+
lb = aml.leaderboard
|
858
|
+
lb.head(rows=lb.nrows) # Print all rows instead of default (10 rows)
|
859
|
+
|
860
|
+
# model_id auc logloss mean_per_class_error rmse mse
|
861
|
+
# --------------------------------------------------- -------- --------- ---------------------- -------- --------
|
862
|
+
# StackedEnsemble_AllModels_AutoML_20181212_105540 0.789801 0.551109 0.333174 0.43211 0.186719
|
863
|
+
# StackedEnsemble_BestOfFamily_AutoML_20181212_105540 0.788425 0.552145 0.323192 0.432625 0.187165
|
864
|
+
# XGBoost_1_AutoML_20181212_105540 0.784651 0.55753 0.325471 0.434949 0.189181
|
865
|
+
# XGBoost_grid_1_AutoML_20181212_105540_model_4 0.783523 0.557854 0.318819 0.435249 0.189441
|
866
|
+
# XGBoost_grid_1_AutoML_20181212_105540_model_3 0.783004 0.559613 0.325081 0.435708 0.189841
|
867
|
+
# XGBoost_2_AutoML_20181212_105540 0.78136 0.55888 0.347074 0.435907 0.190015
|
868
|
+
# XGBoost_3_AutoML_20181212_105540 0.780847 0.559589 0.330739 0.43613 0.190209
|
869
|
+
# GBM_5_AutoML_20181212_105540 0.780837 0.559903 0.340848 0.436191 0.190263
|
870
|
+
# GBM_2_AutoML_20181212_105540 0.780036 0.559806 0.339926 0.436415 0.190458
|
871
|
+
# GBM_1_AutoML_20181212_105540 0.779827 0.560857 0.335096 0.436616 0.190633
|
872
|
+
# GBM_3_AutoML_20181212_105540 0.778669 0.56179 0.325538 0.437189 0.191134
|
873
|
+
# XGBoost_grid_1_AutoML_20181212_105540_model_2 0.774411 0.575017 0.322811 0.4427 0.195984
|
874
|
+
# GBM_4_AutoML_20181212_105540 0.771426 0.569712 0.33742 0.44107 0.194543
|
875
|
+
# GBM_grid_1_AutoML_20181212_105540_model_1 0.769752 0.572583 0.344331 0.442452 0.195764
|
876
|
+
# GBM_grid_1_AutoML_20181212_105540_model_2 0.754366 0.918567 0.355855 0.496638 0.246649
|
877
|
+
# DRF_1_AutoML_20181212_105540 0.742892 0.595883 0.355403 0.452774 0.205004
|
878
|
+
# XRT_1_AutoML_20181212_105540 0.742091 0.599346 0.356583 0.453117 0.205315
|
879
|
+
# DeepLearning_grid_1_AutoML_20181212_105540_model_2 0.741795 0.601497 0.368291 0.454904 0.206937
|
880
|
+
# XGBoost_grid_1_AutoML_20181212_105540_model_1 0.693554 0.620702 0.40588 0.465791 0.216961
|
881
|
+
# DeepLearning_1_AutoML_20181212_105540 0.69137 0.637954 0.409351 0.47178 0.222576
|
882
|
+
# DeepLearning_grid_1_AutoML_20181212_105540_model_1 0.690084 0.661794 0.418469 0.476635 0.227181
|
883
|
+
# GLM_grid_1_AutoML_20181212_105540_model_1 0.682648 0.63852 0.397234 0.472683 0.223429
|
884
|
+
#
|
885
|
+
# [22 rows x 6 columns]
|
886
|
+
|
887
|
+
# The leader model is stored here
|
888
|
+
aml.leader
|
889
|
+
```
|
890
|
+
|
891
|
+
Prediction
|
892
|
+
Using the predict() function with AutoML generates predictions on the leader model from the run. The order of the rows in the results is the same as the order in which the data was loaded, even if some rows fail (for example, due to missing values or unseen factor levels).
|
893
|
+
|
894
|
+
``` python
|
895
|
+
# To generate predictions on a test set, you can make predictions
|
896
|
+
# directly on the `H2OAutoML` object or on the leader model
|
897
|
+
# object directly
|
898
|
+
preds = aml.predict(test)
|
899
|
+
|
900
|
+
# or:
|
901
|
+
preds = aml.leader.predict(test)
|
902
|
+
```
|
903
|
+
|
904
|
+
AutoML Output
|
905
|
+
|
906
|
+
Leaderboard
|
907
|
+
The AutoML object includes a “leaderboard” of models that were trained in the process, including the 5-fold cross-validated model performance (by default). The number of folds used in the model evaluation process can be adjusted using the nfolds parameter. If you would like to score the models on a specific dataset, you can specify the leaderboard_frame argument in the AutoML run, and then the leaderboard will show scores on that dataset instead.
|
908
|
+
|
909
|
+
The models are ranked by a default metric based on the problem type (the second column of the leaderboard). In binary classification problems, that metric is AUC, and in multiclass classification problems, the metric is mean per-class error. In regression problems, the default sort metric is RMSE. Some additional metrics are also provided, for convenience.
|
910
|
+
|
911
|
+
To help users assess the complexity of AutoML models, the h2o.get_leaderboard function has been expanded by allowing an extra_columns parameter. This parameter allows you to specify which (if any) optional columns should be added to the leaderboard. This defaults to None. Allowed options include:
|
912
|
+
|
913
|
+
- training_time_ms: A column providing the training time of each model in milliseconds. (Note that this doesn't include the training of cross validation models.)
|
914
|
+
|
915
|
+
- predict_time_per_row_ms: A column providing the average prediction time by the model for a single row.
|
916
|
+
|
917
|
+
- ALL: Adds columns for both training_time_ms and predict_time_per_row_ms.
|
918
|
+
|
919
|
+
``` python
|
920
|
+
# Get leaderboard with all possible columns
|
921
|
+
lb = h2o.automl.get_leaderboard(aml, extra_columns = "ALL")
|
922
|
+
lb
|
923
|
+
```
|
924
|
+
|
925
|
+
Examine Models
|
926
|
+
To examine the trained models more closely, you can interact with the models, either by model ID, or a convenience function which can grab the best model of each model type (ranked by the default metric, or a metric of your choosing).
|
927
|
+
|
928
|
+
``` python
|
929
|
+
# Get the best model using the metric
|
930
|
+
m = aml.leader
|
931
|
+
# this is equivalent to
|
932
|
+
m = aml.get_best_model()
|
933
|
+
|
934
|
+
# Get the best model using a non-default metric
|
935
|
+
m = aml.get_best_model(criterion="logloss")
|
936
|
+
|
937
|
+
# Get the best XGBoost model using default sort metric
|
938
|
+
xgb = aml.get_best_model(algorithm="xgboost")
|
939
|
+
|
940
|
+
# Get the best XGBoost model, ranked by logloss
|
941
|
+
xgb = aml.get_best_model(algorithm="xgboost", criterion="logloss")
|
942
|
+
```
|
943
|
+
|
944
|
+
Get a specific model by model ID:
|
945
|
+
|
946
|
+
``` python
|
947
|
+
# Get a specific model by model ID
|
948
|
+
m = h2o.get_model("StackedEnsemble_BestOfFamily_AutoML_20191213_174603")
|
949
|
+
```
|
950
|
+
|
951
|
+
Once you have retrieved the model in R or Python, you can inspect the model parameters as follows:
|
952
|
+
|
953
|
+
``` python
|
954
|
+
# View the parameters for the XGBoost model selected above
|
955
|
+
xgb.params.keys()
|
956
|
+
|
957
|
+
# Inspect individual parameter values
|
958
|
+
xgb.params['ntrees']
|
959
|
+
```
|
960
|
+
|
961
|
+
AutoML Log
|
962
|
+
When using Python or R clients, you can also access meta information with the following AutoML object properties:
|
963
|
+
|
964
|
+
- event_log: an H2OFrame with selected AutoML backend events generated during training.
|
965
|
+
|
966
|
+
- training_info: a dictionary exposing data that could be useful for post-analysis (e.g. various timings). If you want training and prediction times for each model, it's easier to explore that data in the extended leaderboard using the h2o.get_leaderboard() function.
|
967
|
+
|
968
|
+
``` python
|
969
|
+
# Get AutoML event log
|
970
|
+
log = aml.event_log
|
971
|
+
|
972
|
+
# Get training timing info
|
973
|
+
info = aml.training_info
|
974
|
+
```
|
975
|
+
|
976
|
+
Experimental Features
|
977
|
+
|
978
|
+
Preprocessing
|
979
|
+
As of H2O 3.32.0.1, AutoML now has a preprocessing option with minimal support for automated Target Encoding of high cardinality categorical variables. The only currently supported option is preprocessing = ["target_encoding"]: we automatically tune a Target Encoder model and apply it to columns that meet certain cardinality requirements for the tree-based algorithms (XGBoost, H2O GBM and Random Forest).
|
980
|
+
|
981
|
+
FAQ
|
982
|
+
|
983
|
+
1. Which models are trained in the AutoML process?
|
984
|
+
|
985
|
+
The current version of AutoML trains and cross-validates the following algorithms: three pre-specified XGBoost GBM (Gradient Boosting Machine) models, a fixed grid of GLMs, a default Random Forest (DRF), five pre-specified H2O GBMs, a near-default Deep Neural Net, an Extremely Randomized Forest (XRT), a random grid of XGBoost GBMs, a random grid of H2O GBMs, and a random grid of Deep Neural Nets. In some cases, there will not be enough time to complete all the algorithms, so some may be missing from the leaderboard. In other cases, the grids will stop early, and if there's time left, the top two random grids will be restarted to train more models. AutoML trains multiple Stacked Ensemble models throughout the process (more info about the ensembles below).
|
986
|
+
|
987
|
+
Particular algorithms (or groups of algorithms) can be switched off using the exclude_algos argument. This is useful if you already have some idea of the algorithms that will do well on your dataset, though sometimes this can lead to a loss of performance because having more diversity among the set of models generally increases the performance of the Stacked Ensembles. As a first step you could leave all the algorithms on, and examine their performance characteristics (e.g. prediction speed) to get a sense of what might be practically useful in your specific use-case, and then turn off algorithms that are not interesting or useful to you. We recommend using the H2O Model Explainability interface to explore and further evaluate your AutoML models, which can inform your choice of model (if you have other goals beyond simply maximizing model accuracy).
|
988
|
+
|
989
|
+
A list of the hyperparameters searched over for each algorithm in the AutoML process is included in the appendix below. More details about the hyperparameter ranges for the models in addition to the hard-coded models will be added to the appendix at a later date.
|
990
|
+
|
991
|
+
AutoML trains several Stacked Ensemble models during the run (unless ensembles are turned off using exclude_algos). We have subdivided the model training in AutoML into “model groups” with different priority levels. After each group is completed, and at the very end of the AutoML process, we train (at most) two additional Stacked Ensembles with the existing models. There are currently two types of Stacked Ensembles: one which includes all the base models (“All Models”), and one comprised only of the best model from each algorithm family (“Best of Family”). The Best of Family ensembles are more optimized for production use since it only contains six (or fewer) base models. It should be relatively fast to use in production (to generate predictions on new data) without much degradation in model performance when compared to the final “All Models” ensemble, for example. This may be useful if you want the model performance boost from ensembling without the added time or complexity of a large ensemble. You can also inspect some of the earlier “All Models” Stacked Ensembles that have fewer models as an alternative to the Best of Family ensembles. The metalearner used in all ensembles is a variant of the default Stacked Ensemble metalearner: a non-negative GLM with regularization (Lasso or Elastic net, chosen by CV) to encourage more sparse ensembles. The metalearner also uses a logit transform (on the base learner CV preds) for classification tasks before training.
|
992
|
+
|
993
|
+
For information about how previous versions of AutoML were different than the current one, there's a brief description here.
|
994
|
+
|
995
|
+
2. How do I save AutoML runs?
|
996
|
+
|
997
|
+
Rather than saving an AutoML object itself, currently, the best thing to do is to save the models you want to keep, individually. A utility for saving all of the models at once, along with a way to save the AutoML object (with leaderboard), will be added in a future release.
|
998
|
+
|
999
|
+
3. Can we make use of GPUs with AutoML?
|
1000
|
+
|
1001
|
+
XGBoost models in AutoML can make use of GPUs. Keep in mind that the following requirements must be met:
|
1002
|
+
|
1003
|
+
- NVIDIA GPUs (GPU Cloud, DGX Station, DGX-1, or DGX-2)
|
1004
|
+
- CUDA 8
|
1005
|
+
|
1006
|
+
You can monitor your GPU utilization via the nvidia-smi command. Refer to https://developer.nvidia.com/nvidia-system-management-interface for more information.
|
1007
|
+
|
1008
|
+
4. Why don't I see XGBoost models?
|
1009
|
+
|
1010
|
+
AutoML includes XGBoost GBMs (Gradient Boosting Machines) among its set of algorithms. This feature is currently provided with the following restrictions:
|
1011
|
+
|
1012
|
+
- XGBoost is not currently available on Windows machines. Follow here: https://github.com/h2oai/h2o-3/issues/7139 for updates.
|
1013
|
+
|
1014
|
+
- XGBoost is used only if it is available globally and if it hasn't been explicitly disabled. You can check if XGBoost is available by using the h2o.xgboost.available() in R or h2o.estimators.xgboost.H2OXGBoostEstimator.available() in Python.
|
1015
|
+
|
1016
|
+
5. Why doesn't AutoML use all the time that it's given?
|
1017
|
+
|
1018
|
+
If you're using 3.34.0.1 or later, AutoML should use all the time that it's given using max_runtime_secs. However, if you're using an earlier version, then early stopping was enabled by default and you can stop early. With early stopping, AutoML will stop once there's no longer “enough” incremental improvement. The user can tweak the early stopping parameters to be more or less sensitive. Set stopping_rounds higher if you want to slow down early stopping and let AutoML train more models before it stops.
|
1019
|
+
|
1020
|
+
6. Does AutoML support MOJOs?
|
1021
|
+
|
1022
|
+
AutoML will always produce a model which has a MOJO. Though it depends on the run, you are most likely to get a Stacked Ensemble. While all models are importable, only individual models are exportable.
|
1023
|
+
|
1024
|
+
7. Why doesn't AutoML use all the time that it's given?
|
1025
|
+
|
1026
|
+
If you're using 3.34.0.1 or later, AutoML should use all the time that it's given using max_runtime_secs. However, if you're using an earlier version, then early stopping was enabled by default and you can stop early. With early stopping, AutoML will stop once there's no longer “enough” incremental improvement. The user can tweak the early stopping parameters to be more or less sensitive. Set stopping_rounds higher if you want to slow down early stopping and let AutoML train more models before it stops.
|
1027
|
+
|
1028
|
+
8. What is the history of H2O AutoML?
|
1029
|
+
|
1030
|
+
The H2O AutoML algorithm was first released in H2O 3.12.0.1 on June 6, 2017 by Erin LeDell, and is based on research from her PhD thesis. New features and performance improvements have been made in every major version of H2O since the initial release.
|
1031
|
+
|
1032
|
+
"""
|