ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +0 -1
  3. ai_data_science_team/agents/data_cleaning_agent.py +50 -39
  4. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +45 -50
  6. ai_data_science_team/agents/data_wrangling_agent.py +50 -49
  7. ai_data_science_team/agents/feature_engineering_agent.py +48 -67
  8. ai_data_science_team/agents/sql_database_agent.py +130 -76
  9. ai_data_science_team/ml_agents/__init__.py +2 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
  11. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  12. ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
  13. ai_data_science_team/parsers/__init__.py +0 -0
  14. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  15. ai_data_science_team/templates/__init__.py +1 -0
  16. ai_data_science_team/templates/agent_templates.py +78 -7
  17. ai_data_science_team/tools/data_loader.py +378 -0
  18. ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/sql.py +126 -0
  22. ai_data_science_team/{tools → utils}/regex.py +59 -1
  23. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
  24. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  25. ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
  26. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  27. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,852 @@
1
+ # BUSINESS SCIENCE UNIVERSITY
2
+ # AI DATA SCIENCE TEAM
3
+ # ***
4
+ # * Agents: H2O Machine Learning Agent
5
+
6
+ import os
7
+ import json
8
+ from typing import TypedDict, Annotated, Sequence, Literal
9
+ import operator
10
+
11
+ import pandas as pd
12
+ from IPython.display import Markdown
13
+
14
+ from langchain.prompts import PromptTemplate
15
+ from langchain_core.messages import BaseMessage
16
+
17
+ from langgraph.types import Command
18
+ from langgraph.checkpoint.memory import MemorySaver
19
+
20
+ from ai_data_science_team.templates import(
21
+ node_func_execute_agent_code_on_data,
22
+ node_func_human_review,
23
+ node_func_fix_agent_code,
24
+ node_func_report_agent_outputs,
25
+ create_coding_agent_graph,
26
+ BaseAgent,
27
+ )
28
+ from ai_data_science_team.parsers.parsers import PythonOutputParser
29
+ from ai_data_science_team.utils.regex import (
30
+ relocate_imports_inside_function,
31
+ add_comments_to_top,
32
+ format_agent_name,
33
+ format_recommended_steps,
34
+ get_generic_summary,
35
+ )
36
+ from ai_data_science_team.tools.dataframe import get_dataframe_summary
37
+ from ai_data_science_team.utils.logging import log_ai_function
38
+ from ai_data_science_team.tools.h2o import H2O_AUTOML_DOCUMENTATION
39
+
40
+ AGENT_NAME = "h2o_ml_agent"
41
+ LOG_PATH = os.path.join(os.getcwd(), "logs/")
42
+
43
class H2OMLAgent(BaseAgent):
    """
    Machine Learning agent that generates and runs H2O AutoML training code.

    The agent wraps the graph built by ``make_h2o_ml_agent`` and stores the
    most recent graph output on ``self.response``. The code-generation prompt
    asks the generated function to save the best model to ``model_directory``
    (falling back to ``log_path``) and to skip saving when both are ``None``.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the ML code.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the Python file for saving the generated code.
        Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the function that performs the AutoML training.
        Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the H2O model. If None, ``log_path`` is used when
        available; if both are None, no model is saved. Defaults to None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of the code. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended steps prompt. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the code-explanation step. Defaults to False.
    enable_mlflow : bool, optional
        Whether to enable MLflow logging. If False, MLflow is skipped
        entirely. Defaults to False.
    mlflow_tracking_uri : str or None, optional
        If provided, sets the MLflow tracking URI at runtime.
    mlflow_experiment_name : str, optional
        Name of the MLflow experiment (created if it doesn't exist).
        Defaults to "H2O AutoML".
    mlflow_run_name : str or None, optional
        A custom name for the MLflow run. Defaults to None.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled graph.
    ainvoke_agent(data_raw, user_instructions=None, target_variable=None, ...)
        Asynchronously runs the agent on a DataFrame.
    invoke_agent(data_raw, user_instructions=None, target_variable=None, ...)
        Synchronously runs the agent on a DataFrame.
    get_leaderboard()
        Returns the H2O AutoML leaderboard as a DataFrame.
    get_best_model_id()
        Returns the best model ID from the response.
    get_model_path()
        Returns the saved model path from the response (None if not saved).
    get_data_raw()
        Returns the raw data from the response as a DataFrame.
    get_h2o_train_function(markdown=False)
        Returns the generated H2O AutoML function code.
    get_recommended_ml_steps(markdown=False)
        Returns the recommended ML steps.
    get_workflow_summary(markdown=False)
        Returns a summary built from the last message of the response.
    get_log_summary(markdown=False)
        Returns a summary of the logged function and model locations.
    get_response()
        Returns the entire response dictionary.
    show()
        Visualizes the compiled graph as a Mermaid diagram.

    Examples
    --------
    ```python
    from langchain_openai import ChatOpenAI
    import pandas as pd
    from ai_data_science_team.ml_agents import H2OMLAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    df = pd.read_csv("data/churn_data.csv")

    ml_agent = H2OMLAgent(
        model=llm,
        log=True,
        log_path="logs/",
        model_directory="h2o_models/",
    )

    ml_agent.invoke_agent(
        data_raw=df.drop(columns=["customerID"]),
        user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
        target_variable="Churn"
    )

    # Retrieve and display the leaderboard of models
    ml_agent.get_leaderboard()

    # Get the H2O training function in markdown format
    ml_agent.get_h2o_train_function(markdown=True)

    # Get the recommended machine learning steps in markdown format
    ml_agent.get_recommended_ml_steps(markdown=True)

    # Get a summary of the workflow in markdown format
    ml_agent.get_workflow_summary(markdown=True)

    # Get a summary of the logs in markdown format
    ml_agent.get_log_summary(markdown=True)

    # Get the path to the saved model
    model_path = ml_agent.get_model_path()
    model_path
    ```
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="h2o_automl.py",
        function_name="h2o_automl",
        model_directory=None,
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False,
        enable_mlflow=False,
        mlflow_tracking_uri=None,
        mlflow_experiment_name="H2O AutoML",
        mlflow_run_name=None,
    ):
        # Keep every constructor argument so the graph can be rebuilt later
        # by update_params() with the same keyword set.
        self._params = dict(
            model=model,
            n_samples=n_samples,
            log=log,
            log_path=log_path,
            file_name=file_name,
            function_name=function_name,
            model_directory=model_directory,
            overwrite=overwrite,
            human_in_the_loop=human_in_the_loop,
            bypass_recommended_steps=bypass_recommended_steps,
            bypass_explain_code=bypass_explain_code,
            enable_mlflow=enable_mlflow,
            mlflow_tracking_uri=mlflow_tracking_uri,
            mlflow_experiment_name=mlflow_experiment_name,
            mlflow_run_name=mlflow_run_name,
        )
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Build the agent graph from the stored parameters.

        Any previously stored response is discarded first, since it belongs
        to the graph being replaced.
        """
        self.response = None
        return make_h2o_ml_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Merge ``kwargs`` into the stored parameters and rebuild the graph.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str=None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously run the agent on ``data_raw``.

        The DataFrame is passed to the graph as a plain dict. The graph
        output is stored on ``self.response``; nothing is returned. Extra
        keyword arguments are forwarded to the graph's ``ainvoke``.
        """
        graph_inputs = {
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }
        self.response = await self._compiled_graph.ainvoke(graph_inputs, **kwargs)
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str=None,
        target_variable: str=None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously run the agent on ``data_raw``.

        The DataFrame is passed to the graph as a plain dict. The graph
        output is stored on ``self.response``; nothing is returned. Extra
        keyword arguments are forwarded to the graph's ``invoke``.
        """
        graph_inputs = {
            "user_instructions": user_instructions,
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count
        }
        self.response = self._compiled_graph.invoke(graph_inputs, **kwargs)
        return None

    def get_leaderboard(self):
        """Return the ``leaderboard`` entry of the response as a DataFrame, or None."""
        if not self.response or "leaderboard" not in self.response:
            return None
        return pd.DataFrame(self.response["leaderboard"])

    def get_best_model_id(self):
        """Return the ``best_model_id`` entry of the response, or None."""
        if not self.response or "best_model_id" not in self.response:
            return None
        return self.response["best_model_id"]

    def get_model_path(self):
        """Return the ``model_path`` entry of the response, or None if absent."""
        if not self.response or "model_path" not in self.response:
            return None
        return self.response["model_path"]

    def get_data_raw(self):
        """Return the ``data_raw`` entry of the response as a DataFrame, or None."""
        if not self.response or "data_raw" not in self.response:
            return None
        return pd.DataFrame(self.response["data_raw"])

    def get_h2o_train_function(self, markdown=False):
        """
        Return the generated training function source, or None.

        With ``markdown=True`` the code is wrapped in a fenced python block
        and returned as an IPython ``Markdown`` object.
        """
        if not self.response or "h2o_train_function" not in self.response:
            return None
        code = self.response["h2o_train_function"]
        return Markdown(f"```python\n{code}\n```") if markdown else code

    def get_recommended_ml_steps(self, markdown=False):
        """
        Return the recommended ML steps text, or None.

        With ``markdown=True`` the text is returned as an IPython
        ``Markdown`` object.
        """
        if not self.response or "recommended_steps" not in self.response:
            return None
        steps = self.response["recommended_steps"]
        return Markdown(steps) if markdown else steps

    def get_workflow_summary(self, markdown=False):
        """
        Return a summary built from the last message in the response.

        The last message's content is parsed as JSON and passed to
        ``get_generic_summary``. Returns None when there is no response or
        no messages.
        """
        if not (self.response and self.response.get("messages")):
            return None
        last_message = self.response.get("messages")[-1]
        summary = get_generic_summary(json.loads(last_message.content))
        return Markdown(summary) if markdown else summary

    def get_log_summary(self, markdown=False):
        """
        Return a text summary of where the generated function and model live.

        Returns None when there is no response or the response has no
        ``h2o_train_function_path``.
        """
        if not self.response:
            return None
        if not self.response.get('h2o_train_function_path'):
            return None
        # Body is left-aligned so a Markdown renderer does not treat it as
        # an indented code block.
        log_details = f"""
## H2O Machine Learning Agent Log Summary:

Function Path: {self.response.get('h2o_train_function_path')}

Function Name: {self.response.get('h2o_train_function_name')}

Best Model ID: {self.get_best_model_id()}

Model Path: {self.get_model_path()}
                """
        return Markdown(log_details) if markdown else log_details
335
+
336
+
337
def make_h2o_ml_agent(
    model,
    n_samples=30,
    log=False,
    log_path=None,
    file_name="h2o_automl.py",
    function_name="h2o_automl",
    model_directory=None,
    overwrite=True,
    human_in_the_loop=False,
    bypass_recommended_steps=False,
    bypass_explain_code=False,
    enable_mlflow=False,
    mlflow_tracking_uri=None,
    mlflow_experiment_name="H2O AutoML",
    mlflow_run_name=None,
):
    """
    Creates a machine learning agent that uses H2O for AutoML.

    The agent will:
      1. Optionally recommend ML steps,
      2. Create Python code that sets up H2OAutoML,
      3. Execute that code (optionally saving the best model to disk),
      4. Fix errors if needed,
      5. Optionally report the agent outputs.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used for the recommendation, code-generation and
        code-fixing prompts.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code. When True and ``log_path`` is
        None, "logs/" is used and the directory is created if missing.
    log_path : str or None, optional
        Directory for log files. Defaults to None.
    file_name : str, optional
        File name for the logged code. Defaults to "h2o_automl.py".
    function_name : str, optional
        Name the generated training function must have.
        Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the model. If None, the prompt tells the generated
        code to fall back to ``log_path``; if both are None, saving is
        skipped.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Whether to include the human review node. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended-steps node. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the final reporting node. Defaults to False.
    enable_mlflow : bool, optional
        Passed to the code-generation prompt to request MLflow logging.
    mlflow_tracking_uri : str or None, optional
        Passed to the code-generation prompt.
    mlflow_experiment_name : str, optional
        Passed to the code-generation prompt. Defaults to "H2O AutoML".
    mlflow_run_name : str or None, optional
        Passed to the code-generation prompt.

    Returns
    -------
    The object returned by ``create_coding_agent_graph`` (the agent graph).

    Raises
    ------
    ImportError
        If the ``h2o`` package is not installed.
    """

    llm = model

    # Handle logging directory
    if log:
        if log_path is None:
            log_path = "logs/"
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(log_path, exist_ok=True)

    # Check if H2O is installed (imports are only an availability probe)
    try:
        import h2o  # noqa: F401
        from h2o.automl import H2OAutoML  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "The 'h2o' library is not installed. Please install it using pip:\n\n"
            " pip install h2o\n\n"
            "Visit https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html for details."
        ) from e

    # Define GraphState
    class GraphState(TypedDict):
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        data_raw: dict
        leaderboard: dict
        best_model_id: str
        model_path: str
        model_results: dict
        target_variable: str
        all_datasets_summary: str
        h2o_train_function: str
        h2o_train_function_path: str
        h2o_train_file_name: str
        h2o_train_function_name: str
        h2o_train_error: str
        max_retries: int
        retry_count: int

    def summarize_dataset(state):
        """Return the text summary of ``data_raw`` used in both prompts."""
        df = pd.DataFrame.from_dict(state.get("data_raw"))
        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
        return "\n\n".join(all_datasets_summary)

    # 1) Recommend ML steps (optional)
    def recommend_ml_steps(state: GraphState):
        print(format_agent_name(AGENT_NAME))
        print(" * RECOMMEND MACHINE LEARNING STEPS")

        recommend_steps_prompt = PromptTemplate(
            template="""
            You are an AutoML Expert using H2O.

            We have the following dataset summary, user instructions, and H2O AutoML documentation:

            User instructions:
                {user_instructions}

            Data Summary:
                {all_datasets_summary}

            H2O AutoML Documentation:
                {h2o_automl_documentation}

            Please recommend a short list of steps or considerations for performing H2OAutoML on this data. Specifically focus on maximizing model accuracy while remaining flexible to user instructions and the dataset.

            - Recommend any parameters and values that might improve performance (predictive accuracy).
            - Recommend the Loss Function, Stopping Criteria, and other advanced parameters.
            - Use the H2O AutoML documentation to your advantage.
            - Exclude deep learning algorithms since these are typically low performance.

            Avoid these:

            - Do not perform data cleaning or feature engineering here. We will handle that separately.
            - Do not limit memory size or CPU usage unless the user specifies it.

            Return as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The H2O AutoML code will be generated separately by a Coding Agent.
            """,
            input_variables=["user_instructions", "all_datasets_summary", "h2o_automl_documentation"]
        )

        all_datasets_summary_str = summarize_dataset(state)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "all_datasets_summary": all_datasets_summary_str,
            "h2o_automl_documentation": H2O_AUTOML_DOCUMENTATION
        })

        return {
            "recommended_steps": format_recommended_steps(
                recommended_steps.content.strip(),
                heading="# Recommended ML Steps:"
            ),
            "all_datasets_summary": all_datasets_summary_str
        }

    # 2) Create code
    def create_h2o_code(state: GraphState):
        if bypass_recommended_steps:
            # The recommend node did not run, so the agent banner and the
            # dataset summary have not been produced yet.
            print(format_agent_name(AGENT_NAME))
            all_datasets_summary_str = summarize_dataset(state)
        else:
            all_datasets_summary_str = state.get("all_datasets_summary")

        print(" * CREATE H2O AUTOML CODE")

        code_prompt = PromptTemplate(
            template="""
            You are an H2O AutoML agent. Create a Python function named {function_name}(data_raw)
            that runs H2OAutoML on the provided data with a focus on maximizing model accuracy and
            incorporating user instructions for flexibility.

            Do not perform substantial data cleaning or feature engineering here. We will handle that separately.

            We have two variables for deciding where to save the model:
            model_directory = {model_directory}
            log_path = {log_path}

            IMPORTANT: MLflow Parameters if the user wants to enable MLflow with H2O AutoML:
                enable_mlflow: {enable_mlflow}
                mlflow_tracking_uri: {mlflow_tracking_uri}
                mlflow_experiment_name: {mlflow_experiment_name}
                mlflow_run_name: {mlflow_run_name}


            Additional Requirements:
            - Convert `data_raw` (pandas DataFrame) into an H2OFrame.
            - Identify the target variable from {target_variable} (if provided).
            - Start H2O if not already started.
            - Use Recommended Steps to guide any advanced parameters (e.g., cross-validation folds,
              balancing classes, extended training time, stacking) that might improve performance.
            - If the user does not specify anything special, use H2OAutoML defaults (including stacked ensembles).
            - Focus on maximizing accuracy (or the most relevant metric if it's not classification)
              while remaining flexible to user instructions.
            - Return a dict with keys: leaderboard, best_model_id, model_path, and model_results.
            - If enable_mlflow is True, log the top metrics and save the model as an artifact. (See example function)

            Initial User Instructions (Disregard any instructions that are unrelated to modeling):
                {user_instructions}

            Recommended Steps:
                {recommended_steps}

            Data summary for reference:
                {all_datasets_summary}

            Return only code in ```python``` with a single function definition. Use this as an example starting template:
            ```python
            def {function_name}(
                data_raw: List[Dict[str, Any]],
                target: str,
                max_runtime_secs: int,
                exclude_algos: List[str],
                balance_classes: bool,
                nfolds: int,
                seed: int,
                max_models: int,
                stopping_metric: str,
                stopping_tolerance: float,
                stopping_rounds: int,
                sort_metric: str ,
                model_directory: Optional[str],
                log_path: Optional[str],
                enable_mlflow: bool,
                mlflow_tracking_uri: Optional[str],
                mlflow_experiment_name: str,
                mlflow_run_name: str,
                **kwargs # Additional parameters for H2OAutoML (feel free to add these based on user instructions and recommended steps)
            ):

                import h2o
                from h2o.automl import H2OAutoML
                import pandas as pd
                import json

                # Optional MLflow usage
                if enable_mlflow:
                    import mlflow
                    if mlflow_tracking_uri:
                        mlflow.set_tracking_uri(mlflow_tracking_uri)
                    mlflow.set_experiment(mlflow_experiment_name)
                    run_context = mlflow.start_run(run_name=mlflow_run_name)
                else:
                    # Dummy context manager to skip MLflow if not enabled
                    from contextlib import nullcontext
                    run_context = nullcontext()

                exclude_algos = exclude_algos or ["DeepLearning"] # default if not provided

                # Convert data to DataFrame
                df = pd.DataFrame(data_raw)

                with run_context as run:
                    # If using MLflow, track run ID
                    run_id = None
                    if enable_mlflow and run is not None:
                        run_id = run.info.run_id
                        import mlflow


                    # Initialize H2O
                    h2o.init()

                    # Create H2OFrame
                    data_h2o = h2o.H2OFrame(df)

                    # Setup AutoML
                    aml = H2OAutoML(
                        max_runtime_secs=max_runtime_secs,
                        exclude_algos=exclude_algos,
                        balance_classes=balance_classes,
                        nfolds=nfolds,
                        seed=seed,
                        max_models=max_models,
                        stopping_metric=stopping_metric,
                        stopping_tolerance=stopping_tolerance,
                        stopping_rounds=stopping_rounds,
                        sort_metric=sort_metric,
                        **kwargs
                    )

                    # Train
                    x = [col for col in data_h2o.columns if col != target]
                    aml.train(x=x, y=target, training_frame=data_h2o)

                    # Save model if we have a directory/log path
                    if model_directory is None and log_path is None:
                        model_path = None
                    else:
                        path_to_save = model_directory if model_directory else log_path
                        model_path = h2o.save_model(model=aml.leader, path=path_to_save, force=True)

                    # Leaderboard (DataFrame -> dict)
                    leaderboard_df = pd.DataFrame(aml.leaderboard)
                    leaderboard_dict = leaderboard_df.to_dict()

                    # Gather top-model metrics from the first row
                    top_metrics = leaderboard_df.iloc[0].to_dict()

                    # Construct model_results
                    model_results = dict(
                        model_flavor= "H2O AutoML",
                        model_path= model_path,
                        best_model_id= aml.leader.model_id,
                        metrics= top_metrics # all metrics from the top row
                    )

                    # IMPORTANT: Log these to MLflow if enabled
                    if enable_mlflow and run is not None:

                        # Log the top metrics if numeric
                        numeric_metrics = {{k: v for k, v in top_metrics.items() if isinstance(v, (int, float))}}
                        mlflow.log_metrics(numeric_metrics)

                        # Log artifact if we saved the model
                        mlflow.h2o.log_model(aml.leader, artifact_path="model")

                        # Log the leaderboard
                        mlflow.log_table(leaderboard_dict, "leaderboard.json")

                        # Log these parameters (if specified)
                        mlflow.log_params(dict(
                            target= target,
                            max_runtime_secs= max_runtime_secs,
                            exclude_algos= str(exclude_algos),
                            balance_classes= balance_classes,
                            nfolds= nfolds,
                            seed= seed,
                            max_models= max_models,
                            stopping_metric= stopping_metric,
                            stopping_tolerance= stopping_tolerance,
                            stopping_rounds= stopping_rounds,
                            sort_metric= sort_metric,
                            model_directory= model_directory,
                            log_path= log_path
                        ))

                    # Build the output
                    output = dict(
                        leaderboard= leaderboard_dict,
                        best_model_id= aml.leader.model_id,
                        model_path= model_path,
                        model_results= model_results,
                        mlflow_run_id= run_id
                    )

                return output
            ```

            Avoid these errors:

            - WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.

            - 'list' object has no attribute 'tolist'

            - with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame() # Convert to pandas DataFrame using pd.DataFrame(h2o_df)

            - dtype is only supported for one column frames

            - h2o.is_running() module 'h2o' has no attribute 'is_running'. Solution: just do h2o.init() and it will check if H2O is running.


            """,
            input_variables=[
                "user_instructions",
                "function_name",
                "target_variable",
                "recommended_steps",
                "all_datasets_summary",
                "model_directory",
                "log_path",
                "enable_mlflow",
                "mlflow_tracking_uri",
                "mlflow_experiment_name",
                "mlflow_run_name",
            ]
        )

        recommended_steps = state.get("recommended_steps", "")
        h2o_code_agent = code_prompt | llm | PythonOutputParser()

        resp = h2o_code_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "function_name": function_name,
            "target_variable": state.get("target_variable"),
            "recommended_steps": recommended_steps,
            "all_datasets_summary": all_datasets_summary_str,
            "model_directory": model_directory,
            "log_path": log_path,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        })

        resp = relocate_imports_inside_function(resp)
        resp = add_comments_to_top(resp, agent_name=AGENT_NAME)

        # Log the code snippet if requested
        file_path, f_name = log_ai_function(
            response=resp,
            file_name=file_name,
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "h2o_train_function": resp,
            "h2o_train_function_path": file_path,
            "h2o_train_file_name": f_name,
            "h2o_train_function_name": function_name,
        }

    # Human Review
    prompt_text_human_review = "Are the following Machine Learning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"

    # Two definitions are kept because the destinations are spelled out in
    # each return annotation as Literal values.
    if not bypass_explain_code:
        # On approval, continue to the reporting node. It is registered
        # below as "report_agent_outputs"; no node named "explain_h2o_code"
        # exists in node_functions.
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "report_agent_outputs"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto="report_agent_outputs",
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )
    else:
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "__end__"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto="__end__",
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )

    # 3) Execute code
    def execute_h2o_code(state):
        result = node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            code_snippet_key="h2o_train_function",
            result_key="h2o_train_result",
            error_key="h2o_train_error",
            agent_function_name=state.get("h2o_train_function_name"),
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda x: x,
            error_message_prefix="Error occurred during H2O AutoML: "
        )

        # If no error, extract leaderboard, best_model_id, and model_path
        train_result = result.get("h2o_train_result")
        succeeded = not result.get("h2o_train_error")
        if succeeded and train_result and isinstance(train_result, dict):
            result["leaderboard"] = train_result.get("leaderboard", {})
            result["best_model_id"] = train_result.get("best_model_id", None)
            result["model_path"] = train_result.get("model_path", None)
            result["model_results"] = train_result.get("model_results", {})

        return result

    # 4) Fix code if there's an error
    def fix_h2o_code(state: GraphState):
        fix_prompt = """
        You are an H2O AutoML agent. The function {function_name} currently has errors.
        Please fix it. Return only the corrected function in ```python``` format.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """
        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="h2o_train_function",
            error_key="h2o_train_error",
            llm=llm,
            prompt_template=fix_prompt,
            agent_name=AGENT_NAME,
            file_path=state.get("h2o_train_function_path"),
            function_name=state.get("h2o_train_function_name"),
            log=log
        )

    # 5) Final reporting node
    def report_agent_outputs(state: GraphState):
        return node_func_report_agent_outputs(
            state=state,
            keys_to_include=[
                "recommended_steps",
                "h2o_train_function",
                "h2o_train_function_path",
                "h2o_train_function_name",
                "h2o_train_error",
                "model_path",
                "best_model_id",
            ],
            result_key="messages",
            role=AGENT_NAME,
            custom_title="H2O Machine Learning Agent Outputs"
        )

    node_functions = {
        "recommend_ml_steps": recommend_ml_steps,
        "human_review": human_review,
        "create_h2o_code": create_h2o_code,
        "execute_h2o_code": execute_h2o_code,
        "fix_h2o_code": fix_h2o_code,
        "report_agent_outputs": report_agent_outputs,
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_ml_steps",
        create_code_node_name="create_h2o_code",
        execute_code_node_name="execute_h2o_code",
        fix_code_node_name="fix_h2o_code",
        explain_code_node_name="report_agent_outputs",
        error_key="h2o_train_error",
        max_retries_key="max_retries",
        retry_count_key="retry_count",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        checkpointer=MemorySaver(),
        bypass_recommended_steps=bypass_recommended_steps,
        bypass_explain_code=bypass_explain_code,
    )

    return app
852
+