ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +0 -1
  3. ai_data_science_team/agents/data_cleaning_agent.py +50 -39
  4. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +45 -50
  6. ai_data_science_team/agents/data_wrangling_agent.py +50 -49
  7. ai_data_science_team/agents/feature_engineering_agent.py +48 -67
  8. ai_data_science_team/agents/sql_database_agent.py +130 -76
  9. ai_data_science_team/ml_agents/__init__.py +2 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
  11. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  12. ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
  13. ai_data_science_team/parsers/__init__.py +0 -0
  14. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  15. ai_data_science_team/templates/__init__.py +1 -0
  16. ai_data_science_team/templates/agent_templates.py +78 -7
  17. ai_data_science_team/tools/data_loader.py +378 -0
  18. ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/sql.py +126 -0
  22. ai_data_science_team/{tools → utils}/regex.py +59 -1
  23. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
  24. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  25. ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
  26. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  27. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,852 @@
1
+ # BUSINESS SCIENCE UNIVERSITY
2
+ # AI DATA SCIENCE TEAM
3
+ # ***
4
+ # * Agents: H2O Machine Learning Agent
5
+
6
+ import os
7
+ import json
8
+ from typing import TypedDict, Annotated, Sequence, Literal
9
+ import operator
10
+
11
+ import pandas as pd
12
+ from IPython.display import Markdown
13
+
14
+ from langchain.prompts import PromptTemplate
15
+ from langchain_core.messages import BaseMessage
16
+
17
+ from langgraph.types import Command
18
+ from langgraph.checkpoint.memory import MemorySaver
19
+
20
+ from ai_data_science_team.templates import(
21
+ node_func_execute_agent_code_on_data,
22
+ node_func_human_review,
23
+ node_func_fix_agent_code,
24
+ node_func_report_agent_outputs,
25
+ create_coding_agent_graph,
26
+ BaseAgent,
27
+ )
28
+ from ai_data_science_team.parsers.parsers import PythonOutputParser
29
+ from ai_data_science_team.utils.regex import (
30
+ relocate_imports_inside_function,
31
+ add_comments_to_top,
32
+ format_agent_name,
33
+ format_recommended_steps,
34
+ get_generic_summary,
35
+ )
36
+ from ai_data_science_team.tools.dataframe import get_dataframe_summary
37
+ from ai_data_science_team.utils.logging import log_ai_function
38
+ from ai_data_science_team.tools.h2o import H2O_AUTOML_DOCUMENTATION
39
+
40
# Identifier for this agent; used below for console banners, generated-code
# headers, and as the role on the final report message.
AGENT_NAME = "h2o_ml_agent"

# Default log directory: a "logs/" folder under the working directory that was
# current when this module was imported.
LOG_PATH = os.path.join(os.getcwd(), "logs/")
42
+
43
class H2OMLAgent(BaseAgent):
    """
    Machine Learning agent that has an LLM write and run H2O AutoML code.

    The agent stores its configuration in ``self._params``, builds a graph
    with ``make_h2o_ml_agent(**self._params)`` and keeps the state returned
    by the most recent run in ``self.response``.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used to generate the ML code.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code and errors. Defaults to False.
    log_path : str, optional
        Directory path for storing log files. Defaults to None.
    file_name : str, optional
        Name of the Python file for saving the generated code.
        Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the function that performs the AutoML training.
        Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the H2O model. The generated code is instructed to
        fall back to ``log_path`` when this is None and to skip saving when
        both are None. Defaults to None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Enables user review of the code. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommended steps prompt. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the final reporting step. Defaults to False.
    enable_mlflow : bool, optional
        Whether to enable MLflow logging. Defaults to False.
    mlflow_tracking_uri : str or None, optional
        If provided, sets the MLflow tracking URI at runtime.
    mlflow_experiment_name : str, optional
        Name of the MLflow experiment. Defaults to "H2O AutoML".
    mlflow_run_name : str or None, optional
        A custom name for the MLflow run. Defaults to None.

    Methods
    -------
    update_params(**kwargs)
        Updates the agent's parameters and rebuilds the compiled graph.
    ainvoke_agent(data_raw, user_instructions, target_variable, ...)
        Asynchronously runs the agent and stores the result in ``response``.
    invoke_agent(data_raw, user_instructions, target_variable, ...)
        Synchronously runs the agent and stores the result in ``response``.
    get_leaderboard()
        Returns the H2O AutoML leaderboard as a DataFrame.
    get_best_model_id()
        Returns the best model ID from the agent's response.
    get_model_path()
        Returns the saved model path (or None if not saved).
    get_data_raw()
        Returns the raw data as a DataFrame.
    get_h2o_train_function(markdown=False)
        Returns the H2O AutoML function code generated by the agent.
    get_recommended_ml_steps(markdown=False)
        Returns the recommended ML steps.
    get_workflow_summary(markdown=False)
        Returns a summary built from the last message of the run.
    get_log_summary(markdown=False)
        Returns where the generated function and model were written.

    ``get_response()`` and ``show()`` are not defined in this class; they are
    presumably inherited from ``BaseAgent``.

    Examples
    --------
    ```python
    from langchain_openai import ChatOpenAI
    import pandas as pd
    from ai_data_science_team.ml_agents import H2OMLAgent

    llm = ChatOpenAI(model="gpt-4o-mini")

    df = pd.read_csv("data/churn_data.csv")

    ml_agent = H2OMLAgent(
        model=llm,
        log=True,
        log_path="logs/",
        model_directory="h2o_models/",
    )

    ml_agent.invoke_agent(
        data_raw=df.drop(columns=["customerID"]),
        user_instructions="Please do classification on 'Churn'. Use a max runtime of 30 seconds.",
        target_variable="Churn"
    )

    # Retrieve and display the leaderboard of models
    ml_agent.get_leaderboard()

    # Get the H2O training function in markdown format
    ml_agent.get_h2o_train_function(markdown=True)

    # Get the recommended machine learning steps in markdown format
    ml_agent.get_recommended_ml_steps(markdown=True)

    # Get a summary of the workflow in markdown format
    ml_agent.get_workflow_summary(markdown=True)

    # Get a summary of the logs in markdown format
    ml_agent.get_log_summary(markdown=True)

    # Get the path to the saved model
    model_path = ml_agent.get_model_path()
    model_path
    ```
    """

    def __init__(
        self,
        model,
        n_samples=30,
        log=False,
        log_path=None,
        file_name="h2o_automl.py",
        function_name="h2o_automl",
        model_directory=None,
        overwrite=True,
        human_in_the_loop=False,
        bypass_recommended_steps=False,
        bypass_explain_code=False,
        enable_mlflow=False,
        mlflow_tracking_uri=None,
        mlflow_experiment_name="H2O AutoML",
        mlflow_run_name=None,
    ):
        # Keep every constructor argument so update_params() can rebuild the
        # graph later with a modified copy of the same configuration.
        self._params = {
            "model": model,
            "n_samples": n_samples,
            "log": log,
            "log_path": log_path,
            "file_name": file_name,
            "function_name": function_name,
            "model_directory": model_directory,
            "overwrite": overwrite,
            "human_in_the_loop": human_in_the_loop,
            "bypass_recommended_steps": bypass_recommended_steps,
            "bypass_explain_code": bypass_explain_code,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        }
        self._compiled_graph = self._make_compiled_graph()
        self.response = None

    def _make_compiled_graph(self):
        """
        Creates the compiled graph for the agent.

        Any previous response is discarded because it was produced by a graph
        built from the old parameters.
        """
        self.response = None
        return make_h2o_ml_agent(**self._params)

    def update_params(self, **kwargs):
        """
        Updates the agent's parameters and rebuilds the compiled graph.
        """
        self._params.update(kwargs)
        self._compiled_graph = self._make_compiled_graph()

    @staticmethod
    def _build_graph_input(data_raw, user_instructions, target_variable, max_retries, retry_count):
        """
        Builds the initial graph state shared by invoke_agent and ainvoke_agent.
        """
        return {
            "user_instructions": user_instructions,
            # The graph state carries the data as a plain dict, not a DataFrame.
            "data_raw": data_raw.to_dict(),
            "target_variable": target_variable,
            "max_retries": max_retries,
            "retry_count": retry_count,
        }

    async def ainvoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Asynchronously runs the agent on the provided dataset.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset to train on.
        user_instructions : str, optional
            Free-text modeling instructions passed to the LLM prompts.
        target_variable : str, optional
            Name of the target column.
        max_retries : int, optional
            Stored in the graph state under "max_retries". Defaults to 3.
        retry_count : int, optional
            Stored in the graph state under "retry_count". Defaults to 0.
        **kwargs
            Forwarded to the compiled graph's ``ainvoke``.

        Returns
        -------
        None
            The resulting state is stored in ``self.response``.
        """
        graph_input = self._build_graph_input(
            data_raw, user_instructions, target_variable, max_retries, retry_count
        )
        self.response = await self._compiled_graph.ainvoke(graph_input, **kwargs)
        return None

    def invoke_agent(
        self,
        data_raw: pd.DataFrame,
        user_instructions: str = None,
        target_variable: str = None,
        max_retries=3,
        retry_count=0,
        **kwargs
    ):
        """
        Synchronously runs the agent on the provided dataset.

        Parameters
        ----------
        data_raw : pd.DataFrame
            The dataset to train on.
        user_instructions : str, optional
            Free-text modeling instructions passed to the LLM prompts.
        target_variable : str, optional
            Name of the target column.
        max_retries : int, optional
            Stored in the graph state under "max_retries". Defaults to 3.
        retry_count : int, optional
            Stored in the graph state under "retry_count". Defaults to 0.
        **kwargs
            Forwarded to the compiled graph's ``invoke``.

        Returns
        -------
        None
            The resulting state is stored in ``self.response``.
        """
        graph_input = self._build_graph_input(
            data_raw, user_instructions, target_variable, max_retries, retry_count
        )
        self.response = self._compiled_graph.invoke(graph_input, **kwargs)
        return None

    def get_leaderboard(self):
        """Returns the H2O AutoML leaderboard as a DataFrame, or None if unavailable."""
        if self.response and "leaderboard" in self.response:
            return pd.DataFrame(self.response["leaderboard"])
        return None

    def get_best_model_id(self):
        """Returns the best model id from the AutoML run, or None if unavailable."""
        if self.response and "best_model_id" in self.response:
            return self.response["best_model_id"]
        return None

    def get_model_path(self):
        """Returns the file path to the saved best model, or None if not saved."""
        if self.response and "model_path" in self.response:
            return self.response["model_path"]
        return None

    def get_data_raw(self):
        """Returns the raw data from the response as a DataFrame, or None if unavailable."""
        if self.response and "data_raw" in self.response:
            return pd.DataFrame(self.response["data_raw"])
        return None

    def get_h2o_train_function(self, markdown=False):
        """
        Returns the H2O AutoML function code generated by the agent.

        With ``markdown=True`` the code is wrapped in a fenced python block
        and returned as an IPython ``Markdown`` object.
        """
        if self.response and "h2o_train_function" in self.response:
            code = self.response["h2o_train_function"]
            if markdown:
                return Markdown(f"```python\n{code}\n```")
            return code
        return None

    def get_recommended_ml_steps(self, markdown=False):
        """
        Returns the recommended ML steps from the agent's response.

        With ``markdown=True`` the steps are returned as an IPython
        ``Markdown`` object.
        """
        if self.response and "recommended_steps" in self.response:
            steps = self.response["recommended_steps"]
            if markdown:
                return Markdown(steps)
            return steps
        return None

    def get_workflow_summary(self, markdown=False):
        """
        Returns a summary of the agent's workflow.

        The summary is built by ``get_generic_summary`` from the JSON content
        of the last entry in ``response["messages"]``. Returns None when no
        run has completed or the response holds no messages.
        """
        if self.response and self.response.get("messages"):
            last_message = self.response.get("messages")[-1]
            summary = get_generic_summary(json.loads(last_message.content))
            if markdown:
                return Markdown(summary)
            return summary
        return None

    def get_log_summary(self, markdown=False):
        """
        Returns a summary of where the generated function and model were saved.

        Returns None unless the response contains a non-empty
        "h2o_train_function_path".
        """
        if not self.response:
            return None
        function_path = self.response.get('h2o_train_function_path')
        if not function_path:
            return None

        log_details = (
            "\n## H2O Machine Learning Agent Log Summary:\n\n"
            f"Function Path: {function_path}\n\n"
            f"Function Name: {self.response.get('h2o_train_function_name')}\n\n"
            f"Best Model ID: {self.get_best_model_id()}\n\n"
            f"Model Path: {self.get_model_path()}\n"
        )
        if markdown:
            return Markdown(log_details)
        return log_details
335
+
336
+
337
def make_h2o_ml_agent(
    model,
    n_samples=30,
    log=False,
    log_path=None,
    file_name="h2o_automl.py",
    function_name="h2o_automl",
    model_directory=None,
    overwrite=True,
    human_in_the_loop=False,
    bypass_recommended_steps=False,
    bypass_explain_code=False,
    enable_mlflow=False,
    mlflow_tracking_uri=None,
    mlflow_experiment_name="H2O AutoML",
    mlflow_run_name=None,
):
    """
    Creates a machine learning agent that uses H2O for AutoML.

    The agent will:
      1. Optionally recommend ML steps,
      2. Create Python code that sets up H2OAutoML,
      3. Execute that code (optionally saving the best model to disk),
      4. Fix errors if needed,
      5. Optionally report the agent outputs.

    Parameters
    ----------
    model : langchain.llms.base.LLM
        The language model used for every prompt in the graph.
    n_samples : int, optional
        Number of samples used when summarizing the dataset. Defaults to 30.
    log : bool, optional
        Whether to log the generated code. When True and ``log_path`` is None,
        the module-level ``LOG_PATH`` is used and created if missing.
    log_path : str, optional
        Directory for the generated code file. Defaults to None.
    file_name : str, optional
        File name for the generated code. Defaults to "h2o_automl.py".
    function_name : str, optional
        Name of the generated training function. Defaults to "h2o_automl".
    model_directory : str or None, optional
        Directory to save the model. The generated code is instructed to fall
        back to ``log_path`` when None, and to skip saving when both are None.
    overwrite : bool, optional
        Whether to overwrite the log file if it exists. Defaults to True.
    human_in_the_loop : bool, optional
        Whether to add the human review node. Defaults to False.
    bypass_recommended_steps : bool, optional
        If True, skips the recommendation node. Defaults to False.
    bypass_explain_code : bool, optional
        If True, skips the final reporting node. Defaults to False.
    enable_mlflow : bool, optional
        Passed to the code prompt so the generated code logs to MLflow.
    mlflow_tracking_uri : str or None, optional
        Passed to the code prompt as the MLflow tracking URI.
    mlflow_experiment_name : str, optional
        Passed to the code prompt as the MLflow experiment name.
    mlflow_run_name : str or None, optional
        Passed to the code prompt as the MLflow run name.

    Returns
    -------
    The object returned by ``create_coding_agent_graph`` (the compiled graph
    that ``H2OMLAgent`` invokes).

    Raises
    ------
    ImportError
        If the ``h2o`` package cannot be imported.
    """

    llm = model

    # Handle logging directory
    if log:
        if log_path is None:
            log_path = LOG_PATH
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(log_path, exist_ok=True)

    # Check if H2O is installed. The imported names are not used here; the
    # import only verifies that generated code will be able to import them.
    try:
        import h2o  # noqa: F401
        from h2o.automl import H2OAutoML  # noqa: F401
    except ImportError as e:
        raise ImportError(
            "The 'h2o' library is not installed. Please install it using pip:\n\n"
            " pip install h2o\n\n"
            "Visit https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html for details."
        ) from e

    # Define GraphState
    class GraphState(TypedDict):
        messages: Annotated[Sequence[BaseMessage], operator.add]
        user_instructions: str
        recommended_steps: str
        data_raw: dict
        leaderboard: dict
        best_model_id: str
        model_path: str
        model_results: dict
        target_variable: str
        all_datasets_summary: str
        h2o_train_function: str
        h2o_train_function_path: str
        h2o_train_file_name: str
        h2o_train_function_name: str
        h2o_train_error: str
        max_retries: int
        retry_count: int

    def _summarize_data(state):
        """Builds the text summary of ``data_raw`` that is fed to the prompts."""
        df = pd.DataFrame.from_dict(state.get("data_raw"))
        all_datasets_summary = get_dataframe_summary([df], n_sample=n_samples)
        return "\n\n".join(all_datasets_summary)

    # 1) Recommend ML steps (optional)
    def recommend_ml_steps(state: GraphState):
        """Asks the LLM for a numbered list of H2O AutoML recommendations."""
        print(format_agent_name(AGENT_NAME))
        print(" * RECOMMEND MACHINE LEARNING STEPS")

        recommend_steps_prompt = PromptTemplate(
            template="""
            You are an AutoML Expert using H2O.

            We have the following dataset summary, user instructions, and H2O AutoML documentation:

            User instructions:
            {user_instructions}

            Data Summary:
            {all_datasets_summary}

            H2O AutoML Documentation:
            {h2o_automl_documentation}

            Please recommend a short list of steps or considerations for performing H2OAutoML on this data. Specifically focus on maximizing model accuracy while remaining flexible to user instructions and the dataset.

            - Recommend any parameters and values that might improve performance (predictive accuracy).
            - Recommend the Loss Function, Stopping Criteria, and other advanced parameters.
            - Use the H2O AutoML documentation to your advantage.
            - Exclude deep learning algorithms since these are typically low performance.

            Avoid these:

            - Do not perform data cleaning or feature engineering here. We will handle that separately.
            - Do not limit memory size or CPU usage unless the user specifies it.

            Return as a numbered list. You can return short code snippets to demonstrate actions. But do not return a fully coded solution. The H2O AutoML code will be generated separately by a Coding Agent.
            """,
            input_variables=["user_instructions", "all_datasets_summary", "h2o_automl_documentation"]
        )

        all_datasets_summary_str = _summarize_data(state)

        steps_agent = recommend_steps_prompt | llm
        recommended_steps = steps_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "all_datasets_summary": all_datasets_summary_str,
            "h2o_automl_documentation": H2O_AUTOML_DOCUMENTATION
        })

        return {
            "recommended_steps": format_recommended_steps(
                recommended_steps.content.strip(),
                heading="# Recommended ML Steps:"
            ),
            "all_datasets_summary": all_datasets_summary_str
        }

    # 2) Create code
    def create_h2o_code(state: GraphState):
        """Asks the LLM to write the AutoML training function and logs it."""
        if bypass_recommended_steps:
            # The recommendation node did not run, so the banner has not been
            # printed and no dataset summary is in the state yet.
            print(format_agent_name(AGENT_NAME))
            all_datasets_summary_str = _summarize_data(state)
        else:
            all_datasets_summary_str = state.get("all_datasets_summary")

        print(" * CREATE H2O AUTOML CODE")

        code_prompt = PromptTemplate(
            template="""
            You are an H2O AutoML agent. Create a Python function named {function_name}(data_raw)
            that runs H2OAutoML on the provided data with a focus on maximizing model accuracy and
            incorporating user instructions for flexibility.

            Do not perform substantial data cleaning or feature engineering here. We will handle that separately.

            We have two variables for deciding where to save the model:
            model_directory = {model_directory}
            log_path = {log_path}

            IMPORTANT: MLflow Parameters if the user wants to enable MLflow with H2O AutoML:
            enable_mlflow: {enable_mlflow}
            mlflow_tracking_uri: {mlflow_tracking_uri}
            mlflow_experiment_name: {mlflow_experiment_name}
            mlflow_run_name: {mlflow_run_name}


            Additional Requirements:
            - Convert `data_raw` (pandas DataFrame) into an H2OFrame.
            - Identify the target variable from {target_variable} (if provided).
            - Start H2O if not already started.
            - Use Recommended Steps to guide any advanced parameters (e.g., cross-validation folds,
            balancing classes, extended training time, stacking) that might improve performance.
            - If the user does not specify anything special, use H2OAutoML defaults (including stacked ensembles).
            - Focus on maximizing accuracy (or the most relevant metric if it's not classification)
            while remaining flexible to user instructions.
            - Return a dict with keys: leaderboard, best_model_id, model_path, and model_results.
            - If enable_mlflow is True, log the top metrics and save the model as an artifact. (See example function)

            Initial User Instructions (Disregard any instructions that are unrelated to modeling):
            {user_instructions}

            Recommended Steps:
            {recommended_steps}

            Data summary for reference:
            {all_datasets_summary}

            Return only code in ```python``` with a single function definition. Use this as an example starting template:
            ```python
            def {function_name}(
                data_raw: List[Dict[str, Any]],
                target: str,
                max_runtime_secs: int,
                exclude_algos: List[str],
                balance_classes: bool,
                nfolds: int,
                seed: int,
                max_models: int,
                stopping_metric: str,
                stopping_tolerance: float,
                stopping_rounds: int,
                sort_metric: str,
                enable_mlflow: bool,
                mlflow_tracking_uri: Optional[str],
                mlflow_experiment_name: str,
                mlflow_run_name: str,
                model_directory: Optional[str] = None,
                log_path: Optional[str] = None,
                **kwargs # Additional parameters for H2OAutoML (feel free to add these based on user instructions and recommended steps)
            ):

                import h2o
                from h2o.automl import H2OAutoML
                import pandas as pd
                import json

                # Optional MLflow usage
                if enable_mlflow:
                    import mlflow
                    if mlflow_tracking_uri:
                        mlflow.set_tracking_uri(mlflow_tracking_uri)
                    mlflow.set_experiment(mlflow_experiment_name)
                    run_context = mlflow.start_run(run_name=mlflow_run_name)
                else:
                    # Dummy context manager to skip MLflow if not enabled
                    from contextlib import nullcontext
                    run_context = nullcontext()

                exclude_algos = exclude_algos or ["DeepLearning"] # default if not provided

                # Convert data to DataFrame
                df = pd.DataFrame(data_raw)

                with run_context as run:
                    # If using MLflow, track run ID
                    run_id = None
                    if enable_mlflow and run is not None:
                        run_id = run.info.run_id
                        import mlflow


                    # Initialize H2O
                    h2o.init()

                    # Create H2OFrame
                    data_h2o = h2o.H2OFrame(df)

                    # Setup AutoML
                    aml = H2OAutoML(
                        max_runtime_secs=max_runtime_secs,
                        exclude_algos=exclude_algos,
                        balance_classes=balance_classes,
                        nfolds=nfolds,
                        seed=seed,
                        max_models=max_models,
                        stopping_metric=stopping_metric,
                        stopping_tolerance=stopping_tolerance,
                        stopping_rounds=stopping_rounds,
                        sort_metric=sort_metric,
                        **kwargs
                    )

                    # Train
                    x = [col for col in data_h2o.columns if col != target]
                    aml.train(x=x, y=target, training_frame=data_h2o)

                    # Save model if we have a directory/log path
                    if model_directory is None and log_path is None:
                        model_path = None
                    else:
                        path_to_save = model_directory if model_directory else log_path
                        model_path = h2o.save_model(model=aml.leader, path=path_to_save, force=True)

                    # Leaderboard (DataFrame -> dict)
                    leaderboard_df = pd.DataFrame(aml.leaderboard)
                    leaderboard_dict = leaderboard_df.to_dict()

                    # Gather top-model metrics from the first row
                    top_metrics = leaderboard_df.iloc[0].to_dict()

                    # Construct model_results
                    model_results = dict(
                        model_flavor= "H2O AutoML",
                        model_path= model_path,
                        best_model_id= aml.leader.model_id,
                        metrics= top_metrics # all metrics from the top row
                    )

                    # IMPORTANT: Log these to MLflow if enabled
                    if enable_mlflow and run is not None:

                        # Log the top metrics if numeric
                        numeric_metrics = {{k: v for k, v in top_metrics.items() if isinstance(v, (int, float))}}
                        mlflow.log_metrics(numeric_metrics)

                        # Log artifact if we saved the model
                        mlflow.h2o.log_model(aml.leader, artifact_path="model")

                        # Log the leaderboard
                        mlflow.log_table(leaderboard_dict, "leaderboard.json")

                        # Log these parameters (if specified)
                        mlflow.log_params(dict(
                            target= target,
                            max_runtime_secs= max_runtime_secs,
                            exclude_algos= str(exclude_algos),
                            balance_classes= balance_classes,
                            nfolds= nfolds,
                            seed= seed,
                            max_models= max_models,
                            stopping_metric= stopping_metric,
                            stopping_tolerance= stopping_tolerance,
                            stopping_rounds= stopping_rounds,
                            sort_metric= sort_metric,
                            model_directory= model_directory,
                            log_path= log_path
                        ))

                    # Build the output
                    output = dict(
                        leaderboard= leaderboard_dict,
                        best_model_id= aml.leader.model_id,
                        model_path= model_path,
                        model_results= model_results,
                        mlflow_run_id= run_id
                    )

                return output
            ```

            Avoid these errors:

            - WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.

            - 'list' object has no attribute 'tolist'

            - with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True): pandas_df = h2o_df.as_data_frame() # Convert to pandas DataFrame using pd.DataFrame(h2o_df)

            - dtype is only supported for one column frames

            - h2o.is_running() module 'h2o' has no attribute 'is_running'. Solution: just do h2o.init() and it will check if H2O is running.


            """,
            input_variables=[
                "user_instructions",
                "function_name",
                "target_variable",
                "recommended_steps",
                "all_datasets_summary",
                "model_directory",
                "log_path",
                "enable_mlflow",
                "mlflow_tracking_uri",
                "mlflow_experiment_name",
                "mlflow_run_name",
            ]
        )

        recommended_steps = state.get("recommended_steps", "")
        h2o_code_agent = code_prompt | llm | PythonOutputParser()

        resp = h2o_code_agent.invoke({
            "user_instructions": state.get("user_instructions"),
            "function_name": function_name,
            "target_variable": state.get("target_variable"),
            "recommended_steps": recommended_steps,
            "all_datasets_summary": all_datasets_summary_str,
            "model_directory": model_directory,
            "log_path": log_path,
            "enable_mlflow": enable_mlflow,
            "mlflow_tracking_uri": mlflow_tracking_uri,
            "mlflow_experiment_name": mlflow_experiment_name,
            "mlflow_run_name": mlflow_run_name,
        })

        resp = relocate_imports_inside_function(resp)
        resp = add_comments_to_top(resp, agent_name=AGENT_NAME)

        # Log the code snippet if requested
        file_path, f_name = log_ai_function(
            response=resp,
            file_name=file_name,
            log=log,
            log_path=log_path,
            overwrite=overwrite
        )

        return {
            "h2o_train_function": resp,
            "h2o_train_function_path": file_path,
            "h2o_train_file_name": f_name,
            "h2o_train_function_name": function_name,
        }

    # Human Review
    prompt_text_human_review = "Are the following Machine Learning instructions correct? (Answer 'yes' or provide modifications)\n{steps}"

    # Two definitions are kept because the Literal in the return annotation
    # differs with the "yes" destination. That destination must be a node
    # that exists: the reporting node is registered below as
    # "report_agent_outputs" (there is no "explain_h2o_code" node).
    if not bypass_explain_code:
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "report_agent_outputs"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto="report_agent_outputs",
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )
    else:
        def human_review(state: GraphState) -> Command[Literal["recommend_ml_steps", "__end__"]]:
            return node_func_human_review(
                state=state,
                prompt_text=prompt_text_human_review,
                yes_goto="__end__",
                no_goto="recommend_ml_steps",
                user_instructions_key="user_instructions",
                recommended_steps_key="recommended_steps",
                code_snippet_key="h2o_train_function",
            )

    # 3) Execute code
    def execute_h2o_code(state: GraphState):
        """Runs the generated function and lifts its outputs into the state."""
        result = node_func_execute_agent_code_on_data(
            state=state,
            data_key="data_raw",
            code_snippet_key="h2o_train_function",
            result_key="h2o_train_result",
            error_key="h2o_train_error",
            agent_function_name=state.get("h2o_train_function_name"),
            pre_processing=lambda data: pd.DataFrame.from_dict(data),
            post_processing=lambda x: x,
            error_message_prefix="Error occurred during H2O AutoML: "
        )

        # If no error, extract leaderboard, best_model_id, and model_path
        train_result = result["h2o_train_result"]
        if not result["h2o_train_error"] and train_result and isinstance(train_result, dict):
            result["leaderboard"] = train_result.get("leaderboard", {})
            result["best_model_id"] = train_result.get("best_model_id", None)
            result["model_path"] = train_result.get("model_path", None)
            result["model_results"] = train_result.get("model_results", {})

        return result

    # 4) Fix code if there's an error
    def fix_h2o_code(state: GraphState):
        """Asks the LLM to repair the generated function using the last error."""
        fix_prompt = """
        You are an H2O AutoML agent. The function {function_name} currently has errors.
        Please fix it. Return only the corrected function in ```python``` format.

        Broken code:
        {code_snippet}

        Last Known Error:
        {error}
        """
        return node_func_fix_agent_code(
            state=state,
            code_snippet_key="h2o_train_function",
            error_key="h2o_train_error",
            llm=llm,
            prompt_template=fix_prompt,
            agent_name=AGENT_NAME,
            file_path=state.get("h2o_train_function_path"),
            function_name=state.get("h2o_train_function_name"),
            log=log
        )

    # 5) Final reporting node
    def report_agent_outputs(state: GraphState):
        """Collects the selected state keys into the final report message."""
        return node_func_report_agent_outputs(
            state=state,
            keys_to_include=[
                "recommended_steps",
                "h2o_train_function",
                "h2o_train_function_path",
                "h2o_train_function_name",
                "h2o_train_error",
                "model_path",
                "best_model_id",
            ],
            result_key="messages",
            role=AGENT_NAME,
            custom_title="H2O Machine Learning Agent Outputs"
        )

    node_functions = {
        "recommend_ml_steps": recommend_ml_steps,
        "human_review": human_review,
        "create_h2o_code": create_h2o_code,
        "execute_h2o_code": execute_h2o_code,
        "fix_h2o_code": fix_h2o_code,
        "report_agent_outputs": report_agent_outputs,
    }

    app = create_coding_agent_graph(
        GraphState=GraphState,
        node_functions=node_functions,
        recommended_steps_node_name="recommend_ml_steps",
        create_code_node_name="create_h2o_code",
        execute_code_node_name="execute_h2o_code",
        fix_code_node_name="fix_h2o_code",
        explain_code_node_name="report_agent_outputs",
        error_key="h2o_train_error",
        max_retries_key="max_retries",
        retry_count_key="retry_count",
        human_in_the_loop=human_in_the_loop,
        human_review_node_name="human_review",
        checkpointer=MemorySaver(),
        bypass_recommended_steps=bypass_recommended_steps,
        bypass_explain_code=bypass_explain_code,
    )

    return app
852
+