vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,742 +0,0 @@
1
- import copy
2
- import logging
3
- import os
4
- import sys
5
- from pathlib import Path
6
- from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
7
-
8
- from tabulate import tabulate
9
-
10
- import vision_agent.tools as T
11
- from vision_agent.agent.agent import Agent
12
- from vision_agent.agent.vision_agent_coder_prompts import (
13
- CODE,
14
- FIX_BUG,
15
- FULL_TASK,
16
- SIMPLE_TEST,
17
- )
18
- from vision_agent.agent.vision_agent_planner import (
19
- AnthropicVisionAgentPlanner,
20
- AzureVisionAgentPlanner,
21
- OllamaVisionAgentPlanner,
22
- OpenAIVisionAgentPlanner,
23
- PlanContext,
24
- )
25
- from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
26
- from vision_agent.models import Message
27
- from vision_agent.tools.meta_tools import get_diff
28
- from vision_agent.utils import CodeInterpreterFactory, Execution
29
- from vision_agent.utils.agent import (
30
- _MAX_TABULATE_COL_WIDTH,
31
- DefaultImports,
32
- extract_code,
33
- extract_tag,
34
- format_feedback,
35
- print_code,
36
- remove_installs_from_code,
37
- strip_function_calls,
38
- )
39
- from vision_agent.utils.execute import CodeInterpreter
40
-
41
# Configure the root logger's default handler to write to stdout.
logging.basicConfig(stream=sys.stdout)
# Module-level logger used by every function and class in this file.
_LOGGER = logging.getLogger(__name__)
# Workspace directory read from the WORKSPACE environment variable; when the
# variable is unset this is Path("").
WORKSPACE = Path(os.environ.get("WORKSPACE", ""))
44
-
45
-
46
def write_code(
    coder: LMM,
    chat: List[Message],
    plan: str,
    tool_info: str,
    plan_thoughts: str,
    tool_output: str,
    feedback: str,
) -> str:
    """Ask the coder model to write code for the last user request in ``chat``.

    The caller's ``chat`` is not modified: a deep copy is made and the content
    of its last message is replaced by the ``CODE`` prompt before the copy is
    sent to ``coder``.

    Parameters:
        coder (LMM): The model called with the rewritten conversation.
        chat (List[Message]): The conversation; its last message must have the
            role "user".
        plan (str): Filled into the ``subtasks`` field of ``FULL_TASK``.
        tool_info (str): Filled into the ``docstring`` field of ``CODE``.
        plan_thoughts (str): Filled into the ``plan_thoughts`` field of ``CODE``.
        tool_output (str): Filled into the ``tool_output`` field of ``CODE``.
        feedback (str): Filled into the ``feedback`` field of ``CODE``.

    Returns:
        str: Whatever ``extract_code`` returns for the coder's reply.

    Raises:
        ValueError: If the last chat message is not from the user.
    """
    conversation = copy.deepcopy(chat)
    last_message = conversation[-1]
    if last_message["role"] != "user":
        raise ValueError("Last chat message must be from the user.")

    question = FULL_TASK.format(
        user_request=last_message["content"], subtasks=plan
    )
    last_message["content"] = CODE.format(
        docstring=tool_info,
        question=question,
        tool_output=tool_output,
        plan_thoughts=plan_thoughts,
        feedback=feedback,
    )
    reply = coder(conversation, stream=False)
    return extract_code(reply)  # type: ignore
69
-
70
-
71
def write_test(
    tester: LMM,
    chat: List[Message],
    tool_utils: str,
    code: str,
    feedback: str,
    media: Optional[Sequence[Union[str, Path]]] = None,
) -> str:
    """Ask the tester model to write a test for ``code``.

    The caller's ``chat`` is not modified: a deep copy is made and the content
    of its last message is replaced by the ``SIMPLE_TEST`` prompt before the
    copy is sent to ``tester``.

    Parameters:
        tester (LMM): The model called with the rewritten conversation.
        chat (List[Message]): The conversation; its last message must have the
            role "user" and its content is used as the ``question`` field.
        tool_utils (str): Filled into the ``docstring`` field of ``SIMPLE_TEST``.
        code (str): The code under test, filled into the ``code`` field.
        feedback (str): Filled into the ``feedback`` field.
        media (Optional[Sequence[Union[str, Path]]]): Filled into the ``media``
            field as-is (``None`` when omitted).

    Returns:
        str: Whatever ``extract_code`` returns for the tester's reply.

    Raises:
        ValueError: If the last chat message is not from the user.
    """
    conversation = copy.deepcopy(chat)
    last_message = conversation[-1]
    if last_message["role"] != "user":
        raise ValueError("Last chat message must be from the user.")

    last_message["content"] = SIMPLE_TEST.format(
        docstring=tool_utils,
        question=last_message["content"],
        code=code,
        feedback=feedback,
        media=media,
    )
    reply = tester(conversation, stream=False)
    return extract_code(reply)  # type: ignore
93
-
94
-
95
def write_and_test_code(
    chat: List[Message],
    plan: str,
    tool_info: str,
    tool_output: str,
    plan_thoughts: str,
    tool_utils: str,
    working_memory: List[Dict[str, str]],
    coder: LMM,
    tester: LMM,
    debugger: LMM,
    code_interpreter: CodeInterpreter,
    log_progress: Callable[[Dict[str, Any]], None],
    verbosity: int = 0,
    max_retries: int = 3,
    media: Optional[Sequence[Union[str, Path]]] = None,
) -> Dict[str, Any]:
    """Generate code and a test, run them, and debug until they pass.

    The code is written with ``write_code``, stripped with
    ``strip_function_calls``, and a test is written with ``write_test``. Code
    and test are executed together (prefixed by the default imports). While
    the execution fails, ``debug_code`` is called, at most ``max_retries``
    times.

    Parameters:
        chat (List[Message]): The conversation; the last message must be from
            the user.
        plan (str): The plan text handed to the coder.
        tool_info (str): Tool documentation handed to the coder and debugger.
        tool_output (str): Tool output handed to the coder.
        plan_thoughts (str): Planner thoughts handed to the coder.
        tool_utils (str): Utility documentation handed to the tester.
        working_memory (List[Dict[str, str]]): Earlier feedback entries; read
            here, not appended to.
        coder (LMM): Model that writes the code.
        tester (LMM): Model that writes the test.
        debugger (LMM): Model that proposes fixes.
        code_interpreter (CodeInterpreter): Interpreter that runs code and test.
        log_progress (Callable[[Dict[str, Any]], None]): Receives progress
            dictionaries.
        verbosity (int): 2 prints and logs the initial code/test and each
            debugging attempt; >= 1 prints the final code and test.
        max_retries (int): Maximum number of ``debug_code`` rounds.
        media (Optional[Sequence[Union[str, Path]]]): Media passed to the tester
            prompt.

    Returns:
        Dict[str, Any]: ``code``, ``test``, ``success`` (bool), ``test_result``
        (the last execution result) and ``working_memory`` (only the entries
        added by debugging during this call).
    """
    log_progress(
        {
            "type": "log",
            "log_content": "Generating code",
            "status": "started",
        }
    )
    # Pass by keyword: write_code declares plan_thoughts before tool_output,
    # while this function declares them in the opposite order, so positional
    # passing silently swaps the two prompt fields.
    code = write_code(
        coder=coder,
        chat=chat,
        plan=plan,
        tool_info=tool_info,
        plan_thoughts=plan_thoughts,
        tool_output=tool_output,
        feedback=format_feedback(working_memory),
    )
    code = strip_function_calls(code)
    test = write_test(
        tester, chat, tool_utils, code, format_feedback(working_memory), media
    )

    log_progress(
        {
            "type": "log",
            "log_content": "Running code",
            "status": "running",
            "code": DefaultImports.prepend_imports(code),
            "payload": {
                "test": test,
            },
        }
    )
    result = code_interpreter.exec_isolation(
        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
    )
    log_progress(
        {
            "type": "log",
            "log_content": (
                "Code execution succeeded"
                if result.success
                else "Code execution failed"
            ),
            "status": "completed" if result.success else "failed",
            "code": DefaultImports.prepend_imports(code),
            "payload": {
                "test": test,
            },
        }
    )
    if verbosity == 2:
        print_code("Initial code and tests:", code, test)
        _LOGGER.info(
            f"Initial code execution result:\n{result.text(include_logs=True)}"
        )

    count = 0
    # Entries produced by debugging in this call; kept separate from the
    # caller's working_memory and returned so the caller decides how to merge.
    new_working_memory: List[Dict[str, str]] = []
    while not result.success and count < max_retries:
        if verbosity == 2:
            _LOGGER.info(f"Start debugging attempt {count + 1}")
        code, test, result = debug_code(
            working_memory,
            debugger,
            code_interpreter,
            tool_info,
            code,
            test,
            result,
            new_working_memory,
            log_progress,
            verbosity,
        )
        count += 1

    if verbosity >= 1:
        print_code("Final code and tests:", code, test)

    return {
        "code": code,
        "test": test,
        "success": result.success,
        "test_result": result,
        "working_memory": new_working_memory,
    }
197
-
198
-
199
def debug_code(
    working_memory: List[Dict[str, str]],
    debugger: LMM,
    code_interpreter: CodeInterpreter,
    tool_info: str,
    code: str,
    test: str,
    result: Execution,
    new_working_memory: List[Dict[str, str]],
    log_progress: Callable[[Dict[str, Any]], None],
    verbosity: int = 0,
) -> tuple[str, str, Execution]:
    """Run one debugging round: ask for a fix, apply it, and re-run.

    The debugger is prompted with ``FIX_BUG`` up to three times, until a reply
    yields a ``code`` or a ``test`` section. Non-empty sections replace the
    current code/test. An entry describing the round is appended to
    ``new_working_memory`` and the code and test are executed again. This
    happens even when no usable reply was obtained, in which case the
    unchanged code and test are re-run.

    Parameters:
        working_memory (List[Dict[str, str]]): Earlier feedback entries; read only.
        debugger (LMM): Model asked to fix the code.
        code_interpreter (CodeInterpreter): Interpreter that re-runs code and test.
        tool_info (str): Tool documentation placed in the prompt.
        code (str): The current code.
        test (str): The current test.
        result (Execution): The failing execution result shown to the debugger.
        new_working_memory (List[Dict[str, str]]): Mutated in place; one entry
            with keys "code", "feedback" and "edits" is appended.
        log_progress (Callable[[Dict[str, Any]], None]): Receives progress
            dictionaries.
        verbosity (int): 2 prints the attempted fix and logs its result.

    Returns:
        tuple[str, str, Execution]: The (possibly updated) code, the (possibly
        updated) test and the result of executing them.
    """
    log_progress(
        {
            "type": "log",
            "log_content": ("Debugging code"),
            "status": "started",
        }
    )

    fixed_code = None
    fixed_test = None
    thoughts = ""
    success = False
    count = 0
    while not success and count < 3:
        try:
            # The reply is free-form text; the parts we need are pulled out of
            # it with extract_tag rather than parsed as JSON.
            fixed_code_and_test_str = debugger(
                FIX_BUG.format(
                    docstring=tool_info,
                    code=code,
                    tests=test,
                    # Because of the way we trace function calls the trace information
                    # ends up in the results. We don't want to show this info to the
                    # LLM so we don't include it in the tool_output_str.
                    # Only the last 50 lines of the output are sent.
                    result="\n".join(
                        result.text(include_results=False).splitlines()[-50:]
                    ),
                    feedback=format_feedback(working_memory + new_working_memory),
                ),
                stream=False,
            )
            fixed_code_and_test_str = cast(str, fixed_code_and_test_str)
            thoughts_tag = extract_tag(fixed_code_and_test_str, "thoughts")
            thoughts = thoughts_tag if thoughts_tag is not None else ""
            fixed_code = extract_tag(fixed_code_and_test_str, "code")
            fixed_test = extract_tag(fixed_code_and_test_str, "test")

            # A reply counts as usable when it carries at least one of the two.
            success = fixed_code is not None or fixed_test is not None

        except Exception as e:
            # Best effort: a failed call or unparsable reply costs one attempt.
            _LOGGER.exception(f"Error while parsing debugger response: {e}")

        count += 1

    old_code = code
    old_test = test

    # Missing or blank sections leave the corresponding text unchanged.
    if fixed_code is not None and fixed_code.strip() != "":
        code = fixed_code
    if fixed_test is not None and fixed_test.strip() != "":
        test = fixed_test

    new_working_memory.append(
        {
            "code": f"{code}\n{test}",
            "feedback": thoughts,
            "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
        }
    )
    log_progress(
        {
            "type": "log",
            "log_content": ("Running code"),
            "status": "running",
            "code": DefaultImports.prepend_imports(code),
            "payload": {
                "test": test,
            },
        }
    )

    result = code_interpreter.exec_isolation(
        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
    )
    log_progress(
        {
            "type": "log",
            # Same wording as the initial run in write_and_test_code.
            "log_content": (
                "Code execution succeeded"
                if result.success
                else "Code execution failed"
            ),
            "status": "completed" if result.success else "failed",
            "code": DefaultImports.prepend_imports(code),
            "payload": {
                "test": test,
            },
        }
    )
    if verbosity == 2:
        print_code("Code and test after attempted fix:", code, test)
        _LOGGER.info(
            f"Reflection: {thoughts}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
        )

    return code, test, result
310
-
311
-
312
class VisionAgentCoder(Agent):
    """Vision Agent Coder is an agentic framework that can output code based on a user
    request. It can plan tasks, retrieve relevant tools, write code, write tests and
    reflect on failed test cases to debug code. It is inspired by AgentCoder
    https://arxiv.org/abs/2312.13010 and Data Interpreter https://arxiv.org/abs/2402.18679

    Example
    -------
        >>> import vision_agent as va
        >>> agent = va.agent.VisionAgentCoder()
        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
    """

    def __init__(
        self,
        planner: Optional[Agent] = None,
        coder: Optional[LMM] = None,
        tester: Optional[LMM] = None,
        debugger: Optional[LMM] = None,
        verbosity: int = 0,
        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
    ) -> None:
        """Initialize the Vision Agent Coder.

        Parameters:
            planner (Optional[Agent]): The planner model to use. Defaults to
                AnthropicVisionAgentPlanner.
            coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM.
            tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM.
            debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM.
            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                highest verbosity level which will output all intermediate debugging
                code.
            report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback
                to report the progress of the agent. This is useful for streaming logs
                in a web application where multiple VisionAgentCoder instances are
                running in parallel. This callback ensures that the progress are not
                mixed up.
            code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
                it can be one of: None or "local". If None, it will read from
                the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
                object is provided it will use that.
        """

        self.planner = (
            AnthropicVisionAgentPlanner(verbosity=verbosity)
            if planner is None
            else planner
        )
        self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder
        self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester
        self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger
        self.verbosity = verbosity
        if self.verbosity > 0:
            # Any non-zero verbosity enables INFO records on the module logger.
            _LOGGER.setLevel(logging.INFO)

        self.report_progress_callback = report_progress_callback
        self.code_interpreter = code_interpreter

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Generate code based on a user request.

        Parameters:
            input (Union[str, List[Message]]): A conversation in the format of
                [{"role": "user", "content": "describe your task here..."}] or a string
                of just the contents.
            media (Optional[Union[str, Path]]): The media file to be used in the task.
                Only attached when ``input`` is a string; when ``input`` is
                already a list of messages this argument is not used.

        Returns:
            str: The code output by the VisionAgentCoder.
        """

        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
            if media is not None:
                input[0]["media"] = [media]
        code_and_context = self.generate_code(input)
        return code_and_context["code"]  # type: ignore

    def _resolve_code_interpreter(self) -> CodeInterpreter:
        """Return the configured interpreter instance, or create a new one.

        A ``CodeInterpreter`` object stored on the agent is returned as-is.
        Otherwise (``None`` or a string) a new instance is requested from
        ``CodeInterpreterFactory`` with that value as the runtime.
        """
        # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
        if self.code_interpreter is not None and not isinstance(
            self.code_interpreter, str
        ):
            return self.code_interpreter
        return CodeInterpreterFactory.new_instance(
            code_sandbox_runtime=self.code_interpreter,
        )

    def _write_code_for_plan(
        self,
        chat: List[Message],
        plan_context: PlanContext,
        code_interpreter: CodeInterpreter,
    ) -> Dict[str, Any]:
        """Write and test code for the best plan using ``code_interpreter``.

        Does not enter or close the interpreter; the caller owns its lifecycle.
        """
        chat = copy.deepcopy(chat)
        media_list = []
        for chat_i in chat:
            if "media" in chat_i:
                for media in chat_i["media"]:
                    # Mention each media item in the text and collect it for
                    # the tester prompt.
                    chat_i["content"] += f" Media name {media}"  # type: ignore
                    media_list.append(str(media))

        working_memory: List[Dict[str, str]] = []
        plan = plan_context.plans[plan_context.best_plan]
        tool_doc = plan_context.tool_doc
        tool_output_str = plan_context.tool_output
        plan_thoughts_str = str(plan_context.plan_thoughts)

        if self.verbosity >= 1:
            plan_fixed = [{"instructions": e} for e in plan["instructions"]]
            _LOGGER.info(
                f"Picked best plan:\n{tabulate(tabular_data=plan_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
            )

        results = write_and_test_code(
            # Only role and content are forwarded; media is passed separately.
            chat=[{"role": c["role"], "content": c["content"]} for c in chat],
            plan=f"\n{plan['thoughts']}\n-" + "\n-".join(plan["instructions"]),
            tool_info=tool_doc,
            tool_output=tool_output_str,
            plan_thoughts=plan_thoughts_str,
            tool_utils=T.get_utilties_docstring(),
            working_memory=working_memory,
            coder=self.coder,
            tester=self.tester,
            debugger=self.debugger,
            code_interpreter=code_interpreter,
            log_progress=self.log_progress,
            verbosity=self.verbosity,
            media=media_list,
        )
        success = cast(bool, results["success"])
        code = remove_installs_from_code(cast(str, results["code"]))
        test = remove_installs_from_code(cast(str, results["test"]))
        working_memory.extend(results["working_memory"])
        execution_result = cast(Execution, results["test_result"])

        return {
            "status": "completed" if success else "failed",
            "code": DefaultImports.prepend_imports(code),
            "test": test,
            "test_result": execution_result,
            "plans": plan_context.plans,
            "plan_thoughts": plan_thoughts_str,
            "working_memory": working_memory,
        }

    def generate_code_from_plan(
        self,
        chat: List[Message],
        plan_context: PlanContext,
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> Dict[str, Any]:
        """Generates code and other intermediate outputs from a chat input and a plan.
        The plan includes:
            - plans: The plans generated by the planner.
            - best_plan: The best plan selected by the planner.
            - plan_thoughts: The thoughts of the planner, including any modifications
                to the plan.
            - tool_doc: The tool documentation for the best plan.
            - tool_output: The tool output from the tools used by the best plan.

        Parameters:
            chat (List[Message]): A conversation in the format of
                [{"role": "user", "content": "describe your task here..."}].
            plan_context (PlanContext): The context of the plan, including the plans,
                best_plan, plan_thoughts, tool_doc, and tool_output.
            code_interpreter (Optional[CodeInterpreter]): An interpreter to run the
                code in. When given it is used as-is and is neither entered nor
                closed here (the caller owns it). When None, the agent's
                configured interpreter, or a new one, is used inside a ``with``
                block.

        Returns:
            Dict[str, Any]: A dictionary containing the code output by the
                VisionAgentCoder and other intermediate outputs. include:
                - status (str): Whether or not the agent completed or failed generating
                    the code.
                - code (str): The code output by the VisionAgentCoder.
                - test (str): The test output by the VisionAgentCoder.
                - test_result (Execution): The result of the test execution.
                - plans (Dict[str, Any]): The plans generated by the planner.
                - plan_thoughts (str): The thoughts of the planner.
                - working_memory (List[Dict[str, str]]): The working memory of the agent.

        Raises:
            ValueError: If ``chat`` is empty.
        """
        if not chat:
            raise ValueError("Chat cannot be empty.")

        # Honour the caller's interpreter instead of discarding it; this keeps
        # planning and code execution on the same instance when called from
        # generate_code.
        if code_interpreter is not None:
            return self._write_code_for_plan(chat, plan_context, code_interpreter)

        owned_interpreter = self._resolve_code_interpreter()
        with owned_interpreter:
            return self._write_code_for_plan(chat, plan_context, owned_interpreter)

    def generate_code(
        self,
        chat: List[Message],
        test_multi_plan: bool = True,
        custom_tool_names: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Generates code and other intermediate outputs from a chat input.

        Parameters:
            chat (List[Message]): A conversation in the format of
                [{"role": "user", "content": "describe your task here..."}].
            test_multi_plan (bool): Whether to test multiple plans or just the best plan.
            custom_tool_names (Optional[List[str]]): A list of custom tool names to use
                for the planner.

        Returns:
            Dict[str, Any]: A dictionary containing the code output by the
                VisionAgentCoder and other intermediate outputs. include:
                - status (str): Whether or not the agent completed or failed generating
                    the code.
                - code (str): The code output by the VisionAgentCoder.
                - test (str): The test output by the VisionAgentCoder.
                - test_result (Execution): The result of the test execution.
                - plans (Dict[str, Any]): The plans generated by the planner.
                - plan_thoughts (str): The thoughts of the planner.
                - working_memory (List[Dict[str, str]]): The working memory of the agent.

        Raises:
            ValueError: If ``chat`` is empty.
        """
        if not chat:
            raise ValueError("Chat cannot be empty.")

        code_interpreter = self._resolve_code_interpreter()
        with code_interpreter:
            plan_context = self.planner.generate_plan(  # type: ignore
                chat,
                test_multi_plan=test_multi_plan,
                custom_tool_names=custom_tool_names,
                code_interpreter=code_interpreter,
            )

            code_and_context = self.generate_code_from_plan(
                chat,
                plan_context,
                code_interpreter=code_interpreter,
            )
        return code_and_context

    def chat(self, chat: List[Message]) -> List[Message]:
        """Return a copy of ``chat`` with the generated code appended.

        The code is appended as a message with the role "agent"; the caller's
        list is not modified.
        """
        chat = copy.deepcopy(chat)
        code = self.generate_code(chat)
        chat.append({"role": "agent", "content": code["code"]})
        return chat

    def log_progress(self, data: Dict[str, Any]) -> None:
        """Forward ``data`` to the progress callback, if one was configured."""
        if self.report_progress_callback is not None:
            self.report_progress_callback(data)
576
-
577
-
578
class OpenAIVisionAgentCoder(VisionAgentCoder):
    """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing."""

    def __init__(
        self,
        planner: Optional[Agent] = None,
        coder: Optional[LMM] = None,
        tester: Optional[LMM] = None,
        debugger: Optional[LMM] = None,
        verbosity: int = 0,
        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
    ) -> None:
        """Initialize the coder with OpenAI defaults.

        Parameters:
            planner (Optional[Agent]): The planner to use. Defaults to
                OpenAIVisionAgentPlanner.
            coder (Optional[LMM]): The coder model. Defaults to OpenAILMM.
            tester (Optional[LMM]): The tester model. Defaults to OpenAILMM.
            debugger (Optional[LMM]): The debugger model. Defaults to OpenAILMM.
            verbosity (int): The verbosity level, also passed to the default
                planner. Defaults to 0.
            report_progress_callback: Callback that receives progress
                dictionaries.
            code_interpreter (Optional[Union[str, CodeInterpreter]]): Forwarded
                unchanged to VisionAgentCoder.
        """
        # Delegate to the base initializer (as the Ollama and Azure variants
        # do) so attribute setup lives in one place. Every model argument is
        # resolved here, so the base class never builds its Anthropic defaults.
        super().__init__(
            planner=(
                OpenAIVisionAgentPlanner(verbosity=verbosity)
                if planner is None
                else planner
            ),
            coder=OpenAILMM(temperature=0.0) if coder is None else coder,
            tester=OpenAILMM(temperature=0.0) if tester is None else tester,
            debugger=OpenAILMM(temperature=0.0) if debugger is None else debugger,
            verbosity=verbosity,
            report_progress_callback=report_progress_callback,
            code_interpreter=code_interpreter,
        )
605
-
606
-
607
class AnthropicVisionAgentCoder(VisionAgentCoder):
    """Initializes Vision Agent Coder using Anthropic models for planning, coding, testing."""

    def __init__(
        self,
        planner: Optional[Agent] = None,
        coder: Optional[LMM] = None,
        tester: Optional[LMM] = None,
        debugger: Optional[LMM] = None,
        verbosity: int = 0,
        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
    ) -> None:
        """Initialize the coder with Anthropic defaults.

        Parameters:
            planner (Optional[Agent]): The planner to use. Defaults to
                AnthropicVisionAgentPlanner.
            coder (Optional[LMM]): The coder model. Defaults to AnthropicLMM.
            tester (Optional[LMM]): The tester model. Defaults to AnthropicLMM.
            debugger (Optional[LMM]): The debugger model. Defaults to AnthropicLMM.
            verbosity (int): The verbosity level, also passed to the default
                planner. Defaults to 0.
            report_progress_callback: Callback that receives progress
                dictionaries.
            code_interpreter (Optional[Union[str, CodeInterpreter]]): Forwarded
                unchanged to VisionAgentCoder.
        """
        # NOTE: Claude doesn't have an official JSON mode
        # Delegate to the base initializer (as the Ollama and Azure variants
        # do) so attribute setup lives in one place.
        super().__init__(
            planner=(
                AnthropicVisionAgentPlanner(verbosity=verbosity)
                if planner is None
                else planner
            ),
            coder=AnthropicLMM(temperature=0.0) if coder is None else coder,
            tester=AnthropicLMM(temperature=0.0) if tester is None else tester,
            debugger=(
                AnthropicLMM(temperature=0.0) if debugger is None else debugger
            ),
            verbosity=verbosity,
            report_progress_callback=report_progress_callback,
            code_interpreter=code_interpreter,
        )
635
-
636
-
637
class OllamaVisionAgentCoder(VisionAgentCoder):
    """VisionAgentCoder that uses Ollama models for planning, coding, testing.

    Pre-requisites:
    1. Run ollama pull llama3.2-vision for the LMM
    2. Run ollama pull mxbai-embed-large for the embedding similarity model

    Example
    -------
        >>> import vision_agent as va
        >>> agent = va.agent.OllamaVisionAgentCoder()
        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
    """

    def __init__(
        self,
        planner: Optional[Agent] = None,
        coder: Optional[LMM] = None,
        tester: Optional[LMM] = None,
        debugger: Optional[LMM] = None,
        verbosity: int = 0,
        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
    ) -> None:
        """Initialize the coder with Ollama defaults.

        Parameters:
            planner (Optional[Agent]): The planner to use. Defaults to
                OllamaVisionAgentPlanner.
            coder (Optional[LMM]): The coder model. Defaults to OllamaLMM with
                the "llama3.2-vision" model.
            tester (Optional[LMM]): The tester model. Same default as coder.
            debugger (Optional[LMM]): The debugger model. Same default as coder.
            verbosity (int): The verbosity level, also passed to the default
                planner. Defaults to 0.
            report_progress_callback: Callback that receives progress
                dictionaries.
            code_interpreter (Optional[Union[str, CodeInterpreter]]): Forwarded
                unchanged to VisionAgentCoder.
        """
        # Fill in each missing component, in the same order the arguments are
        # declared, then hand everything to the base initializer.
        if planner is None:
            planner = OllamaVisionAgentPlanner(verbosity=verbosity)
        if coder is None:
            coder = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
        if tester is None:
            tester = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
        if debugger is None:
            debugger = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)

        super().__init__(
            planner=planner,
            coder=coder,
            tester=tester,
            debugger=debugger,
            verbosity=verbosity,
            report_progress_callback=report_progress_callback,
            code_interpreter=code_interpreter,
        )
686
-
687
-
688
class AzureVisionAgentCoder(VisionAgentCoder):
    """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing.

    Pre-requisites:
    1. Set the environment variable AZURE_OPENAI_API_KEY to your Azure OpenAI API key.
    2. Set the environment variable AZURE_OPENAI_ENDPOINT to your Azure OpenAI endpoint.

    Example
    -------
        >>> import vision_agent as va
        >>> agent = va.agent.AzureVisionAgentCoder()
        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
    """

    def __init__(
        self,
        planner: Optional[Agent] = None,
        coder: Optional[LMM] = None,
        tester: Optional[LMM] = None,
        debugger: Optional[LMM] = None,
        verbosity: int = 0,
        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
    ) -> None:
        """Initialize the Vision Agent Coder.

        Parameters:
            planner (Optional[Agent]): The planner model to use. Defaults to
                AzureVisionAgentPlanner.
            coder (Optional[LMM]): The coder model to use. Defaults to
                AzureOpenAILMM.
            tester (Optional[LMM]): The tester model to use. Defaults to
                AzureOpenAILMM.
            debugger (Optional[LMM]): The debugger model to use. Defaults to
                AzureOpenAILMM.
            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                highest verbosity level which will output all intermediate debugging
                code.
            report_progress_callback: a callback to report the progress of the agent.
                This is useful for streaming logs in a web application where multiple
                VisionAgentCoder instances are running in parallel. This callback
                ensures that the progress are not mixed up.
            code_interpreter (Optional[Union[str, CodeInterpreter]]): Forwarded
                unchanged to VisionAgentCoder.
        """
        # Fill in each missing component, in the same order the arguments are
        # declared, then hand everything to the base initializer.
        if planner is None:
            planner = AzureVisionAgentPlanner(verbosity=verbosity)
        if coder is None:
            coder = AzureOpenAILMM(temperature=0.0)
        if tester is None:
            tester = AzureOpenAILMM(temperature=0.0)
        if debugger is None:
            debugger = AzureOpenAILMM(temperature=0.0)

        super().__init__(
            planner=planner,
            coder=coder,
            tester=tester,
            debugger=debugger,
            verbosity=verbosity,
            report_progress_callback=report_progress_callback,
            code_interpreter=code_interpreter,
        )