vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -0
- vision_agent/agent/agent_coder.py +196 -0
- vision_agent/agent/agent_coder_prompts.py +135 -0
- vision_agent/agent/vision_agent.py +46 -30
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/agent/vision_agent_v2.py +396 -0
- vision_agent/agent/vision_agent_v2_prompt.py +185 -0
- vision_agent/llm/llm.py +12 -4
- vision_agent/tools/__init__.py +3 -1
- vision_agent/tools/tool_utils.py +30 -0
- vision_agent/tools/tools.py +157 -79
- vision_agent/tools/tools_v2.py +442 -0
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +85 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/METADATA +7 -3
- vision_agent-0.2.22.dist-info/RECORD +34 -0
- vision_agent-0.2.10.dist-info/RECORD +0 -25
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/WHEEL +0 -0
vision_agent/agent/agent_coder.py
ADDED
@@ -0,0 +1,196 @@
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from vision_agent.agent import Agent
+from vision_agent.agent.agent_coder_prompts import (
+    DEBUG,
+    FIX_BUG,
+    PROGRAM,
+    TEST,
+    VISUAL_TEST,
+)
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
+from vision_agent.utils import Execute
+
+IMPORT_HELPER = """
+import math
+import re
+import sys
+import copy
+import datetime
+import itertools
+import collections
+import heapq
+import statistics
+import functools
+import hashlib
+import numpy
+import numpy as np
+import string
+from typing import *
+from collections import *
+from vision_agent.tools.tools_v2 import *
+"""
+logging.basicConfig(stream=sys.stdout)
+_LOGGER = logging.getLogger(__name__)
+_EXECUTE = Execute()
+
+
+def write_tests(question: str, code: str, model: LLM) -> str:
+    prompt = TEST.format(
+        question=question,
+        code=code,
+    )
+    completion = model(prompt)
+    return preprocess_data(completion)
+
+
+def preprocess_data(code: str) -> str:
+    if "```python" in code:
+        code = code[code.find("```python") + len("```python") :]
+        code = code[: code.find("```")]
+    return code
+
+
+def parse_file_name(s: str) -> str:
+    # We only output png files
+    return "".join([p for p in s.split(" ") if p.endswith(".png")])
+
+
+def write_program(question: str, feedback: str, model: LLM) -> str:
+    prompt = PROGRAM.format(
+        docstring=TOOL_DOCSTRING, question=question, feedback=feedback
+    )
+    completion = model(prompt)
+    return preprocess_data(completion)
+
+
+def write_debug(question: str, code: str, feedback: str, model: LLM) -> str:
+    prompt = DEBUG.format(
+        docstring=UTILITIES_DOCSTRING,
+        code=code,
+        question=question,
+        feedback=feedback,
+    )
+    completion = model(prompt)
+    return preprocess_data(completion)
+
+
+def execute_tests(code: str, tests: str) -> Dict[str, Union[str, bool]]:
+    full_code = f"{IMPORT_HELPER}\n{code}\n{tests}"
+    success, result = _EXECUTE.run_isolation(full_code)
+    return {"code": code, "result": result, "passed": success}
+
+
+def run_visual_tests(
+    question: str, code: str, viz_file: str, feedback: str, model: LMM
+) -> Dict[str, Union[str, bool]]:
+    prompt = VISUAL_TEST.format(
+        docstring=TOOL_DOCSTRING,
+        code=code,
+        question=question,
+        feedback=feedback,
+    )
+    completion = model(prompt, images=[viz_file])
+    # type is from the prompt
+    return json.loads(completion)  # type: ignore
+
+
+def fix_bugs(code: str, tests: str, result: str, feedback: str, model: LLM) -> str:
+    # Keyword names must match the placeholders in the FIX_BUG template.
+    prompt = FIX_BUG.format(code=code, tests=tests, result=result, feedback=feedback)
+    completion = model(prompt)
+    return preprocess_data(completion)
+
+
+class AgentCoder(Agent):
+    """AgentCoder is based on the AgentCoder paper https://arxiv.org/abs/2312.13010
+    and its open source code https://github.com/huangd1999/AgentCoder with some key
+    differences. AgentCoder comprises 3 components: a coder agent, a tester agent,
+    and an executor. The tester agent writes code to test the code written by the
+    coder agent, but in our case, because we are solving a vision task, it is
+    difficult to write testing code. We instead have the tester agent write code to
+    visualize the output of the code written by the coder agent. If the code fails,
+    we pass it back to the coder agent to fix the bug; if it succeeds, we pass it to
+    a visual tester agent, which is an LMM model like GPT4V, to visually inspect the
+    output and make sure it looks good."""
+
+    def __init__(
+        self,
+        coder_agent: Optional[LLM] = None,
+        tester_agent: Optional[LLM] = None,
+        visual_tester_agent: Optional[LMM] = None,
+        verbose: bool = False,
+    ) -> None:
+        self.coder_agent = (
+            OpenAILLM(temperature=0.1) if coder_agent is None else coder_agent
+        )
+        self.tester_agent = (
+            OpenAILLM(temperature=0.1) if tester_agent is None else tester_agent
+        )
+        self.visual_tester_agent = (
+            OpenAILMM(temperature=0.1, json_mode=True)
+            if visual_tester_agent is None
+            else visual_tester_agent
+        )
+        self.max_turns = 3
+        if verbose:
+            _LOGGER.setLevel(logging.INFO)
+
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        return self.chat(input, image)
+
+    def chat(
+        self,
+        input: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        question = input[0]["content"]
+        if image:
+            question += f" Input file path: {os.path.abspath(image)}"
+
+        code = ""
+        feedback = ""
+        for _ in range(self.max_turns):
+            code = write_program(question, feedback, self.coder_agent)
+            _LOGGER.info(f"code:\n{code}")
+            debug = write_debug(question, code, feedback, self.tester_agent)
+            _LOGGER.info(f"debug:\n{debug}")
+            results = execute_tests(code, debug)
+            _LOGGER.info(
+                f"execution results: passed: {results['passed']}\n{results['result']}"
+            )
+
+            if not results["passed"]:
+                code = fix_bugs(
+                    code, debug, results["result"].strip(), feedback, self.coder_agent  # type: ignore
+                )
+                _LOGGER.info(f"fixed code:\n{code}")
+            else:
+                # TODO: Sometimes it prints nothing, so we need to handle that case
+                # TODO: The visual agent reflection does not work very well, needs more testing
+                # viz_test_results = run_visual_tests(
+                #     question, code, parse_file_name(results["result"].strip()), feedback, self.visual_tester_agent
+                # )
+                # _LOGGER.info(f"visual test results:\n{viz_test_results}")
+                # if viz_test_results["finished"]:
+                #     return f"{IMPORT_HELPER}\n{code}"
+                # feedback += f"\n{viz_test_results['feedback']}"
+                return f"{IMPORT_HELPER}\n{code}"
+
+        return f"{IMPORT_HELPER}\n{code}"
+
+    def log_progress(self, description: str) -> None:
+        _LOGGER.info(description)
vision_agent/agent/agent_coder_prompts.py
ADDED
@@ -0,0 +1,135 @@
+PROGRAM = """
+**Role**: You are a software programmer.
+
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to worry about defining or importing them and can assume they are available to you.
+{docstring}
+
+**Input Code Snippet**:
+```python
+def execute(image_path: str):
+    # Your code here
+```
+
+**User Instructions**:
+{question}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task.
+2. **Algorithm/Method Selection**: Decide on the most efficient way.
+3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
+4. **Code Generation**: Translate your pseudocode into executable Python code.
+"""
+
+DEBUG = """
+**Role**: You are a software programmer.
+
+**Task**: Your task is to run the `execute` function and either print the output or print a file name containing visualized output for another agent to examine. The other agent will then use your output, either the printed return value of the function or the visualized output as a file, to determine if `execute` is functioning correctly.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to worry about defining or importing them and can assume they are available to you.
+{docstring}
+
+**Input Code Snippet**:
+```python
+### Please decide how you want to generate test cases: based on the incomplete code or the completed version.
+{code}
+```
+
+**User Instructions**:
+{question}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task.
+2. **Code Execution**: Run the `execute` function with the given input from the user instructions.
+3. **Output Generation**: Print the output or save it as a file for visualization utilizing the functions you have access to.
+"""
+
+VISUAL_TEST = """
+**Role**: You are a machine vision expert.
+
+**Task**: Your task is to visually inspect the output of the `execute` function and determine if the visualization of the function output looks correct given the user's instructions. If not, provide suggestions to improve the `execute` function.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to worry about defining or importing them and can assume they are available to you.
+{docstring}
+
+
+**Input Code Snippet**:
+This is the code that produced the output you are inspecting:
+```python
+{code}
+```
+
+**User Instructions**:
+{question}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Visual Inspection**: Examine the visual output of the `execute` function.
+2. **Evaluation**: Determine if the visualization is correct based on the user's instructions.
+3. **Feedback**: Provide feedback on the visualization and suggest improvements if necessary.
+4. **Clear Concrete Instructions**: Provide clear, concrete instructions to improve the results. You can only make coding suggestions based on either the input code snippet or the documented code provided. For example, do not say the threshold needs to be adjusted; instead, provide an exact value for adjusting the threshold.
+
+Provide output in JSON format {{"finished": boolean, "feedback": "your feedback"}} where "finished" is True if the output is correct and False if not, and "feedback" is your feedback.
+"""
+
+FIX_BUG = """
+Please re-complete the code to fix the error message. Here is the previous version:
+```python
+{code}
+```
+
+When we run this code:
+```python
+{tests}
+```
+
+It raises this error:
+```python
+{result}
+```
+
+This is previous feedback provided on the code:
+{feedback}
+
+Please fix the bug by following the error information and only return Python code. You do not need to return the test cases. The re-completed code should be in triple-backtick format (i.e., in ```python ```).
+"""
+
+TEST = """
+**Role**: As a tester, your task is to create comprehensive test cases for the incomplete `execute` function. These test cases should encompass Basic, Edge, and Large Scale scenarios to ensure the code's robustness, reliability, and scalability.
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decide how you want to generate test cases: based on the incomplete code or the completed version.
+{code}
+```
+
+**1. Basic Test Cases**:
+- **Objective**: To verify the fundamental functionality of the `execute` function under normal conditions.
+
+**2. Edge Test Cases**:
+- **Objective**: To evaluate the function's behavior under extreme or unusual conditions.
+
+**3. Large Scale Test Cases**:
+- **Objective**: To assess the function's performance and scalability with large data samples.
+
+**Instructions**:
+- Implement a comprehensive set of test cases following the guidelines above.
+- Ensure each test case is well-documented with comments explaining the scenario it covers.
+- Pay special attention to edge cases as they often reveal hidden bugs.
+- For large-scale tests, focus on the function's efficiency and performance under heavy loads.
+"""
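
These templates are plain `str.format` strings; agent_coder.py fills `{docstring}`, `{question}`, `{code}`, and `{feedback}` before sending the result to the model. A small illustrative sketch (the tool docs and question below are made-up placeholders; in agent_coder.py the real docstring comes from TOOL_DOCSTRING):

from vision_agent.agent.agent_coder_prompts import PROGRAM

# Placeholder documentation; the real value is TOOL_DOCSTRING from tools_v2.
tool_docs = "grounding_dino(prompt, image): detects objects matching the prompt ..."
prompt = PROGRAM.format(
    docstring=tool_docs,
    question="Count the cars in street.jpg",
    feedback="",  # empty on the first turn; later turns carry execution feedback
)
# `prompt` is the fully rendered instruction handed to the coder LLM.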
vision_agent/agent/vision_agent.py
CHANGED
@@ -8,18 +8,8 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from PIL import Image
 from tabulate import tabulate
 
-from vision_agent.image_utils import (
-    convert_to_b64,
-    overlay_bboxes,
-    overlay_heat_map,
-    overlay_masks,
-)
-from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools import TOOLS
-
-from .agent import Agent
-from .easytool_prompts import (
+from vision_agent.agent.agent import Agent
+from vision_agent.agent.easytool_prompts import (
     ANSWER_GENERATE,
     ANSWER_SUMMARIZE,
     CHOOSE_PARAMETER,
@@ -27,7 +17,7 @@ from .easytool_prompts import (
     TASK_DECOMPOSE,
     TASK_TOPOLOGY,
 )
-from .vision_agent_prompts import (
+from vision_agent.agent.vision_agent_prompts import (
     ANSWER_GENERATE_DEPENDS,
     ANSWER_SUMMARIZE_DEPENDS,
     CHOOSE_PARAMETER_DEPENDS,
@@ -35,6 +25,15 @@ from .vision_agent_prompts import (
     TASK_DECOMPOSE_DEPENDS,
     VISION_AGENT_REFLECTION,
 )
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.tools import TOOLS
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    overlay_bboxes,
+    overlay_heat_map,
+    overlay_masks,
+)
 
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
@@ -309,7 +308,7 @@ def _handle_extract_frames(
     # any following processing
     for video_file_output in tool_result["call_results"]:
         # When the video tool is run with wrong parameters, exit the loop
-        if len(video_file_output) < 2:
+        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
             break
         for frame, _ in video_file_output:
             image = frame
@@ -465,15 +464,17 @@ class VisionAgent(Agent):
             report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
         """
         self.task_model = (
-            OpenAILLM(json_mode=True, temperature=0.
+            OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if task_model is None
             else task_model
         )
         self.answer_model = (
-            OpenAILLM(temperature=0.
+            OpenAILLM(model_name="gpt-4-turbo", temperature=0.0)
+            if answer_model is None
+            else answer_model
         )
         self.reflect_model = (
-            OpenAILMM(json_mode=True, temperature=0.
+            OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if reflect_model is None
             else reflect_model
         )
@@ -489,6 +490,7 @@ class VisionAgent(Agent):
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        self_reflection: Optional[bool] = True,
     ) -> str:
         """Invoke the vision agent.
 
@@ -501,6 +503,7 @@ class VisionAgent(Agent):
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
+            self_reflection: boolean to enable and disable self reflection.
 
         Returns:
             The result of the vision agent in text.
@@ -512,6 +515,7 @@ class VisionAgent(Agent):
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
+            self_reflection=self_reflection,
         )
 
     def log_progress(self, description: str) -> None:
@@ -538,6 +542,7 @@ class VisionAgent(Agent):
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        self_reflection: Optional[bool] = True,
     ) -> Tuple[str, List[Dict]]:
         """Chat with the vision agent and return the final answer and all tool results.
 
@@ -550,12 +555,16 @@ class VisionAgent(Agent):
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
+            self_reflection: boolean to enable and disable self reflection.
 
         Returns:
             A tuple where the first item is the final answer and the second item is a
             list of all the tool results. The last item in the tool results also
             contains the visualized output.
         """
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
@@ -625,20 +634,25 @@ class VisionAgent(Agent):
                 reflection_images = [image]
             else:
                 reflection_images = None
-            reflection = self_reflect(
-                self.reflect_model,
-                question,
-                self.tools,
-                all_tool_results,
-                final_answer,
-                reflection_images,
-            )
-            self.log_progress(f"Reflection: {reflection}")
-            parsed_reflection = parse_reflect(reflection)
-            if parsed_reflection["Finish"]:
-                break
+
+            if self_reflection:
+                reflection = self_reflect(
+                    self.reflect_model,
+                    question,
+                    self.tools,
+                    all_tool_results,
+                    final_answer,
+                    reflection_images,
+                )
+                self.log_progress(f"Reflection: {reflection}")
+                parsed_reflection = parse_reflect(reflection)
+                if parsed_reflection["Finish"]:
+                    break
+                else:
+                    reflections += "\n" + parsed_reflection["Reflection"]
             else:
-                reflections += "\n" + parsed_reflection["Reflection"]
+                self.log_progress("Self Reflection skipped based on user request.")
+                break
         # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
             f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
@@ -660,12 +674,14 @@ class VisionAgent(Agent):
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        self_reflection: Optional[bool] = True,
     ) -> str:
         answer, _ = self.chat_with_workflow(
             chat,
            image=image,
            visualize_output=visualize_output,
            reference_data=reference_data,
+           self_reflection=self_reflection,
         )
         return answer
 
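Taken together, these hunks pin the default task, answer, and reflect models to gpt-4-turbo at temperature 0.0, reject empty chat input, and thread a new `self_reflection` flag through `__call__`, `chat`, and `chat_with_workflow`. A short usage sketch of the flag (the image path is a placeholder):

from vision_agent.agent import VisionAgent

agent = VisionAgent()

# With self_reflection=False the reflection loop introduced above is skipped
# and the agent returns its first answer.
answer = agent(
    "How many cars are in this image?",
    image="street.jpg",
    self_reflection=False,
)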
vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -70,7 +70,7 @@ These are the tools you can select to solve the question:
 
 Please note that:
 1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
-2. You should only choose the tool whose parameters are most relevant to the user's question and are
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
 3. You should choose the tool whose return type is most relevant to the answer of the user's question.
 4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 
@@ -88,7 +88,7 @@ This is a reflection from a previous failed attempt:
 
 Please note that:
 1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
-2. You should only choose the tool whose parameters are most relevant to the user's question and are
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
 3. You should choose the tool whose return type is most relevant to the answer of the user's question.
 4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 
@@ -100,7 +100,7 @@ Output: """
 CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
 1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
-2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
 5. If you need to use this API multiple times, please set "Parameters" to a list.