vision-agent 0.2.29__py3-none-any.whl → 0.2.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -2
- vision_agent/agent/agent.py +2 -2
- vision_agent/agent/agent_coder.py +8 -8
- vision_agent/agent/{vision_agent_v2.py → data_interpreter.py} +12 -12
- vision_agent/agent/{vision_agent_v2_prompts.py → data_interpreter_prompts.py} +3 -3
- vision_agent/agent/easytool.py +8 -8
- vision_agent/agent/easytool_v2.py +778 -0
- vision_agent/agent/easytool_v2_prompts.py +152 -0
- vision_agent/agent/reflexion.py +8 -8
- vision_agent/agent/vision_agent.py +360 -691
- vision_agent/agent/vision_agent_prompts.py +231 -149
- vision_agent/llm/llm.py +3 -4
- vision_agent/lmm/lmm.py +6 -6
- vision_agent/tools/__init__.py +21 -22
- vision_agent/tools/easytool_tools.py +1242 -0
- vision_agent/tools/tools.py +533 -1090
- vision_agent-0.2.31.dist-info/METADATA +175 -0
- vision_agent-0.2.31.dist-info/RECORD +36 -0
- vision_agent/agent/vision_agent_v3.py +0 -386
- vision_agent/agent/vision_agent_v3_prompts.py +0 -226
- vision_agent/tools/tools_v2.py +0 -685
- vision_agent-0.2.29.dist-info/METADATA +0 -226
- vision_agent-0.2.29.dist-info/RECORD +0 -36
- {vision_agent-0.2.29.dist-info → vision_agent-0.2.31.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.29.dist-info → vision_agent-0.2.31.dist-info}/WHEEL +0 -0
@@ -0,0 +1,175 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: vision-agent
|
3
|
+
Version: 0.2.31
|
4
|
+
Summary: Toolset for Vision Agent
|
5
|
+
Author: Landing AI
|
6
|
+
Author-email: dev@landing.ai
|
7
|
+
Requires-Python: >=3.9,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
12
|
+
Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
|
13
|
+
Requires-Dist: langsmith (>=0.1.58,<0.2.0)
|
14
|
+
Requires-Dist: moviepy (>=1.0.0,<2.0.0)
|
15
|
+
Requires-Dist: nbclient (>=0.10.0,<0.11.0)
|
16
|
+
Requires-Dist: nbformat (>=5.10.4,<6.0.0)
|
17
|
+
Requires-Dist: numpy (>=1.21.0,<2.0.0)
|
18
|
+
Requires-Dist: openai (>=1.0.0,<2.0.0)
|
19
|
+
Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
|
20
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
21
|
+
Requires-Dist: pillow (>=10.0.0,<11.0.0)
|
22
|
+
Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
23
|
+
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
24
|
+
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
25
|
+
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
26
|
+
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
27
|
+
Requires-Dist: tqdm (>=4.64.0,<5.0.0)
|
28
|
+
Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
|
29
|
+
Project-URL: Homepage, https://landing.ai
|
30
|
+
Project-URL: documentation, https://github.com/landing-ai/vision-agent
|
31
|
+
Project-URL: repository, https://github.com/landing-ai/vision-agent
|
32
|
+
Description-Content-Type: text/markdown
|
33
|
+
|
34
|
+
<div align="center">
|
35
|
+
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
|
36
|
+
|
37
|
+
# 🔍🤖 Vision Agent
|
38
|
+
|
39
|
+
[](https://discord.gg/wPdN8RCYew)
|
40
|
+

|
41
|
+
[](https://badge.fury.io/py/vision-agent)
|
42
|
+

|
43
|
+
</div>
|
44
|
+
|
45
|
+
Vision Agent is a library that helps you utilize agent frameworks to generate code to
|
46
|
+
solve your vision task. Many current vision problems can easily take hours or days to
|
47
|
+
solve; you need to find the right model, figure out how to use it and program it to
|
48
|
+
accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
|
49
|
+
allowing users to describe their problem in text and have the agent framework generate
|
50
|
+
code to solve the task for them. Check out our discord for updates and roadmaps!
|
51
|
+
|
52
|
+
## Documentation
|
53
|
+
|
54
|
+
- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
55
|
+
|
56
|
+
|
57
|
+
## Getting Started
|
58
|
+
### Installation
|
59
|
+
To get started, you can install the library using pip:
|
60
|
+
|
61
|
+
```bash
|
62
|
+
pip install vision-agent
|
63
|
+
```
|
64
|
+
|
65
|
+
Ensure you have an OpenAI API key and set it as an environment variable (if you are
|
66
|
+
using Azure OpenAI please see the Azure setup section):
|
67
|
+
|
68
|
+
```bash
|
69
|
+
export OPENAI_API_KEY="your-api-key"
|
70
|
+
```
|
71
|
+
|
72
|
+
### Vision Agent
|
73
|
+
You can interact with the agent as you would with any LLM or LMM model:
|
74
|
+
|
75
|
+
```python
|
76
|
+
>>> from vision_agent.agent import VisionAgent
|
77
|
+
>>> agent = VisionAgent()
|
78
|
+
>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
|
79
|
+
```
|
80
|
+
|
81
|
+
Which produces the following code:
|
82
|
+
```python
|
83
|
+
from vision_agent.tools import load_image, grounding_sam
|
84
|
+
|
85
|
+
def calculate_filled_percentage(image_path: str) -> float:
|
86
|
+
# Step 1: Load the image
|
87
|
+
image = load_image(image_path)
|
88
|
+
|
89
|
+
# Step 2: Segment the jar
|
90
|
+
jar_segments = grounding_sam(prompt="jar", image=image)
|
91
|
+
|
92
|
+
# Step 3: Segment the coffee beans
|
93
|
+
coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
|
94
|
+
|
95
|
+
# Step 4: Calculate the area of the segmented jar
|
96
|
+
jar_area = 0
|
97
|
+
for segment in jar_segments:
|
98
|
+
jar_area += segment['mask'].sum()
|
99
|
+
|
100
|
+
# Step 5: Calculate the area of the segmented coffee beans
|
101
|
+
coffee_beans_area = 0
|
102
|
+
for segment in coffee_beans_segments:
|
103
|
+
coffee_beans_area += segment['mask'].sum()
|
104
|
+
|
105
|
+
# Step 6: Compute the percentage of the jar area that is filled with coffee beans
|
106
|
+
if jar_area == 0:
|
107
|
+
return 0.0 # To avoid division by zero
|
108
|
+
filled_percentage = (coffee_beans_area / jar_area) * 100
|
109
|
+
|
110
|
+
# Step 7: Return the computed percentage
|
111
|
+
return filled_percentage
|
112
|
+
```
|
113
|
+
|
114
|
+
To better understand how the model came up with its answer, you can run it in debug
|
115
|
+
mode by passing in the verbose argument:
|
116
|
+
|
117
|
+
```python
|
118
|
+
>>> agent = VisionAgent(verbose=2)
|
119
|
+
```
|
120
|
+
|
121
|
+
You can also have it return more information by calling `chat_with_workflow`:
|
122
|
+
|
123
|
+
```python
|
124
|
+
>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
|
125
|
+
>>> print(results)
|
126
|
+
{
|
127
|
+
"code": "from vision_agent.tools import ..."
|
128
|
+
"test": "calculate_filled_percentage('jar.jpg')",
|
129
|
+
"test_result": "...",
|
130
|
+
"plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
|
131
|
+
"working_memory": ...,
|
132
|
+
}
|
133
|
+
```
|
134
|
+
|
135
|
+
With this you can examine more detailed information such as the testing code, testing
|
136
|
+
results, plan or working memory it used to complete the task.
|
137
|
+
|
138
|
+
### Tools
|
139
|
+
There are a variety of tools for the model or the user to use. Some are executed locally
|
140
|
+
while others are hosted for you. You can also ask an LLM directly to build a tool for
|
141
|
+
you. For example:
|
142
|
+
|
143
|
+
```python
|
144
|
+
>>> import vision_agent as va
|
145
|
+
>>> llm = va.llm.OpenAILLM()
|
146
|
+
>>> detector = llm.generate_detector("Can you build a jar detector for me?")
|
147
|
+
>>> detector("jar.jpg")
|
148
|
+
[{"labels": ["jar",],
|
149
|
+
"scores": [0.99],
|
150
|
+
"bboxes": [
|
151
|
+
[0.58, 0.2, 0.72, 0.45],
|
152
|
+
]
|
153
|
+
}]
|
154
|
+
```
|
155
|
+
|
156
|
+
### Azure Setup
|
157
|
+
If you want to use Azure OpenAI models, you can set the environment variable:
|
158
|
+
|
159
|
+
```bash
|
160
|
+
export AZURE_OPENAI_API_KEY="your-api-key"
|
161
|
+
export AZURE_OPENAI_ENDPOINT="your-endpoint"
|
162
|
+
```
|
163
|
+
|
164
|
+
You can then run Vision Agent using the Azure OpenAI models:
|
165
|
+
|
166
|
+
```python
|
167
|
+
>>> import vision_agent as va
|
168
|
+
>>> agent = va.agent.VisionAgent(
|
169
|
+
>>> task_model=va.llm.AzureOpenAILLM(),
|
170
|
+
>>> answer_model=va.lmm.AzureOpenAILMM(),
|
171
|
+
>>> reflection_model=va.lmm.AzureOpenAILMM(),
|
172
|
+
>>> )
|
173
|
+
```
|
174
|
+
|
175
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
|
2
|
+
vision_agent/agent/__init__.py,sha256=iiC5eknTQnv87iSwAoHqBthJ3g2Zm6D0dWbYPDfuQ7A,245
|
3
|
+
vision_agent/agent/agent.py,sha256=TXh93MOwmArNRieOkYrhliq1rf7wIkhxvCdTiGhTqFs,538
|
4
|
+
vision_agent/agent/agent_coder.py,sha256=MQw8SPeNy1D9tUvB-u60H9ab1eLXnrpV0Ggn7Eq_mIo,6988
|
5
|
+
vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
|
6
|
+
vision_agent/agent/data_interpreter.py,sha256=YCREEHWiyTYpKT8hibotylEkx1kF5AH0k9wnmymwPBY,15143
|
7
|
+
vision_agent/agent/data_interpreter_prompts.py,sha256=RDJggOfXwGaEoIcTYGX41ZEayCgYei1AootDOc_SN2g,6134
|
8
|
+
vision_agent/agent/easytool.py,sha256=wMa9-tpAaiC4E2ONbidxmMM9YvAOw4_Sypf5mGKco_w,11526
|
9
|
+
vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
|
10
|
+
vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFIypCA,27274
|
11
|
+
vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
|
+
vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
|
13
|
+
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
14
|
+
vision_agent/agent/vision_agent.py,sha256=5Bfxif2sqRKS1ZUlQ4yT468EfevI9CQ6V7_Y6xRbbq0,14992
|
15
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=s6T5UnyrKIAcaKqcMudWQOBCHt6Obn9QpX3QtqiDv2I,8034
|
16
|
+
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
18
|
+
vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
|
19
|
+
vision_agent/llm/llm.py,sha256=UZ73GqQHE-NKOJWsrOTWfmdHYsbCBkJ5rZ7dhcSCHHw,5951
|
20
|
+
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
21
|
+
vision_agent/lmm/lmm.py,sha256=NwcZYLTzi95LSMAk0sTtw7G_zBLa9lU-DHM5GUUCiK4,10622
|
22
|
+
vision_agent/tools/__init__.py,sha256=1kyJy4euA8t73_ALhKZIUOjVb2A1IyYztu-MJJJ0TYI,505
|
23
|
+
vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
|
24
|
+
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
25
|
+
vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
|
26
|
+
vision_agent/tools/tools.py,sha256=mio0A1l5QcyRC5IgaD4Trfqg7hFTZ8rOjx1dYivwb4Q,21585
|
27
|
+
vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
|
28
|
+
vision_agent/utils/execute.py,sha256=8_SfK-IkHH4lXF0JVyV7sDFszZn9HKsh1bFITKGCJ1g,3881
|
29
|
+
vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
|
30
|
+
vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
|
31
|
+
vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
|
32
|
+
vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
|
33
|
+
vision_agent-0.2.31.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
34
|
+
vision_agent-0.2.31.dist-info/METADATA,sha256=tsCUD6WuSXUt5XLCmOD89DMzDTAxyrCPiA0cAES85AI,5942
|
35
|
+
vision_agent-0.2.31.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
36
|
+
vision_agent-0.2.31.dist-info/RECORD,,
|
@@ -1,386 +0,0 @@
|
|
1
|
-
import copy
|
2
|
-
import json
|
3
|
-
import logging
|
4
|
-
import sys
|
5
|
-
from pathlib import Path
|
6
|
-
from typing import Any, Dict, List, Optional, Union, cast, Callable, no_type_check
|
7
|
-
|
8
|
-
from rich.console import Console
|
9
|
-
from rich.syntax import Syntax
|
10
|
-
from tabulate import tabulate
|
11
|
-
|
12
|
-
from vision_agent.agent import Agent
|
13
|
-
from vision_agent.agent.vision_agent_v3_prompts import (
|
14
|
-
CODE,
|
15
|
-
FEEDBACK,
|
16
|
-
FIX_BUG,
|
17
|
-
PLAN,
|
18
|
-
REFLECT,
|
19
|
-
SIMPLE_TEST,
|
20
|
-
USER_REQ,
|
21
|
-
)
|
22
|
-
from vision_agent.llm import LLM, OpenAILLM
|
23
|
-
from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING
|
24
|
-
from vision_agent.utils import Execute
|
25
|
-
from vision_agent.utils.sim import Sim
|
26
|
-
|
27
|
-
logging.basicConfig(stream=sys.stdout)
|
28
|
-
_LOGGER = logging.getLogger(__name__)
|
29
|
-
_MAX_TABULATE_COL_WIDTH = 80
|
30
|
-
_EXECUTE = Execute(600)
|
31
|
-
_CONSOLE = Console()
|
32
|
-
|
33
|
-
|
34
|
-
def format_memory(memory: List[Dict[str, str]]) -> str:
|
35
|
-
return FEEDBACK.format(
|
36
|
-
feedback="\n".join(
|
37
|
-
[
|
38
|
-
f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
|
39
|
-
for i, m in enumerate(memory)
|
40
|
-
]
|
41
|
-
)
|
42
|
-
)
|
43
|
-
|
44
|
-
|
45
|
-
def extract_code(code: str) -> str:
|
46
|
-
if "\n```python" in code:
|
47
|
-
start = "\n```python"
|
48
|
-
elif "```python" in code:
|
49
|
-
start = "```python"
|
50
|
-
else:
|
51
|
-
return code
|
52
|
-
|
53
|
-
code = code[code.find(start) + len(start) :]
|
54
|
-
code = code[: code.find("```")]
|
55
|
-
if code.startswith("python\n"):
|
56
|
-
code = code[len("python\n") :]
|
57
|
-
return code
|
58
|
-
|
59
|
-
|
60
|
-
def extract_json(json_str: str) -> Dict[str, Any]:
|
61
|
-
try:
|
62
|
-
json_dict = json.loads(json_str)
|
63
|
-
except json.JSONDecodeError:
|
64
|
-
if "```json" in json_str:
|
65
|
-
json_str = json_str[json_str.find("```json") + len("```json") :]
|
66
|
-
json_str = json_str[: json_str.find("```")]
|
67
|
-
elif "```" in json_str:
|
68
|
-
json_str = json_str[json_str.find("```") + len("```") :]
|
69
|
-
# get the last ``` not one from an intermediate string
|
70
|
-
json_str = json_str[: json_str.find("}```")]
|
71
|
-
json_dict = json.loads(json_str)
|
72
|
-
return json_dict # type: ignore
|
73
|
-
|
74
|
-
|
75
|
-
def write_plan(
|
76
|
-
chat: List[Dict[str, str]],
|
77
|
-
tool_desc: str,
|
78
|
-
working_memory: str,
|
79
|
-
model: LLM,
|
80
|
-
) -> List[Dict[str, str]]:
|
81
|
-
chat = copy.deepcopy(chat)
|
82
|
-
if chat[-1]["role"] != "user":
|
83
|
-
raise ValueError("Last chat message must be from the user.")
|
84
|
-
|
85
|
-
user_request = chat[-1]["content"]
|
86
|
-
context = USER_REQ.format(user_request=user_request)
|
87
|
-
prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
|
88
|
-
chat[-1]["content"] = prompt
|
89
|
-
return extract_json(model.chat(chat))["plan"] # type: ignore
|
90
|
-
|
91
|
-
|
92
|
-
def reflect(
|
93
|
-
chat: List[Dict[str, str]],
|
94
|
-
plan: str,
|
95
|
-
code: str,
|
96
|
-
model: LLM,
|
97
|
-
) -> Dict[str, Union[str, bool]]:
|
98
|
-
chat = copy.deepcopy(chat)
|
99
|
-
if chat[-1]["role"] != "user":
|
100
|
-
raise ValueError("Last chat message must be from the user.")
|
101
|
-
|
102
|
-
user_request = chat[-1]["content"]
|
103
|
-
context = USER_REQ.format(user_request=user_request)
|
104
|
-
prompt = REFLECT.format(context=context, plan=plan, code=code)
|
105
|
-
chat[-1]["content"] = prompt
|
106
|
-
return extract_json(model.chat(chat))
|
107
|
-
|
108
|
-
|
109
|
-
def write_and_test_code(
|
110
|
-
task: str,
|
111
|
-
tool_info: str,
|
112
|
-
tool_utils: str,
|
113
|
-
working_memory: str,
|
114
|
-
coder: LLM,
|
115
|
-
tester: LLM,
|
116
|
-
debugger: LLM,
|
117
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
118
|
-
verbosity: int = 0,
|
119
|
-
max_retries: int = 3,
|
120
|
-
input_media: Optional[Union[str, Path]] = None,
|
121
|
-
) -> Dict[str, Any]:
|
122
|
-
code = extract_code(
|
123
|
-
coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
|
124
|
-
)
|
125
|
-
test = extract_code(
|
126
|
-
tester(
|
127
|
-
SIMPLE_TEST.format(
|
128
|
-
docstring=tool_utils,
|
129
|
-
question=task,
|
130
|
-
code=code,
|
131
|
-
feedback=working_memory,
|
132
|
-
media=input_media,
|
133
|
-
)
|
134
|
-
)
|
135
|
-
)
|
136
|
-
|
137
|
-
success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
|
138
|
-
if verbosity == 2:
|
139
|
-
_LOGGER.info("Initial code and tests:")
|
140
|
-
log_progress(
|
141
|
-
{
|
142
|
-
"log": "Code:",
|
143
|
-
"code": code,
|
144
|
-
}
|
145
|
-
)
|
146
|
-
log_progress(
|
147
|
-
{
|
148
|
-
"log": "Test:",
|
149
|
-
"code": test,
|
150
|
-
}
|
151
|
-
)
|
152
|
-
_CONSOLE.print(
|
153
|
-
Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
|
154
|
-
)
|
155
|
-
log_progress(
|
156
|
-
{
|
157
|
-
"log": "Result:",
|
158
|
-
"result": result,
|
159
|
-
}
|
160
|
-
)
|
161
|
-
_LOGGER.info(f"Initial result: {result}")
|
162
|
-
|
163
|
-
count = 0
|
164
|
-
new_working_memory = []
|
165
|
-
while not success and count < max_retries:
|
166
|
-
fixed_code_and_test = extract_json(
|
167
|
-
debugger(
|
168
|
-
FIX_BUG.format(
|
169
|
-
code=code, tests=test, result=result, feedback=working_memory
|
170
|
-
)
|
171
|
-
)
|
172
|
-
)
|
173
|
-
if fixed_code_and_test["code"].strip() != "":
|
174
|
-
code = extract_code(fixed_code_and_test["code"])
|
175
|
-
if fixed_code_and_test["test"].strip() != "":
|
176
|
-
test = extract_code(fixed_code_and_test["test"])
|
177
|
-
new_working_memory.append(
|
178
|
-
{"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
|
179
|
-
)
|
180
|
-
|
181
|
-
success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
|
182
|
-
if verbosity == 2:
|
183
|
-
log_progress(
|
184
|
-
{
|
185
|
-
"log": f"Debug attempt {count + 1}, reflection:",
|
186
|
-
"result": fixed_code_and_test["reflections"],
|
187
|
-
}
|
188
|
-
)
|
189
|
-
_LOGGER.info(
|
190
|
-
f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}"
|
191
|
-
)
|
192
|
-
_CONSOLE.print(
|
193
|
-
Syntax(
|
194
|
-
f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True
|
195
|
-
)
|
196
|
-
)
|
197
|
-
log_progress(
|
198
|
-
{
|
199
|
-
"log": "Debug result:",
|
200
|
-
"result": result,
|
201
|
-
}
|
202
|
-
)
|
203
|
-
_LOGGER.info(f"Debug result: {result}")
|
204
|
-
count += 1
|
205
|
-
|
206
|
-
if verbosity >= 1:
|
207
|
-
_LOGGER.info("Final code and tests:")
|
208
|
-
_CONSOLE.print(
|
209
|
-
Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
|
210
|
-
)
|
211
|
-
_LOGGER.info(f"Final Result: {result}")
|
212
|
-
|
213
|
-
return {
|
214
|
-
"code": code,
|
215
|
-
"test": test,
|
216
|
-
"success": success,
|
217
|
-
"test_result": result,
|
218
|
-
"working_memory": new_working_memory,
|
219
|
-
}
|
220
|
-
|
221
|
-
|
222
|
-
def retrieve_tools(
|
223
|
-
plan: List[Dict[str, str]],
|
224
|
-
tool_recommender: Sim,
|
225
|
-
log_progress: Callable[[Dict[str, Any]], None],
|
226
|
-
verbosity: int = 0,
|
227
|
-
) -> str:
|
228
|
-
tool_info = []
|
229
|
-
tool_desc = []
|
230
|
-
for task in plan:
|
231
|
-
tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
|
232
|
-
tool_info.extend([e["doc"] for e in tools])
|
233
|
-
tool_desc.extend([e["desc"] for e in tools])
|
234
|
-
if verbosity == 2:
|
235
|
-
log_progress(
|
236
|
-
{
|
237
|
-
"log": "Retrieved tools:",
|
238
|
-
"tools": tool_desc,
|
239
|
-
}
|
240
|
-
)
|
241
|
-
_LOGGER.info(f"Tools: {tool_desc}")
|
242
|
-
tool_info_set = set(tool_info)
|
243
|
-
return "\n\n".join(tool_info_set)
|
244
|
-
|
245
|
-
|
246
|
-
class VisionAgentV3(Agent):
|
247
|
-
def __init__(
|
248
|
-
self,
|
249
|
-
timeout: int = 600,
|
250
|
-
planner: Optional[LLM] = None,
|
251
|
-
coder: Optional[LLM] = None,
|
252
|
-
tester: Optional[LLM] = None,
|
253
|
-
debugger: Optional[LLM] = None,
|
254
|
-
tool_recommender: Optional[Sim] = None,
|
255
|
-
verbosity: int = 0,
|
256
|
-
report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
|
257
|
-
) -> None:
|
258
|
-
self.planner = (
|
259
|
-
OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
|
260
|
-
)
|
261
|
-
self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
|
262
|
-
self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
|
263
|
-
self.debugger = (
|
264
|
-
OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
|
265
|
-
)
|
266
|
-
|
267
|
-
self.tool_recommender = (
|
268
|
-
Sim(TOOLS_DF, sim_key="desc")
|
269
|
-
if tool_recommender is None
|
270
|
-
else tool_recommender
|
271
|
-
)
|
272
|
-
self.verbosity = verbosity
|
273
|
-
self.max_retries = 2
|
274
|
-
self.report_progress_callback = report_progress_callback
|
275
|
-
|
276
|
-
@no_type_check
|
277
|
-
def __call__(
|
278
|
-
self,
|
279
|
-
input: Union[List[Dict[str, str]], str],
|
280
|
-
image: Optional[Union[str, Path]] = None,
|
281
|
-
) -> Dict[str, Any]:
|
282
|
-
if isinstance(input, str):
|
283
|
-
input = [{"role": "user", "content": input}]
|
284
|
-
results = self.chat_with_workflow(input, image)
|
285
|
-
results.pop("working_memory")
|
286
|
-
return results
|
287
|
-
|
288
|
-
def chat_with_workflow(
|
289
|
-
self,
|
290
|
-
chat: List[Dict[str, str]],
|
291
|
-
image: Optional[Union[str, Path]] = None,
|
292
|
-
self_reflection: bool = False,
|
293
|
-
) -> Dict[str, Any]:
|
294
|
-
if len(chat) == 0:
|
295
|
-
raise ValueError("Chat cannot be empty.")
|
296
|
-
|
297
|
-
if image is not None:
|
298
|
-
for chat_i in chat:
|
299
|
-
if chat_i["role"] == "user":
|
300
|
-
chat_i["content"] += f" Image name {image}"
|
301
|
-
|
302
|
-
code = ""
|
303
|
-
test = ""
|
304
|
-
working_memory: List[Dict[str, str]] = []
|
305
|
-
results = {"code": "", "test": "", "plan": []}
|
306
|
-
plan = []
|
307
|
-
success = False
|
308
|
-
retries = 0
|
309
|
-
|
310
|
-
while not success and retries < self.max_retries:
|
311
|
-
plan_i = write_plan(
|
312
|
-
chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
|
313
|
-
)
|
314
|
-
plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
|
315
|
-
if self.verbosity >= 1:
|
316
|
-
self.log_progress(
|
317
|
-
{
|
318
|
-
"log": "Going to run the following plan(s) in sequence:\n",
|
319
|
-
"plan": plan_i,
|
320
|
-
}
|
321
|
-
)
|
322
|
-
|
323
|
-
_LOGGER.info(
|
324
|
-
f"""
|
325
|
-
{tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
|
326
|
-
)
|
327
|
-
|
328
|
-
tool_info = retrieve_tools(
|
329
|
-
plan_i,
|
330
|
-
self.tool_recommender,
|
331
|
-
self.log_progress,
|
332
|
-
self.verbosity,
|
333
|
-
)
|
334
|
-
results = write_and_test_code(
|
335
|
-
plan_i_str,
|
336
|
-
tool_info,
|
337
|
-
UTILITIES_DOCSTRING,
|
338
|
-
format_memory(working_memory),
|
339
|
-
self.coder,
|
340
|
-
self.tester,
|
341
|
-
self.debugger,
|
342
|
-
self.log_progress,
|
343
|
-
verbosity=self.verbosity,
|
344
|
-
input_media=image,
|
345
|
-
)
|
346
|
-
success = cast(bool, results["success"])
|
347
|
-
code = cast(str, results["code"])
|
348
|
-
test = cast(str, results["test"])
|
349
|
-
working_memory.extend(results["working_memory"]) # type: ignore
|
350
|
-
plan.append({"code": code, "test": test, "plan": plan_i})
|
351
|
-
|
352
|
-
if self_reflection:
|
353
|
-
reflection = reflect(chat, plan_i_str, code, self.planner)
|
354
|
-
if self.verbosity > 0:
|
355
|
-
self.log_progress(
|
356
|
-
{
|
357
|
-
"log": "Reflection:",
|
358
|
-
"reflection": reflection,
|
359
|
-
}
|
360
|
-
)
|
361
|
-
_LOGGER.info(f"Reflection: {reflection}")
|
362
|
-
feedback = cast(str, reflection["feedback"])
|
363
|
-
success = cast(bool, reflection["success"])
|
364
|
-
working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
|
365
|
-
|
366
|
-
retries += 1
|
367
|
-
|
368
|
-
self.log_progress(
|
369
|
-
{
|
370
|
-
"log": f"The Vision Agent V3 has concluded this chat.\nSuccess: {success}",
|
371
|
-
"finished": True,
|
372
|
-
}
|
373
|
-
)
|
374
|
-
|
375
|
-
return {
|
376
|
-
"code": code,
|
377
|
-
"test": test,
|
378
|
-
"test_result": results["test_result"],
|
379
|
-
"plan": plan,
|
380
|
-
"working_memory": working_memory,
|
381
|
-
}
|
382
|
-
|
383
|
-
def log_progress(self, data: Dict[str, Any]) -> None:
|
384
|
-
if self.report_progress_callback is not None:
|
385
|
-
self.report_progress_callback(data)
|
386
|
-
pass
|