vision-agent 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +30 -4
- vision_agent/agent/vision_agent_prompts.py +5 -2
- vision_agent/tools/tools.py +4 -3
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/METADATA +1 -1
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/RECORD +7 -7
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/WHEEL +0 -0
@@ -2,9 +2,11 @@ import copy
|
|
2
2
|
import json
|
3
3
|
import logging
|
4
4
|
import sys
|
5
|
+
import tempfile
|
5
6
|
from pathlib import Path
|
6
|
-
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
7
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
7
8
|
|
9
|
+
from PIL import Image
|
8
10
|
from rich.console import Console
|
9
11
|
from rich.style import Style
|
10
12
|
from rich.syntax import Syntax
|
@@ -78,12 +80,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
78
80
|
return json_dict # type: ignore
|
79
81
|
|
80
82
|
|
83
|
+
def extract_image(
    media: Optional[Sequence[Union[str, Path]]]
) -> Optional[Sequence[Union[str, Path]]]:
    """Reduce a list of media paths to a list of still-image paths.

    Image files are passed through unchanged (as ``Path`` objects). For video
    files, the first extracted frame is written to a temporary PNG and that
    PNG's path is used instead. Entries with any other extension are dropped.

    Parameters:
        media (Optional[Sequence[Union[str, Path]]]): Paths to image or video
            files, or None.

    Returns:
        Optional[Sequence[Union[str, Path]]]: The image paths, or None when
            ``media`` is None or no usable image could be produced.
    """
    if media is None:
        return None

    image_suffixes = (".jpg", ".jpeg", ".png", ".bmp")
    video_suffixes = (".mp4", ".mov")

    new_media: List[Path] = []
    for m in media:
        path = Path(m)
        # Compare case-insensitively so files such as "PHOTO.JPG" or
        # "CLIP.MOV" are not silently discarded.
        extension = path.suffix.lower()
        if extension in image_suffixes:
            new_media.append(path)
        elif extension in video_suffixes:
            frames = T.extract_frames(path)
            # Skip before creating the temp file: it is made with
            # delete=False, so an unused one would be left on disk.
            if len(frames) == 0:
                continue
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                frame_path = tmp.name
            # Save only after the handle is closed; on Windows a
            # NamedTemporaryFile cannot be reopened by name while still open.
            # NOTE(review): frames[0][0] is presumably the image array of the
            # first (frame, timestamp) pair - confirm against T.extract_frames.
            Image.fromarray(frames[0][0]).save(frame_path)
            new_media.append(Path(frame_path))

    if not new_media:
        return None
    return new_media
|
104
|
+
|
105
|
+
|
81
106
|
def write_plan(
|
82
107
|
chat: List[Dict[str, str]],
|
83
108
|
tool_desc: str,
|
84
109
|
working_memory: str,
|
85
110
|
model: Union[LLM, LMM],
|
86
|
-
media: Optional[
|
111
|
+
media: Optional[Sequence[Union[str, Path]]] = None,
|
87
112
|
) -> List[Dict[str, str]]:
|
88
113
|
chat = copy.deepcopy(chat)
|
89
114
|
if chat[-1]["role"] != "user":
|
@@ -94,6 +119,7 @@ def write_plan(
|
|
94
119
|
prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
|
95
120
|
chat[-1]["content"] = prompt
|
96
121
|
if isinstance(model, OpenAILMM):
|
122
|
+
media = extract_image(media)
|
97
123
|
return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
|
98
124
|
else:
|
99
125
|
return extract_json(model.chat(chat))["plan"] # type: ignore
|
@@ -103,7 +129,7 @@ def reflect(
|
|
103
129
|
chat: List[Dict[str, str]],
|
104
130
|
plan: str,
|
105
131
|
code: str,
|
106
|
-
model: LLM,
|
132
|
+
model: Union[LLM, LMM],
|
107
133
|
) -> Dict[str, Union[str, bool]]:
|
108
134
|
chat = copy.deepcopy(chat)
|
109
135
|
if chat[-1]["role"] != "user":
|
@@ -309,7 +335,7 @@ class VisionAgent(Agent):
|
|
309
335
|
|
310
336
|
def __init__(
|
311
337
|
self,
|
312
|
-
planner: Optional[LLM] = None,
|
338
|
+
planner: Optional[Union[LLM, LMM]] = None,
|
313
339
|
coder: Optional[LLM] = None,
|
314
340
|
tester: Optional[LLM] = None,
|
315
341
|
debugger: Optional[LLM] = None,
|
@@ -29,14 +29,17 @@ PLAN = """
|
|
29
29
|
{feedback}
|
30
30
|
|
31
31
|
**Instructions**:
|
32
|
-
Based on the context and tools you have available, write a plan of subtasks to achieve the user request
|
32
|
+
1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
|
33
|
+
2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
|
34
|
+
|
35
|
+
Output a list of jsons in the following format
|
33
36
|
|
34
37
|
```json
|
35
38
|
{{
|
36
39
|
"plan":
|
37
40
|
[
|
38
41
|
{{
|
39
|
-
"instructions": str # what you should do in this task
|
42
|
+
"instructions": str # what you should do in this task associated with a tool
|
40
43
|
}}
|
41
44
|
]
|
42
45
|
}}
|
vision_agent/tools/tools.py
CHANGED
@@ -199,14 +199,15 @@ def extract_frames(
|
|
199
199
|
|
200
200
|
def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
201
201
|
"""'ocr' extracts text from an image. It returns a list of detected text, bounding
|
202
|
-
boxes, and confidence scores. The results are sorted
|
202
|
+
boxes with normalized coordinates, and confidence scores. The results are sorted
|
203
|
+
from top-left to bottom right.
|
203
204
|
|
204
205
|
Parameters:
|
205
206
|
image (np.ndarray): The image to extract text from.
|
206
207
|
|
207
208
|
Returns:
|
208
|
-
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
209
|
-
and confidence score.
|
209
|
+
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
210
|
+
with normalized coordinates, and confidence score.
|
210
211
|
|
211
212
|
Example
|
212
213
|
-------
|
@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
|
|
11
11
|
vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
12
|
vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
|
13
13
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
14
|
-
vision_agent/agent/vision_agent.py,sha256=
|
15
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
14
|
+
vision_agent/agent/vision_agent.py,sha256=c3jJd1uiCtmVC2xazUvW9rwc7usi-EOYW7NZnMFOdt8,19586
|
15
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=bIcqutsyM2bEhWE2XGw01PuZ9f-jePSwapbvkOOrFZ4,8384
|
16
16
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
18
18
|
vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
|
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
|
|
23
23
|
vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
|
24
24
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
25
25
|
vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
|
26
|
-
vision_agent/tools/tools.py,sha256=
|
26
|
+
vision_agent/tools/tools.py,sha256=66pFXUIVvnOa1fk0PY5u_75kblIbAVqkRP2U9qLixrY,25951
|
27
27
|
vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
|
28
28
|
vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
|
29
29
|
vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
|
30
30
|
vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
|
31
31
|
vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
|
32
32
|
vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
36
|
-
vision_agent-0.2.
|
33
|
+
vision_agent-0.2.45.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
34
|
+
vision_agent-0.2.45.dist-info/METADATA,sha256=G9Cy7cUPEWi42cuVP8V7u_ZjNXEnYp26_kb_u9rXSQk,6817
|
35
|
+
vision_agent-0.2.45.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
36
|
+
vision_agent-0.2.45.dist-info/RECORD,,
|
File without changes
|
File without changes
|