vision-agent 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,11 @@ import copy
2
2
  import json
3
3
  import logging
4
4
  import sys
5
+ import tempfile
5
6
  from pathlib import Path
6
- from typing import Any, Callable, Dict, List, Optional, Union, cast
7
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
7
8
 
9
+ from PIL import Image
8
10
  from rich.console import Console
9
11
  from rich.style import Style
10
12
  from rich.syntax import Syntax
@@ -78,12 +80,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
78
80
  return json_dict # type: ignore
79
81
 
80
82
 
83
+ def extract_image(
84
+ media: Optional[Sequence[Union[str, Path]]]
85
+ ) -> Optional[Sequence[Union[str, Path]]]:
86
+ if media is None:
87
+ return None
88
+
89
+ new_media = []
90
+ for m in media:
91
+ m = Path(m)
92
+ extension = m.suffix
93
+ if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
94
+ new_media.append(m)
95
+ elif extension in [".mp4", ".mov"]:
96
+ frames = T.extract_frames(m)
97
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
98
+ if len(frames) > 0:
99
+ Image.fromarray(frames[0][0]).save(tmp.name)
100
+ new_media.append(Path(tmp.name))
101
+ if len(new_media) == 0:
102
+ return None
103
+ return new_media
104
+
105
+
81
106
  def write_plan(
82
107
  chat: List[Dict[str, str]],
83
108
  tool_desc: str,
84
109
  working_memory: str,
85
110
  model: Union[LLM, LMM],
86
- media: Optional[List[Union[str, Path]]] = None,
111
+ media: Optional[Sequence[Union[str, Path]]] = None,
87
112
  ) -> List[Dict[str, str]]:
88
113
  chat = copy.deepcopy(chat)
89
114
  if chat[-1]["role"] != "user":
@@ -94,6 +119,7 @@ def write_plan(
94
119
  prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
95
120
  chat[-1]["content"] = prompt
96
121
  if isinstance(model, OpenAILMM):
122
+ media = extract_image(media)
97
123
  return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
98
124
  else:
99
125
  return extract_json(model.chat(chat))["plan"] # type: ignore
@@ -103,7 +129,7 @@ def reflect(
103
129
  chat: List[Dict[str, str]],
104
130
  plan: str,
105
131
  code: str,
106
- model: LLM,
132
+ model: Union[LLM, LMM],
107
133
  ) -> Dict[str, Union[str, bool]]:
108
134
  chat = copy.deepcopy(chat)
109
135
  if chat[-1]["role"] != "user":
@@ -309,7 +335,7 @@ class VisionAgent(Agent):
309
335
 
310
336
  def __init__(
311
337
  self,
312
- planner: Optional[LLM] = None,
338
+ planner: Optional[Union[LLM, LMM]] = None,
313
339
  coder: Optional[LLM] = None,
314
340
  tester: Optional[LLM] = None,
315
341
  debugger: Optional[LLM] = None,
@@ -29,14 +29,17 @@ PLAN = """
29
29
  {feedback}
30
30
 
31
31
  **Instructions**:
32
- Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
32
+ 1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
33
+ 2. Go over the user's request step by step and ensure each step is represented as a clear subtask in your plan.
34
+
35
+ Output a list of jsons in the following format:
33
36
 
34
37
  ```json
35
38
  {{
36
39
  "plan":
37
40
  [
38
41
  {{
39
- "instructions": str # what you should do in this task, one short phrase or sentence
42
+ "instructions": str # what you should do in this task associated with a tool
40
43
  }}
41
44
  ]
42
45
  }}
@@ -199,14 +199,15 @@ def extract_frames(
199
199
 
200
200
  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
201
201
  """'ocr' extracts text from an image. It returns a list of detected text, bounding
202
- boxes, and confidence scores. The results are sorted from top-left to bottom right
202
+ boxes with normalized coordinates, and confidence scores. The results are sorted
203
+ from top-left to bottom right.
203
204
 
204
205
  Parameters:
205
206
  image (np.ndarray): The image to extract text from.
206
207
 
207
208
  Returns:
208
- List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
209
- and confidence score.
209
+ List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
210
+ with normalized coordinates, and confidence score.
210
211
 
211
212
  Example
212
213
  -------
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.44
3
+ Version: 0.2.45
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
11
11
  vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
12
12
  vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
13
13
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
14
- vision_agent/agent/vision_agent.py,sha256=JtPDIiLINXm3jBR0LbqblfB9yCv-8M-B7XRx1EPDhFU,18749
15
- vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
14
+ vision_agent/agent/vision_agent.py,sha256=c3jJd1uiCtmVC2xazUvW9rwc7usi-EOYW7NZnMFOdt8,19586
15
+ vision_agent/agent/vision_agent_prompts.py,sha256=bIcqutsyM2bEhWE2XGw01PuZ9f-jePSwapbvkOOrFZ4,8384
16
16
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
18
18
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
23
23
  vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
24
24
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
25
25
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
26
- vision_agent/tools/tools.py,sha256=PhmJ0kQeZ-tSQ675HI8QnR49zlH6nJ_opt6QS4dNSVA,25889
26
+ vision_agent/tools/tools.py,sha256=66pFXUIVvnOa1fk0PY5u_75kblIbAVqkRP2U9qLixrY,25951
27
27
  vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
28
28
  vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
29
29
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
30
30
  vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
31
31
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
32
32
  vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
33
- vision_agent-0.2.44.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
- vision_agent-0.2.44.dist-info/METADATA,sha256=EbnJiKZzbAgeCN30GRMYfMPN5w_wo9XBkuhWEP_0cN8,6817
35
- vision_agent-0.2.44.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
36
- vision_agent-0.2.44.dist-info/RECORD,,
33
+ vision_agent-0.2.45.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
+ vision_agent-0.2.45.dist-info/METADATA,sha256=G9Cy7cUPEWi42cuVP8V7u_ZjNXEnYp26_kb_u9rXSQk,6817
35
+ vision_agent-0.2.45.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
36
+ vision_agent-0.2.45.dist-info/RECORD,,