PyPI - vision-agent - Versions diffs - 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl - Mend

vision-agent 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -2,9 +2,11 @@ import copy
 import json
 import logging
 import sys
+import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
+from PIL import Image
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
@@ -78,12 +80,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     return json_dict  # type: ignore
+def extract_image(
+    media: Optional[Sequence[Union[str, Path]]]
+) -> Optional[Sequence[Union[str, Path]]]:
+    if media is None:
+        return None
+    new_media = []
+    for m in media:
+        m = Path(m)
+        extension = m.suffix
+        if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
+            new_media.append(m)
+        elif extension in [".mp4", ".mov"]:
+            frames = T.extract_frames(m)
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                if len(frames) > 0:
+                    Image.fromarray(frames[0][0]).save(tmp.name)
+                    new_media.append(Path(tmp.name))
+    if len(new_media) == 0:
+        return None
+    return new_media
 def write_plan(
     chat: List[Dict[str, str]],
     tool_desc: str,
     working_memory: str,
     model: Union[LLM, LMM],
-    media: Optional[List[Union[str, Path]]] = None,
+    media: Optional[Sequence[Union[str, Path]]] = None,
 ) -> List[Dict[str, str]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -94,6 +119,7 @@ def write_plan(
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
     if isinstance(model, OpenAILMM):
+        media = extract_image(media)
         return extract_json(model.chat(chat, images=media))["plan"]  # type: ignore
     else:
         return extract_json(model.chat(chat))["plan"]  # type: ignore
@@ -103,7 +129,7 @@ def reflect(
     chat: List[Dict[str, str]],
     plan: str,
     code: str,
-    model: LLM,
+    model: Union[LLM, LMM],
 ) -> Dict[str, Union[str, bool]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -309,7 +335,7 @@ class VisionAgent(Agent):
     def __init__(
         self,
-        planner: Optional[LLM] = None,
+        planner: Optional[Union[LLM, LMM]] = None,
         coder: Optional[LLM] = None,
         tester: Optional[LLM] = None,
         debugger: Optional[LLM] = None,

vision_agent/agent/vision_agent_prompts.py CHANGED Viewed

@@ -29,14 +29,17 @@ PLAN = """
 {feedback}
 **Instructions**:
-Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
+1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
+2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
+Output a list of jsons in the following format
 ```json
 {{
     "plan":
         [
             {{
-                "instructions": str # what you should do in this task, one short phrase or sentence
+                "instructions": str # what you should do in this task associated with a tool
             }}
         ]
 }}

vision_agent/tools/tools.py CHANGED Viewed

@@ -199,14 +199,15 @@ def extract_frames(
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'ocr' extracts text from an image. It returns a list of detected text, bounding
-    boxes, and confidence scores. The results are sorted from top-left to bottom right
+    boxes with normalized coordinates, and confidence scores. The results are sorted
+    from top-left to bottom right.
     Parameters:
         image (np.ndarray): The image to extract text from.
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
-            and confidence score.
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with nornmalized coordinates, and confidence score.
     Example
     -------

{vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.44
+Version: 0.2.45
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/RECORD RENAMED Viewed

@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
 vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
 vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=JtPDIiLINXm3jBR0LbqblfB9yCv-8M-B7XRx1EPDhFU,18749
-vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
+vision_agent/agent/vision_agent.py,sha256=c3jJd1uiCtmVC2xazUvW9rwc7usi-EOYW7NZnMFOdt8,19586
+vision_agent/agent/vision_agent_prompts.py,sha256=bIcqutsyM2bEhWE2XGw01PuZ9f-jePSwapbvkOOrFZ4,8384
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
 vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=PhmJ0kQeZ-tSQ675HI8QnR49zlH6nJ_opt6QS4dNSVA,25889
+vision_agent/tools/tools.py,sha256=66pFXUIVvnOa1fk0PY5u_75kblIbAVqkRP2U9qLixrY,25951
 vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
 vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
-vision_agent-0.2.44.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.44.dist-info/METADATA,sha256=EbnJiKZzbAgeCN30GRMYfMPN5w_wo9XBkuhWEP_0cN8,6817
-vision_agent-0.2.44.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.44.dist-info/RECORD,,
+vision_agent-0.2.45.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.45.dist-info/METADATA,sha256=G9Cy7cUPEWi42cuVP8V7u_ZjNXEnYp26_kb_u9rXSQk,6817
+vision_agent-0.2.45.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.45.dist-info/RECORD,,

{vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.44.dist-info → vision_agent-0.2.45.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl

vision-agent 0.2.44__py3-none-any.whl → 0.2.45__py3-none-any.whl