PyPI - vision-agent - Versions diffs - 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl - Mend

vision-agent 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

vision_agent/agent/vision_agent.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import copy
+import difflib
 import json
 import logging
 import sys
+import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
+from PIL import Image
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
@@ -14,7 +17,6 @@ import vision_agent.tools as T
 from vision_agent.agent import Agent
 from vision_agent.agent.vision_agent_prompts import (
     CODE,
-    FEEDBACK,
     FIX_BUG,
     FULL_TASK,
     PLAN,
@@ -37,17 +39,27 @@ _CONSOLE = Console()
 _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
-def format_memory(memory: List[Dict[str, str]]) -> str:
-    return FEEDBACK.format(
-        feedback="\n".join(
-            [
-                f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
-                for i, m in enumerate(memory)
-            ]
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
         )
     )
+def format_memory(memory: List[Dict[str, str]]) -> str:
+    output_str = ""
+    for i, m in enumerate(memory):
+        output_str += f"### Feedback {i}:\n"
+        output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
+        output_str += f"Feedback {i}: {m['feedback']}\n\n"
+        if "edits" in m:
+            output_str += f"Edits {i}:\n{m['edits']}\n"
+        output_str += "\n"
+    return output_str
 def extract_code(code: str) -> str:
     if "\n```python" in code:
         start = "\n```python"
@@ -78,12 +90,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     return json_dict  # type: ignore
+def extract_image(
+    media: Optional[Sequence[Union[str, Path]]]
+) -> Optional[Sequence[Union[str, Path]]]:
+    if media is None:
+        return None
+    new_media = []
+    for m in media:
+        m = Path(m)
+        extension = m.suffix
+        if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
+            new_media.append(m)
+        elif extension in [".mp4", ".mov"]:
+            frames = T.extract_frames(m)
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                if len(frames) > 0:
+                    Image.fromarray(frames[0][0]).save(tmp.name)
+                    new_media.append(Path(tmp.name))
+    if len(new_media) == 0:
+        return None
+    return new_media
 def write_plan(
     chat: List[Dict[str, str]],
     tool_desc: str,
     working_memory: str,
     model: Union[LLM, LMM],
-    media: Optional[List[Union[str, Path]]] = None,
+    media: Optional[Sequence[Union[str, Path]]] = None,
 ) -> List[Dict[str, str]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -94,6 +129,7 @@ def write_plan(
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
     if isinstance(model, OpenAILMM):
+        media = extract_image(media)
         return extract_json(model.chat(chat, images=media))["plan"]  # type: ignore
     else:
         return extract_json(model.chat(chat))["plan"]  # type: ignore
@@ -103,7 +139,7 @@ def reflect(
     chat: List[Dict[str, str]],
     plan: str,
     code: str,
-    model: LLM,
+    model: Union[LLM, LMM],
 ) -> Dict[str, Union[str, bool]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -120,7 +156,7 @@ def write_and_test_code(
     task: str,
     tool_info: str,
     tool_utils: str,
-    working_memory: str,
+    working_memory: List[Dict[str, str]],
     coder: LLM,
     tester: LLM,
     debugger: LLM,
@@ -137,7 +173,13 @@ def write_and_test_code(
         }
     )
     code = extract_code(
-        coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
+        coder(
+            CODE.format(
+                docstring=tool_info,
+                question=task,
+                feedback=format_memory(working_memory),
+            )
+        )
     )
     test = extract_code(
         tester(
@@ -180,7 +222,7 @@ def write_and_test_code(
         )
     count = 0
-    new_working_memory = []
+    new_working_memory: List[Dict[str, str]] = []
     while not result.success and count < max_retries:
         log_progress(
             {
@@ -191,14 +233,28 @@ def write_and_test_code(
         fixed_code_and_test = extract_json(
             debugger(
                 FIX_BUG.format(
-                    code=code, tests=test, result=result.text(), feedback=working_memory
+                    code=code,
+                    tests=test,
+                    result="\n".join(result.text().splitlines()[-50:]),
+                    feedback=format_memory(working_memory + new_working_memory),
                 )
             )
         )
+        old_code = code
+        old_test = test
         if fixed_code_and_test["code"].strip() != "":
             code = extract_code(fixed_code_and_test["code"])
         if fixed_code_and_test["test"].strip() != "":
             test = extract_code(fixed_code_and_test["test"])
+        new_working_memory.append(
+            {
+                "code": f"{code}\n{test}",
+                "feedback": fixed_code_and_test["reflections"],
+                "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
+            }
+        )
         log_progress(
             {
                 "type": "code",
@@ -209,9 +265,6 @@ def write_and_test_code(
                 },
             }
         )
-        new_working_memory.append(
-            {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
-        )
         result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
         log_progress(
@@ -309,7 +362,7 @@ class VisionAgent(Agent):
     def __init__(
         self,
-        planner: Optional[LLM] = None,
+        planner: Optional[Union[LLM, LMM]] = None,
         coder: Optional[LLM] = None,
         tester: Optional[LLM] = None,
         debugger: Optional[LLM] = None,
@@ -459,7 +512,7 @@ class VisionAgent(Agent):
                     ),
                     tool_info=tool_info,
                     tool_utils=T.UTILITIES_DOCSTRING,
-                    working_memory=format_memory(working_memory),
+                    working_memory=working_memory,
                     coder=self.coder,
                     tester=self.tester,
                     debugger=self.debugger,
@@ -503,6 +556,8 @@ class VisionAgent(Agent):
                     working_memory.append(
                         {"code": f"{code}\n{test}", "feedback": feedback}
                     )
+                else:
+                    break
                 retries += 1

vision_agent/agent/vision_agent_prompts.py CHANGED Viewed

@@ -29,14 +29,17 @@ PLAN = """
 {feedback}
 **Instructions**:
-Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
+1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
+2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
+Output a list of jsons in the following format
 ```json
 {{
     "plan":
         [
             {{
-                "instructions": str # what you should do in this task, one short phrase or sentence
+                "instructions": str # what you should do in this task associated with a tool
             }}
         ]
 }}
@@ -194,9 +197,7 @@ When we run this test code:
 ```
 It raises this error:
-```python
 {result}
-```
 This is previous feedback provided on the code:
 {feedback}

vision_agent/tools/tools.py CHANGED Viewed

@@ -199,14 +199,15 @@ def extract_frames(
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'ocr' extracts text from an image. It returns a list of detected text, bounding
-    boxes, and confidence scores. The results are sorted from top-left to bottom right
+    boxes with normalized coordinates, and confidence scores. The results are sorted
+    from top-left to bottom right.
     Parameters:
         image (np.ndarray): The image to extract text from.
     Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
-            and confidence score.
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
     Example
     -------
@@ -607,6 +608,7 @@ def overlay_bounding_boxes(
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([box["label"] for box in bboxes]))
     }
+    bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
     width, height = pil_image.size
     fontsize = max(12, int(min(width, height) / 40))
@@ -679,6 +681,7 @@ def overlay_segmentation_masks(
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([mask["label"] for mask in masks]))
     }
+    masks = sorted(masks, key=lambda x: x["label"], reverse=True)
     for elt in masks:
         mask = elt["mask"]

vision_agent/utils/video.py CHANGED Viewed

@@ -2,8 +2,8 @@ import base64
 import logging
 import math
 import os
-from concurrent.futures import ProcessPoolExecutor, as_completed
 import tempfile
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import List, Tuple, cast
 import cv2

{vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.44
+Version: 0.2.46
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/RECORD RENAMED Viewed

@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
 vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
 vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=JtPDIiLINXm3jBR0LbqblfB9yCv-8M-B7XRx1EPDhFU,18749
-vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
+vision_agent/agent/vision_agent.py,sha256=S0VJWsdr0NIYjikXvPrEX-njGMqOIA53r4Q4NYY0Lpo,20365
+vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
 vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=PhmJ0kQeZ-tSQ675HI8QnR49zlH6nJ_opt6QS4dNSVA,25889
+vision_agent/tools/tools.py,sha256=SrNrIjyUKoTE3mCqGcy6nC-MeEzJ8uJCumlSkTvvPpg,26085
 vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
 vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
-vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
-vision_agent-0.2.44.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.44.dist-info/METADATA,sha256=EbnJiKZzbAgeCN30GRMYfMPN5w_wo9XBkuhWEP_0cN8,6817
-vision_agent-0.2.44.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.44.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=_u3UrEpcJzbclKyJYxF7SiDQGhE2gUc598diYYiEv34,8885
+vision_agent-0.2.46.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.46.dist-info/METADATA,sha256=FOlKABAkLUX8oqtjeE2q9EO6j8yeoiwyw3lWUpIe0ow,6817
+vision_agent-0.2.46.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.46.dist-info/RECORD,,

{vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl

vision-agent 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl