vision-agent 0.2.165__py3-none-any.whl → 0.2.167__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +14 -1
 - vision_agent/tools/meta_tools.py +3 -2
 - vision_agent/utils/video.py +23 -3
 - {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/METADATA +1 -1
 - {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/RECORD +7 -7
 - {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/LICENSE +0 -0
 - {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/WHEEL +0 -0
 
| 
         @@ -85,6 +85,15 @@ def format_agent_message(agent_message: str) -> str: 
     | 
|
| 
       85 
85 
     | 
    
         
             
                return output
         
     | 
| 
       86 
86 
     | 
    
         | 
| 
       87 
87 
     | 
    
         | 
| 
      
 88 
     | 
    
         
            +
            def _clean_response(response: str) -> str:
         
     | 
| 
      
 89 
     | 
    
         
            +
                # Sometimes the LLM will hallucinate responses to an <execute_python> tag as if it
         
     | 
| 
      
 90 
     | 
    
         
            +
                # had already executed the code. This function removes the hallucinated response.
         
     | 
| 
      
 91 
     | 
    
         
            +
                if "<execute_python>" in response:
         
     | 
| 
      
 92 
     | 
    
         
            +
                    end_execute_python = response.find("</execute_python>")
         
     | 
| 
      
 93 
     | 
    
         
            +
                    response = response[: end_execute_python + len("</execute_python>")]
         
     | 
| 
      
 94 
     | 
    
         
            +
                return response
         
     | 
| 
      
 95 
     | 
    
         
            +
             
     | 
| 
      
 96 
     | 
    
         
            +
             
     | 
| 
       88 
97 
     | 
    
         
             
            def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         
     | 
| 
       89 
98 
     | 
    
         
             
                chat = copy.deepcopy(chat)
         
     | 
| 
       90 
99 
     | 
    
         | 
| 
         @@ -114,6 +123,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: 
     | 
|
| 
       114 
123 
     | 
    
         
             
                    message["media"] = chat[-1]["media"]
         
     | 
| 
       115 
124 
     | 
    
         
             
                conv_resp = cast(str, orch([message], stream=False))
         
     | 
| 
       116 
125 
     | 
    
         | 
| 
      
 126 
     | 
    
         
            +
                # clean the response first, if we are executing code, do not respond or end
         
     | 
| 
      
 127 
     | 
    
         
            +
                # conversation before the code has been executed.
         
     | 
| 
      
 128 
     | 
    
         
            +
                conv_resp = _clean_response(conv_resp)
         
     | 
| 
      
 129 
     | 
    
         
            +
             
     | 
| 
       117 
130 
     | 
    
         
             
                let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
         
     | 
| 
       118 
131 
     | 
    
         
             
                let_user_respond = (
         
     | 
| 
       119 
132 
     | 
    
         
             
                    "true" in let_user_respond_str.lower() if let_user_respond_str else False
         
     | 
| 
         @@ -458,7 +471,7 @@ class VisionAgent(Agent): 
     | 
|
| 
       458 
471 
     | 
    
         
             
                                self.streaming_message(
         
     | 
| 
       459 
472 
     | 
    
         
             
                                    {
         
     | 
| 
       460 
473 
     | 
    
         
             
                                        "role": "assistant",
         
     | 
| 
       461 
     | 
    
         
            -
                                        "content": json.dumps(response),
         
     | 
| 
      
 474 
     | 
    
         
            +
                                        "content": json.dumps(add_step_descriptions(response)),
         
     | 
| 
       462 
475 
     | 
    
         
             
                                        "finished": finished and code_action is None,
         
     | 
| 
       463 
476 
     | 
    
         
             
                                    }
         
     | 
| 
       464 
477 
     | 
    
         
             
                                )
         
     | 
    
        vision_agent/tools/meta_tools.py
    CHANGED
    
    | 
         @@ -676,12 +676,13 @@ def use_extra_vision_agent_args( 
     | 
|
| 
       676 
676 
     | 
    
         
             
                for node in red:
         
     | 
| 
       677 
677 
     | 
    
         
             
                    # seems to always be atomtrailers not call type
         
     | 
| 
       678 
678 
     | 
    
         
             
                    if node.type == "atomtrailers":
         
     | 
| 
      
 679 
     | 
    
         
            +
                        if node.name.value == "generate_vision_code":
         
     | 
| 
      
 680 
     | 
    
         
            +
                            node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
         
     | 
| 
      
 681 
     | 
    
         
            +
             
     | 
| 
       679 
682 
     | 
    
         
             
                        if (
         
     | 
| 
       680 
683 
     | 
    
         
             
                            node.name.value == "generate_vision_code"
         
     | 
| 
       681 
684 
     | 
    
         
             
                            or node.name.value == "edit_vision_code"
         
     | 
| 
       682 
685 
     | 
    
         
             
                        ):
         
     | 
| 
       683 
     | 
    
         
            -
                            node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
         
     | 
| 
       684 
     | 
    
         
            -
             
     | 
| 
       685 
686 
     | 
    
         
             
                            if custom_tool_names is not None:
         
     | 
| 
       686 
687 
     | 
    
         
             
                                node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
         
     | 
| 
       687 
688 
     | 
    
         
             
                cleaned_code = red.dumps().strip()
         
     | 
    
        vision_agent/utils/video.py
    CHANGED
    
    | 
         @@ -11,6 +11,9 @@ import numpy as np 
     | 
|
| 
       11 
11 
     | 
    
         
             
            _LOGGER = logging.getLogger(__name__)
         
     | 
| 
       12 
12 
     | 
    
         
             
            # The maximum length of the clip to extract frames from, in seconds
         
     | 
| 
       13 
13 
     | 
    
         | 
| 
      
 14 
     | 
    
         
            +
            _DEFAULT_VIDEO_FPS = 24
         
     | 
| 
      
 15 
     | 
    
         
            +
            _DEFAULT_INPUT_FPS = 1.0
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
       14 
17 
     | 
    
         | 
| 
       15 
18 
     | 
    
         
             
            def play_video(video_base64: str) -> None:
         
     | 
| 
       16 
19 
     | 
    
         
             
                """Play a video file"""
         
     | 
| 
         @@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray: 
     | 
|
| 
       51 
54 
     | 
    
         | 
| 
       52 
55 
     | 
    
         | 
| 
       53 
56 
     | 
    
         
             
            def video_writer(
         
     | 
| 
       54 
     | 
    
         
            -
                frames: List[np.ndarray], 
     | 
| 
      
 57 
     | 
    
         
            +
                frames: List[np.ndarray],
         
     | 
| 
      
 58 
     | 
    
         
            +
                fps: float = _DEFAULT_INPUT_FPS,
         
     | 
| 
      
 59 
     | 
    
         
            +
                filename: Optional[str] = None,
         
     | 
| 
       55 
60 
     | 
    
         
             
            ) -> str:
         
     | 
| 
       56 
61 
     | 
    
         
             
                if filename is None:
         
     | 
| 
       57 
62 
     | 
    
         
             
                    filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
         
     | 
| 
         @@ -78,7 +83,7 @@ def video_writer( 
     | 
|
| 
       78 
83 
     | 
    
         | 
| 
       79 
84 
     | 
    
         | 
| 
       80 
85 
     | 
    
         
             
            def frames_to_bytes(
         
     | 
| 
       81 
     | 
    
         
            -
                frames: List[np.ndarray], fps: float =  
     | 
| 
      
 86 
     | 
    
         
            +
                frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
         
     | 
| 
       82 
87 
     | 
    
         
             
            ) -> bytes:
         
     | 
| 
       83 
88 
     | 
    
         
             
                r"""Convert a list of frames to a video file encoded into a byte string.
         
     | 
| 
       84 
89 
     | 
    
         | 
| 
         @@ -101,7 +106,7 @@ def frames_to_bytes( 
     | 
|
| 
       101 
106 
     | 
    
         
             
            # same file name and the time savings are very large.
         
     | 
| 
       102 
107 
     | 
    
         
             
            @lru_cache(maxsize=8)
         
     | 
| 
       103 
108 
     | 
    
         
             
            def extract_frames_from_video(
         
     | 
| 
       104 
     | 
    
         
            -
                video_uri: str, fps: float =  
     | 
| 
      
 109 
     | 
    
         
            +
                video_uri: str, fps: float = _DEFAULT_INPUT_FPS
         
     | 
| 
       105 
110 
     | 
    
         
             
            ) -> List[Tuple[np.ndarray, float]]:
         
     | 
| 
       106 
111 
     | 
    
         
             
                """Extract frames from a video along with the timestamp in seconds.
         
     | 
| 
       107 
112 
     | 
    
         | 
| 
         @@ -118,6 +123,16 @@ def extract_frames_from_video( 
     | 
|
| 
       118 
123 
     | 
    
         | 
| 
       119 
124 
     | 
    
         
             
                cap = cv2.VideoCapture(video_uri)
         
     | 
| 
       120 
125 
     | 
    
         
             
                orig_fps = cap.get(cv2.CAP_PROP_FPS)
         
     | 
| 
      
 126 
     | 
    
         
            +
                if not orig_fps or orig_fps <= 0:
         
     | 
| 
      
 127 
     | 
    
         
            +
                    _LOGGER.warning(
         
     | 
| 
      
 128 
     | 
    
         
            +
                        f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
         
     | 
| 
      
 129 
     | 
    
         
            +
                    )
         
     | 
| 
      
 130 
     | 
    
         
            +
                    orig_fps = _DEFAULT_VIDEO_FPS
         
     | 
| 
      
 131 
     | 
    
         
            +
                if not fps or fps <= 0:
         
     | 
| 
      
 132 
     | 
    
         
            +
                    _LOGGER.warning(
         
     | 
| 
      
 133 
     | 
    
         
            +
                        f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
         
     | 
| 
      
 134 
     | 
    
         
            +
                    )
         
     | 
| 
      
 135 
     | 
    
         
            +
                    fps = _DEFAULT_INPUT_FPS
         
     | 
| 
       121 
136 
     | 
    
         
             
                orig_frame_time = 1 / orig_fps
         
     | 
| 
       122 
137 
     | 
    
         
             
                targ_frame_time = 1 / fps
         
     | 
| 
       123 
138 
     | 
    
         
             
                frames: List[Tuple[np.ndarray, float]] = []
         
     | 
| 
         @@ -129,10 +144,15 @@ def extract_frames_from_video( 
     | 
|
| 
       129 
144 
     | 
    
         
             
                        break
         
     | 
| 
       130 
145 
     | 
    
         | 
| 
       131 
146 
     | 
    
         
             
                    elapsed_time += orig_frame_time
         
     | 
| 
      
 147 
     | 
    
         
            +
                    # This is to prevent floating-point precision loss, which can cause
         
     | 
| 
      
 148 
     | 
    
         
            +
                    # the elapsed time to be slightly less than the target frame time, which
         
     | 
| 
      
 149 
     | 
    
         
            +
                    # causes the last frame to be skipped
         
     | 
| 
      
 150 
     | 
    
         
            +
                    elapsed_time = round(elapsed_time, 8)
         
     | 
| 
       132 
151 
     | 
    
         
             
                    if elapsed_time >= targ_frame_time:
         
     | 
| 
       133 
152 
     | 
    
         
             
                        frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
         
     | 
| 
       134 
153 
     | 
    
         
             
                        elapsed_time -= targ_frame_time
         
     | 
| 
       135 
154 
     | 
    
         | 
| 
       136 
155 
     | 
    
         
             
                    i += 1
         
     | 
| 
       137 
156 
     | 
    
         
             
                cap.release()
         
     | 
| 
      
 157 
     | 
    
         
            +
                _LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
         
     | 
| 
       138 
158 
     | 
    
         
             
                return frames
         
     | 
| 
         @@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57 
     | 
|
| 
       2 
2 
     | 
    
         
             
            vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
         
     | 
| 
       3 
3 
     | 
    
         
             
            vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
         
     | 
| 
       4 
4 
     | 
    
         
             
            vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
         
     | 
| 
       5 
     | 
    
         
            -
            vision_agent/agent/vision_agent.py,sha256= 
     | 
| 
      
 5 
     | 
    
         
            +
            vision_agent/agent/vision_agent.py,sha256=GIobCJaojOMxdMFtigklvt7RgHk49KAh7zSZoQ7HKXw,24294
         
     | 
| 
       6 
6 
     | 
    
         
             
            vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
         
     | 
| 
       7 
7 
     | 
    
         
             
            vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
         
     | 
| 
       8 
8 
     | 
    
         
             
            vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
         
     | 
| 
         @@ -17,7 +17,7 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg, 
     | 
|
| 
       17 
17 
     | 
    
         
             
            vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
         
     | 
| 
       18 
18 
     | 
    
         
             
            vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
         
     | 
| 
       19 
19 
     | 
    
         
             
            vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
         
     | 
| 
       20 
     | 
    
         
            -
            vision_agent/tools/meta_tools.py,sha256= 
     | 
| 
      
 20 
     | 
    
         
            +
            vision_agent/tools/meta_tools.py,sha256=7XM3VP4EW4Dtg_Hvoov_laOAEaZLdSGOeA-iPb7CimU,28315
         
     | 
| 
       21 
21 
     | 
    
         
             
            vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
         
     | 
| 
       22 
22 
     | 
    
         
             
            vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
         
     | 
| 
       23 
23 
     | 
    
         
             
            vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
         
     | 
| 
         @@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4 
     | 
|
| 
       28 
28 
     | 
    
         
             
            vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
         
     | 
| 
       29 
29 
     | 
    
         
             
            vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
         
     | 
| 
       30 
30 
     | 
    
         
             
            vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
         
     | 
| 
       31 
     | 
    
         
            -
            vision_agent/utils/video.py,sha256= 
     | 
| 
       32 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       33 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       34 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
       35 
     | 
    
         
            -
            vision_agent-0.2. 
     | 
| 
      
 31 
     | 
    
         
            +
            vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
         
     | 
| 
      
 32 
     | 
    
         
            +
            vision_agent-0.2.167.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         
     | 
| 
      
 33 
     | 
    
         
            +
            vision_agent-0.2.167.dist-info/METADATA,sha256=e80T_Sh_9yt4SDeTGlq9fD4RqF1iY-LL6IHgarXwLc8,18034
         
     | 
| 
      
 34 
     | 
    
         
            +
            vision_agent-0.2.167.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
         
     | 
| 
      
 35 
     | 
    
         
            +
            vision_agent-0.2.167.dist-info/RECORD,,
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |