vision-agent 0.2.165__py3-none-any.whl → 0.2.167__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +14 -1
- vision_agent/tools/meta_tools.py +3 -2
- vision_agent/utils/video.py +23 -3
- {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/METADATA +1 -1
- {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/RECORD +7 -7
- {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.165.dist-info → vision_agent-0.2.167.dist-info}/WHEEL +0 -0
@@ -85,6 +85,15 @@ def format_agent_message(agent_message: str) -> str:
|
|
85
85
|
return output
|
86
86
|
|
87
87
|
|
88
|
+
def _clean_response(response: str) -> str:
|
89
|
+
# Sometimes the LLM will hallucinate responses to an <execute_python> tag as if it
|
90
|
+
# had already executed the code. This function removes the hallucinated response.
|
91
|
+
if "<execute_python>" in response:
|
92
|
+
end_execute_python = response.find("</execute_python>")
|
93
|
+
response = response[: end_execute_python + len("</execute_python>")]
|
94
|
+
return response
|
95
|
+
|
96
|
+
|
88
97
|
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
89
98
|
chat = copy.deepcopy(chat)
|
90
99
|
|
@@ -114,6 +123,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
|
|
114
123
|
message["media"] = chat[-1]["media"]
|
115
124
|
conv_resp = cast(str, orch([message], stream=False))
|
116
125
|
|
126
|
+
# clean the response first, if we are executing code, do not respond or end
|
127
|
+
# conversation before the code has been executed.
|
128
|
+
conv_resp = _clean_response(conv_resp)
|
129
|
+
|
117
130
|
let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
|
118
131
|
let_user_respond = (
|
119
132
|
"true" in let_user_respond_str.lower() if let_user_respond_str else False
|
@@ -458,7 +471,7 @@ class VisionAgent(Agent):
|
|
458
471
|
self.streaming_message(
|
459
472
|
{
|
460
473
|
"role": "assistant",
|
461
|
-
"content": json.dumps(response),
|
474
|
+
"content": json.dumps(add_step_descriptions(response)),
|
462
475
|
"finished": finished and code_action is None,
|
463
476
|
}
|
464
477
|
)
|
vision_agent/tools/meta_tools.py
CHANGED
@@ -676,12 +676,13 @@ def use_extra_vision_agent_args(
|
|
676
676
|
for node in red:
|
677
677
|
# seems to always be atomtrailers not call type
|
678
678
|
if node.type == "atomtrailers":
|
679
|
+
if node.name.value == "generate_vision_code":
|
680
|
+
node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
|
681
|
+
|
679
682
|
if (
|
680
683
|
node.name.value == "generate_vision_code"
|
681
684
|
or node.name.value == "edit_vision_code"
|
682
685
|
):
|
683
|
-
node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
|
684
|
-
|
685
686
|
if custom_tool_names is not None:
|
686
687
|
node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
|
687
688
|
cleaned_code = red.dumps().strip()
|
vision_agent/utils/video.py
CHANGED
@@ -11,6 +11,9 @@ import numpy as np
|
|
11
11
|
_LOGGER = logging.getLogger(__name__)
|
12
12
|
# The maximum length of the clip to extract frames from, in seconds
|
13
13
|
|
14
|
+
_DEFAULT_VIDEO_FPS = 24
|
15
|
+
_DEFAULT_INPUT_FPS = 1.0
|
16
|
+
|
14
17
|
|
15
18
|
def play_video(video_base64: str) -> None:
|
16
19
|
"""Play a video file"""
|
@@ -51,7 +54,9 @@ def _resize_frame(frame: np.ndarray) -> np.ndarray:
|
|
51
54
|
|
52
55
|
|
53
56
|
def video_writer(
|
54
|
-
frames: List[np.ndarray],
|
57
|
+
frames: List[np.ndarray],
|
58
|
+
fps: float = _DEFAULT_INPUT_FPS,
|
59
|
+
filename: Optional[str] = None,
|
55
60
|
) -> str:
|
56
61
|
if filename is None:
|
57
62
|
filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
@@ -78,7 +83,7 @@ def video_writer(
|
|
78
83
|
|
79
84
|
|
80
85
|
def frames_to_bytes(
|
81
|
-
frames: List[np.ndarray], fps: float =
|
86
|
+
frames: List[np.ndarray], fps: float = _DEFAULT_INPUT_FPS, file_ext: str = ".mp4"
|
82
87
|
) -> bytes:
|
83
88
|
r"""Convert a list of frames to a video file encoded into a byte string.
|
84
89
|
|
@@ -101,7 +106,7 @@ def frames_to_bytes(
|
|
101
106
|
# same file name and the time savings are very large.
|
102
107
|
@lru_cache(maxsize=8)
|
103
108
|
def extract_frames_from_video(
|
104
|
-
video_uri: str, fps: float =
|
109
|
+
video_uri: str, fps: float = _DEFAULT_INPUT_FPS
|
105
110
|
) -> List[Tuple[np.ndarray, float]]:
|
106
111
|
"""Extract frames from a video along with the timestamp in seconds.
|
107
112
|
|
@@ -118,6 +123,16 @@ def extract_frames_from_video(
|
|
118
123
|
|
119
124
|
cap = cv2.VideoCapture(video_uri)
|
120
125
|
orig_fps = cap.get(cv2.CAP_PROP_FPS)
|
126
|
+
if not orig_fps or orig_fps <= 0:
|
127
|
+
_LOGGER.warning(
|
128
|
+
f"Input video, {video_uri}, has no fps, using the default value {_DEFAULT_VIDEO_FPS}"
|
129
|
+
)
|
130
|
+
orig_fps = _DEFAULT_VIDEO_FPS
|
131
|
+
if not fps or fps <= 0:
|
132
|
+
_LOGGER.warning(
|
133
|
+
f"Input fps, {fps}, is illegal, using the default value: {_DEFAULT_INPUT_FPS}"
|
134
|
+
)
|
135
|
+
fps = _DEFAULT_INPUT_FPS
|
121
136
|
orig_frame_time = 1 / orig_fps
|
122
137
|
targ_frame_time = 1 / fps
|
123
138
|
frames: List[Tuple[np.ndarray, float]] = []
|
@@ -129,10 +144,15 @@ def extract_frames_from_video(
|
|
129
144
|
break
|
130
145
|
|
131
146
|
elapsed_time += orig_frame_time
|
147
|
+
# This is to prevent a floating-point precision loss issue, which can cause
|
148
|
+
# the elapsed time to be slightly less than the target frame time, which
|
149
|
+
# causes the last frame to be skipped
|
150
|
+
elapsed_time = round(elapsed_time, 8)
|
132
151
|
if elapsed_time >= targ_frame_time:
|
133
152
|
frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps))
|
134
153
|
elapsed_time -= targ_frame_time
|
135
154
|
|
136
155
|
i += 1
|
137
156
|
cap.release()
|
157
|
+
_LOGGER.info(f"Extracted {len(frames)} frames from {video_uri}")
|
138
158
|
return frames
|
@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
|
2
2
|
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=GIobCJaojOMxdMFtigklvt7RgHk49KAh7zSZoQ7HKXw,24294
|
6
6
|
vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
8
|
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
@@ -17,7 +17,7 @@ vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,
|
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
19
|
vision_agent/tools/__init__.py,sha256=u-vS5iORB4ccvxoAjbtpvhTALDhXGilcATIq1_eZhKo,2332
|
20
|
-
vision_agent/tools/meta_tools.py,sha256=
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=7XM3VP4EW4Dtg_Hvoov_laOAEaZLdSGOeA-iPb7CimU,28315
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
23
|
vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
|
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4
|
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
|
-
vision_agent/utils/video.py,sha256=
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
31
|
+
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
+
vision_agent-0.2.167.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.167.dist-info/METADATA,sha256=e80T_Sh_9yt4SDeTGlq9fD4RqF1iY-LL6IHgarXwLc8,18034
|
34
|
+
vision_agent-0.2.167.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.167.dist-info/RECORD,,
|
File without changes
|
File without changes
|