vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +12 -12
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/vision_agent_prompts_v3.py +372 -0
- vision_agent/agent/vision_agent_v3.py +278 -0
- vision_agent/lmm/lmm.py +219 -57
- vision_agent/tools/__init__.py +3 -3
- vision_agent/tools/planner_v3_tools.py +206 -0
- vision_agent/tools/tools.py +55 -64
- vision_agent/utils/agent.py +24 -8
- vision_agent/utils/tools.py +1 -1
- {vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/METADATA +4 -4
- {vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/RECORD +15 -12
- {vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/WHEEL +0 -0
- {vision_agent-1.1.16.dist-info → vision_agent-1.1.18.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,278 @@
|
|
1
|
+
import copy
|
2
|
+
import re
|
3
|
+
import time
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
6
|
+
|
7
|
+
from rich.console import Console
|
8
|
+
from rich.markup import escape
|
9
|
+
|
10
|
+
from vision_agent.agent import Agent
|
11
|
+
from vision_agent.agent.vision_agent_prompts_v3 import get_init_prompt
|
12
|
+
from vision_agent.configs import Config
|
13
|
+
from vision_agent.lmm import LMM, AnthropicLMM
|
14
|
+
from vision_agent.models import AgentMessage, Message
|
15
|
+
from vision_agent.utils.agent import (
|
16
|
+
add_media_to_chat,
|
17
|
+
capture_media_from_exec,
|
18
|
+
convert_message_to_agentmessage,
|
19
|
+
extract_tag,
|
20
|
+
print_code,
|
21
|
+
remove_installs_from_code,
|
22
|
+
)
|
23
|
+
from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
|
24
|
+
|
25
|
+
CONFIG = Config()
|
26
|
+
# Upper bound on how many images captured from one code execution are returned
# by run_code (the captured list is sliced to this length).
MAX_IMAGES = 10
|
27
|
+
_CONSOLE = Console()
|
28
|
+
|
29
|
+
|
30
|
+
class DefaultImports:
    """Import preamble that is placed in front of every snippet before execution."""

    # One statement per entry; they are joined with newlines in list order.
    imports = [
        "import os",
        "import numpy as np",
        "import cv2",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "from vision_agent.tools import load_image",
        "from vision_agent.tools.planner_v3_tools import instance_segmentation, ocr, depth_estimation, visualize_bounding_boxes, visualize_segmentation_masks, get_crops, rotate_90, display_image, iou",
        "register_heif_opener()",
        "import matplotlib.pyplot as plt",
    ]

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return ``code`` preceded by the default import lines and one blank line.

        Args:
            code: Source text to run after the preamble.

        Returns:
            The preamble, an empty line, then ``code`` unchanged.
        """
        preamble = "\n".join(DefaultImports.imports)
        return f"{preamble}\n\n{code}"
|
46
|
+
|
47
|
+
|
48
|
+
def run_chat(
    model: LMM,
    chat: List[AgentMessage],
    kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    """Send the conversation to ``model`` and return its reply.

    Every message is flattened to a ``{"role", "content", "media"}`` dict.
    Messages whose role is ``user`` or one of the observation roles are sent
    as ``"user"``; every other role is sent as ``"assistant"``.

    Args:
        model: Callable language model invoked with the formatted chat.
        chat: Conversation so far; a deep copy is used, the input is untouched.
        kwargs: Extra keyword arguments forwarded to ``model`` (``None`` means
            no extra arguments).

    Returns:
        Whatever ``model`` returns, typed as ``str``.
    """
    user_side_roles = ["user", "observation", "final_observation", "error_observation"]
    extra_args = kwargs or {}
    payload = [
        {
            "role": "user" if message.role in user_side_roles else "assistant",
            "content": message.content,
            "media": message.media,
        }
        for message in copy.deepcopy(chat)
    ]
    return cast(str, model(payload, **extra_args))  # type: ignore
|
63
|
+
|
64
|
+
|
65
|
+
def strip_signature(response: str) -> str:
    """Return ``response`` without its ``<signature>...</signature>`` span.

    The span to delete is rebuilt from the value ``extract_tag`` returns for
    the ``signature`` tag; when that lookup yields ``None`` the text is
    returned unchanged.
    """
    found = extract_tag(response, "signature")
    if found is None:
        return response
    return response.replace(f"<signature>{found}</signature>", "")
|
70
|
+
|
71
|
+
|
72
|
+
def strip_signature_from_agentmessage(
    response: AgentMessage,
) -> AgentMessage:
    """Build a new ``AgentMessage`` whose content has the signature span removed.

    Role and media are carried over as-is; only ``content`` is passed through
    ``strip_signature``.
    """
    cleaned_content = strip_signature(response.content)
    return AgentMessage(
        role=response.role,
        content=cleaned_content,
        media=response.media,
    )
|
80
|
+
|
81
|
+
|
82
|
+
def fix_xml_code_tags(response: str) -> str:
    """Normalise a model reply so code and answers are wrapped in XML tags.

    Two repairs are applied, in this order:

    1. The first complete markdown fence opened with ```python is rewritten
       as ``<code>\\n...\\n</code>`` (the fenced text is stripped of
       surrounding whitespace). An unterminated fence is left alone.
    2. A missing ``</answer>`` and then a missing ``</code>`` are appended
       when the matching opening tag is present anywhere in the text.

    Args:
        response: Raw reply text.

    Returns:
        The repaired reply.
    """
    fence_open = "```python"
    fence_close = "```"

    open_at = response.find(fence_open)
    close_at = -1
    if open_at != -1:
        close_at = response.find(fence_close, open_at + len(fence_open))

    if close_at != -1:
        fenced = response[open_at + len(fence_open) : close_at].strip()
        before = response[:open_at]
        after = response[close_at + len(fence_close) :]
        response = f"{before}<code>\n{fenced}\n</code>{after}"

    # Close tags left open, e.g. when generation ended before the closing tag.
    for tag in ("answer", "code"):
        if f"<{tag}>" in response and f"</{tag}>" not in response:
            response += f"</{tag}>"

    return response
|
108
|
+
|
109
|
+
|
110
|
+
def strip_extra_content(response: str) -> str:
    """Keep only the thinking, signature and first code sections of a reply.

    When the reply contains a ``<code>`` tag, everything outside the first
    ``<thinking>...</thinking>``, ``<signature>...</signature>`` and
    ``<code>...</code>`` spans is dropped and the three spans are
    concatenated in that order. A reply without ``<code>`` is returned
    unchanged.

    A thinking or signature span is included only when both its opening and
    closing tags are present; previously a missing tag turned ``find``'s -1
    into a slice bound and leaked unrelated characters into the result. If
    ``</code>`` is missing, the code section runs to the end of the reply.

    Args:
        response: Reply text, normally already passed through
            ``fix_xml_code_tags``.

    Returns:
        The trimmed reply.
    """
    code_start = response.find("<code>")
    if code_start == -1:
        return response

    def _section(tag: str) -> str:
        # Return the first complete <tag>...</tag> span, or "" if incomplete.
        opening, closing = f"<{tag}>", f"</{tag}>"
        start = response.find(opening)
        if start == -1:
            return ""
        end = response.find(closing, start)
        if end == -1:
            return ""
        return response[start : end + len(closing)]

    code_end = response.find("</code>", code_start)
    if code_end == -1:
        code_section = response[code_start:]
    else:
        code_section = response[code_start : code_end + len("</code>")]

    return _section("thinking") + _section("signature") + code_section
|
129
|
+
|
130
|
+
|
131
|
+
def run_code(
    code: str,
    code_interpreter: CodeInterpreter,
) -> Tuple[str, List[str], float]:
    """Execute ``code`` in the interpreter and collect its output and images.

    Install commands are removed from ``code`` and the default imports are
    prepended before execution.

    Args:
        code: Source text produced by the model.
        code_interpreter: Interpreter whose ``exec_cell`` runs the snippet.

    Returns:
        A tuple ``(obs, images, seconds)``: the stripped execution text
        (logs included) with a note appended when images were produced, at
        most ``MAX_IMAGES`` captured images, and the wall-clock time spent
        around the ``exec_cell`` call.
    """
    cleaned = remove_installs_from_code(code)

    started = time.time()
    execution = code_interpreter.exec_cell(DefaultImports.prepend_imports(cleaned))
    finished = time.time()

    obs = execution.text(include_logs=True).strip()
    captured = capture_media_from_exec(execution)
    if not captured:
        return obs, [], finished - started

    # Slicing already caps at the list length, so no explicit min() is needed.
    kept = captured[:MAX_IMAGES]
    obs += f"\n\n[{len(kept)} images were generated by your code and are included with this message]"
    return obs, kept, finished - started
|
151
|
+
|
152
|
+
|
153
|
+
def format_obs_message(
    obs: str,
    turn: int,
    turns: int,
) -> str:
    """Prefix an execution result with its turn counter.

    Args:
        obs: Text produced by the code execution.
        turn: Zero-based index of the current turn.
        turns: Total number of turns available.

    Returns:
        ``obs`` under a ``[Turn i/N]`` header (one-based ``i``). When exactly
        one turn remains after this one, a final-turn warning is appended.
    """
    pieces = [f"[Turn {turn + 1}/{turns}] Code execution result:\n{obs}"]
    one_turn_left = turns - turn == 2
    if one_turn_left:
        pieces.append(
            "\n\n⚠️CRITICAL: The next turn will be your FINAL turn. Please make sure to provide your final answer in <answer> tags in your next response, no need to incude <code> tags. Rember to print out final answers without any explaination, it could be a single word, number, price or a list of bounding boxes of object detection."
        )
    return "".join(pieces)
|
163
|
+
|
164
|
+
|
165
|
+
class VisionAgentV3(Agent):
    """Agent that alternates model turns with code execution until it answers.

    On each turn the model reply is normalised, recorded and reported through
    ``update_callback``. A reply containing ``<answer>`` ends the run; a reply
    containing ``<code>`` is executed and the result is fed back as an
    observation for the next turn.
    """

    def __init__(
        self,
        agent: Optional[LMM] = None,
        hil: bool = False,
        verbose: bool = False,
        code_sandbox_runtime: Optional[str] = None,
        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
    ) -> None:
        """Configure the agent.

        Args:
            agent: Model to drive the conversation. When ``None`` an
                ``AnthropicLMM`` with thinking enabled and ``</code>`` /
                ``</answer>`` stop sequences is created.
            hil: Accepted for interface compatibility; not read by this class.
            verbose: When true, progress is printed to the console.
            code_sandbox_runtime: Passed to ``CodeInterpreterFactory`` when
                ``chat`` has to create its own interpreter.
            update_callback: Called with the ``model_dump()`` of each message
                as it is produced.
        """
        self.agent: LMM
        self.kwargs: Dict[str, Any]
        if agent is None:
            self.agent = AnthropicLMM(
                model_name="claude-3-7-sonnet-20250219", max_tokens=8192
            )
            self.kwargs = {
                "thinking": {"type": "enabled", "budget_tokens": 4096},
                "stop_sequences": ["</code>", "</answer>"],
            }
        else:
            # A caller-supplied model used to be silently dropped, leaving
            # self.agent unset. The thinking/stop-sequence options above are
            # specific to the default model, so none are forced on a custom one.
            # NOTE(review): confirm a custom model should not receive them.
            self.agent = agent
            self.kwargs = {}

        self.turns = 7
        self.verbose = verbose
        self.code_sandbox_runtime = code_sandbox_runtime
        self.update_callback = update_callback

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Run a conversation and return the content of its last message."""
        msg = convert_message_to_agentmessage(input, media)
        return self.chat(msg)[-1].content

    def chat(
        self,
        chat: List[AgentMessage],
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> List[AgentMessage]:
        """Run up to ``self.turns`` model turns for the given conversation.

        Args:
            chat: Conversation whose last message must have role ``user`` or
                ``interaction_response``. The input list is not modified.
            code_interpreter: Interpreter to execute code with; a new one is
                created from ``self.code_sandbox_runtime`` when ``None``.

        Returns:
            The messages exchanged in this run, starting with the generated
            initial prompt.

        Raises:
            ValueError: If ``chat`` is empty or its last message has an
                unsupported role.
        """
        # Checked separately so an empty chat does not hit chat[-1] while the
        # error message is being built.
        if not chat:
            raise ValueError("Chat must contain at least one message.")
        if chat[-1].role not in {"user", "interaction_response"}:
            raise ValueError(
                f"Last chat message must be from the user or interaction_response, got {chat[-1].role}."
            )
        chat = copy.deepcopy(chat)

        return_chat = []
        with (
            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
            if code_interpreter is None
            else code_interpreter
        ) as code_interpreter:
            int_chat, _, _ = add_media_to_chat(
                chat, code_interpreter, append_to_prompt=False
            )
            init_prompt = get_init_prompt(
                model="",
                turns=self.turns,
                question=int_chat[0].content,
                category="",
                image_path=str(int_chat[0].media),
            )
            return_chat.append(
                AgentMessage(role="user", content=init_prompt, media=int_chat[0].media)
            )

            for turn in range(self.turns):
                response = run_chat(self.agent, return_chat, self.kwargs)
                response = fix_xml_code_tags(response)
                response = strip_extra_content(response)

                return_chat.append(AgentMessage(role="assistant", content=response))
                self.update_callback(
                    strip_signature_from_agentmessage(return_chat[-1]).model_dump()
                )

                code = extract_tag(response, "code")
                thoughts = extract_tag(response, "thinking")
                answer = extract_tag(response, "answer")

                if self.verbose:
                    # Model text is escaped so bracketed content is not parsed
                    # as console markup.
                    _CONSOLE.print(
                        f"[bold cyan]Step {turn}/{self.turns}[/bold cyan]\n"
                        f"[green]{escape(str(thoughts))}[/green]\n"
                    )
                    if answer is not None:
                        _CONSOLE.print(
                            f"[magenta]Final answer: {escape(answer)}[/magenta]\n"
                        )
                    if code is not None:
                        print_code("Code:", code)

                if answer is not None:
                    # The final answer already lives in the assistant message
                    # appended above, so it is only reported, not re-added.
                    self.update_callback(
                        AgentMessage(
                            role="final_observation",
                            content=f"<answer>{answer}</answer>",
                        ).model_dump()
                    )
                    # Stop here; otherwise the model would be queried again
                    # after it has already answered.
                    break
                elif code is not None:
                    obs, images, latency = run_code(code, code_interpreter)
                    obs = format_obs_message(obs, turn, self.turns)
                    if self.verbose:
                        _CONSOLE.print(
                            f"[bold cyan]Code execution took {latency:.2f} seconds.[/bold cyan]\n"
                            f"[yellow]{escape(obs)}[/yellow]\n"
                        )
                    return_chat.append(
                        AgentMessage(role="observation", content=obs, media=images)
                    )
                    self.update_callback(
                        strip_signature_from_agentmessage(return_chat[-1]).model_dump()
                    )
                # NOTE(review): a reply with neither <code> nor <answer> falls
                # through and the model is asked again with that reply as the
                # last message - confirm this is intended.
        return return_chat

    def log_progress(self, data: Dict[str, Any]) -> None:
        """No-op progress hook."""
        pass
|
vision_agent/lmm/lmm.py
CHANGED
@@ -1,19 +1,33 @@
|
|
1
|
+
import base64
|
1
2
|
import json
|
2
3
|
import os
|
3
4
|
from abc import ABC, abstractmethod
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
-
|
6
|
+
from typing import (
|
7
|
+
Any,
|
8
|
+
Dict,
|
9
|
+
Iterator,
|
10
|
+
List,
|
11
|
+
Optional,
|
12
|
+
Sequence,
|
13
|
+
Union,
|
14
|
+
cast,
|
15
|
+
)
|
7
16
|
|
8
17
|
import anthropic
|
9
18
|
import requests
|
10
|
-
from anthropic.types import
|
11
|
-
|
12
|
-
|
19
|
+
from anthropic.types import (
|
20
|
+
ImageBlockParam,
|
21
|
+
MessageParam,
|
22
|
+
TextBlockParam,
|
23
|
+
ThinkingBlockParam,
|
24
|
+
)
|
13
25
|
from google import genai # type: ignore
|
14
26
|
from google.genai import types # type: ignore
|
27
|
+
from openai import AzureOpenAI, OpenAI
|
15
28
|
|
16
29
|
from vision_agent.models import Message
|
30
|
+
from vision_agent.utils.agent import extract_tag
|
17
31
|
from vision_agent.utils.image_utils import encode_media
|
18
32
|
|
19
33
|
|
@@ -99,11 +113,15 @@ class OpenAILMM(LMM):
|
|
99
113
|
[{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
|
100
114
|
"""
|
101
115
|
fixed_chat = []
|
102
|
-
for
|
103
|
-
fixed_c = {"role":
|
104
|
-
fixed_c["content"] = [{"type": "text", "text":
|
105
|
-
if
|
106
|
-
|
116
|
+
for msg in chat:
|
117
|
+
fixed_c = {"role": msg["role"]}
|
118
|
+
fixed_c["content"] = [{"type": "text", "text": msg["content"]}] # type: ignore
|
119
|
+
if (
|
120
|
+
"media" in msg
|
121
|
+
and msg["media"] is not None
|
122
|
+
and self.model_name != "o3-mini"
|
123
|
+
):
|
124
|
+
for media in msg["media"]:
|
107
125
|
resize = kwargs["resize"] if "resize" in kwargs else self.image_size
|
108
126
|
image_detail = (
|
109
127
|
kwargs["image_detail"]
|
@@ -297,14 +315,14 @@ class OllamaLMM(LMM):
|
|
297
315
|
[{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
|
298
316
|
"""
|
299
317
|
fixed_chat = []
|
300
|
-
for
|
301
|
-
if "media" in
|
318
|
+
for msg in chat:
|
319
|
+
if "media" in msg and msg["media"] is not None:
|
302
320
|
resize = kwargs["resize"] if "resize" in kwargs else self.image_size
|
303
|
-
|
304
|
-
encode_media(cast(str, m), resize=resize) for m in
|
321
|
+
msg["images"] = [
|
322
|
+
encode_media(cast(str, m), resize=resize) for m in msg["media"]
|
305
323
|
]
|
306
|
-
del
|
307
|
-
fixed_chat.append(
|
324
|
+
del msg["media"]
|
325
|
+
fixed_chat.append(msg)
|
308
326
|
url = f"{self.url}/chat"
|
309
327
|
model = self.model_name
|
310
328
|
messages = fixed_chat
|
@@ -410,63 +428,207 @@ class AnthropicLMM(LMM):
|
|
410
428
|
|
411
429
|
def __call__(
    self,
    input: Union[str, Sequence[Message]],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Route a plain string to ``generate`` and a message sequence to ``chat``.

    Keyword arguments are forwarded unchanged to whichever method is used.
    """
    if not isinstance(input, str):
        return self.chat(input, **kwargs)
    return self.generate(input, **kwargs)
|
419
437
|
|
420
|
-
def
|
438
|
+
def create_thinking_assistant_message(
    self,
    msg_content: str,
) -> MessageParam:
    """Rebuild an assistant message with separate thinking and text blocks.

    The ``<thinking>`` and ``<signature>`` values are read with
    ``extract_tag``. A thinking block is emitted when thinking text was found
    (with the signature, or an empty string if none). Whatever remains after
    deleting the two tagged spans becomes a text block when it is non-blank.

    Args:
        msg_content: Assistant text that may contain the tagged spans.

    Returns:
        An assistant ``MessageParam`` holding the blocks in that order.
    """
    thinking_text = extract_tag(msg_content, "thinking")
    signature_text = extract_tag(msg_content, "signature")

    blocks: List[Union[TextBlockParam, ThinkingBlockParam]] = []
    if thinking_text:
        blocks.append(
            ThinkingBlockParam(
                type="thinking",
                thinking=thinking_text.strip(),
                signature=signature_text.strip() if signature_text else "",
            )
        )

    # Remove the tagged spans; what is left is the visible reply text.
    remainder = msg_content.replace(f"<thinking>{thinking_text}</thinking>", "")
    if signature_text:
        remainder = remainder.replace(f"<signature>{signature_text}</signature>", "")

    remainder = remainder.strip()
    if remainder:
        blocks.append(TextBlockParam(type="text", text=remainder))
    return MessageParam(role="assistant", content=blocks)
|
465
|
+
|
466
|
+
def _setup_chat_kwargs(self, kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], bool]:
    """Merge call-time options over the instance defaults and detect thinking mode.

    Args:
        kwargs: Options supplied for this call; they override ``self.kwargs``.

    Returns:
        ``(merged, thinking_enabled)``. ``merged`` is a new dict; when the
        merged ``thinking`` option has ``type == "enabled"``, ``temperature``
        is set to ``1.0`` in it.
    """
    merged = {**self.kwargs, **kwargs}

    thinking_enabled = False
    if "thinking" in merged:
        thinking_cfg = merged["thinking"]
        thinking_enabled = "type" in thinking_cfg and thinking_cfg["type"] == "enabled"

    if thinking_enabled:
        merged["temperature"] = 1.0
    return merged, thinking_enabled
|
477
|
+
|
478
|
+
def _convert_messages_to_anthropic_format(
    self, chat: Sequence[Message], thinking_enabled: bool, **kwargs: Any
) -> List[MessageParam]:
    """Translate chat messages into Anthropic ``MessageParam`` entries.

    User messages become a text block followed by one base64 image block per
    media item (each encoded with ``encode_media`` and sent with media type
    ``image/png``). Assistant messages become either a thinking-aware message
    (when ``thinking_enabled``) or a single text block.

    Args:
        chat: Messages with ``role``, ``content`` and optional ``media``.
        thinking_enabled: Whether assistant text should be split into
            thinking and text blocks.
        **kwargs: Only ``resize`` is read here; it overrides
            ``self.image_size`` for image encoding.

    Returns:
        The converted messages, in input order.

    Raises:
        ValueError: If a message has a role other than ``user`` or
            ``assistant``.
    """
    data_uri_prefix = "data:image/png;base64,"
    converted: List[MessageParam] = []

    for msg in chat:
        role = msg["role"]

        if role == "user":
            blocks: List[Union[TextBlockParam, ImageBlockParam]] = [
                TextBlockParam(type="text", text=cast(str, msg["content"]))
            ]
            media_items = msg["media"] if "media" in msg else None
            if media_items is not None:
                resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                for media_path in media_items:
                    payload = encode_media(cast(str, media_path), resize=resize)
                    # The API expects raw base64 without the data-URI header.
                    payload = payload.removeprefix(data_uri_prefix)
                    blocks.append(
                        ImageBlockParam(
                            type="image",
                            source={
                                "type": "base64",
                                "media_type": "image/png",
                                "data": payload,
                            },
                        )
                    )
            converted.append({"role": "user", "content": blocks})
            continue

        if role != "assistant":
            raise ValueError(
                f"Unsupported role {msg['role']}. Only 'user' and 'assistant' roles are supported."
            )

        assistant_text = cast(str, msg["content"])
        if thinking_enabled:
            converted.append(self.create_thinking_assistant_message(assistant_text))
        else:
            converted.append(
                MessageParam(
                    role="assistant",
                    content=[{"type": "text", "text": assistant_text}],
                )
            )

    return converted
|
534
|
+
|
535
|
+
def _handle_streaming_response(
    self, stream_response: anthropic.Stream[anthropic.MessageStreamEvent]
) -> Iterator[Optional[str]]:
    """Turn an Anthropic event stream into an iterator of text fragments.

    Thinking and signature deltas are wrapped in ``<thinking>`` and
    ``<signature>`` tags; text deltas are yielded as-is. Whenever the kind of
    delta changes, the tag that was open is closed (``</tag>`` plus a
    newline) before the next section starts, so the assembled text is
    ``<thinking>..</thinking>\\n<signature>..</signature>\\ntext``.
    Previously a thinking section followed by a signature was never closed
    in the right place and ``</signature>`` was never emitted.

    Args:
        stream_response: Streaming response returned by the Anthropic client.

    Returns:
        An iterator yielding fragments, then ``None`` at ``message_stop``. A
        tag still open at that point is closed first.
    """

    def f() -> Iterator[Optional[str]]:
        # Name of the tag currently open ("thinking" / "signature"), if any.
        open_tag: Optional[str] = None
        for chunk in stream_response:
            if chunk.type == "content_block_delta":
                delta = chunk.delta
                if delta.type == "text_delta":
                    tag, payload = None, delta.text
                elif delta.type == "thinking_delta":
                    tag, payload = "thinking", delta.thinking
                elif delta.type == "signature_delta":
                    tag, payload = "signature", delta.signature
                else:
                    # Other delta kinds carry nothing to emit.
                    continue

                prefix = ""
                if tag != open_tag:
                    if open_tag is not None:
                        prefix += f"</{open_tag}>\n"
                    if tag is not None:
                        prefix += f"<{tag}>"
                    open_tag = tag
                yield prefix + payload
            elif chunk.type == "message_stop":
                if open_tag is not None:
                    # The stream ended inside a tagged section.
                    yield f"</{open_tag}>"
                    open_tag = None
                yield None

    return f()
|
572
|
+
|
573
|
+
def _format_thinking_response(self, msg_response: anthropic.types.Message) -> str:
    """Serialise a thinking-mode reply into tagged text.

    Thinking, text and redacted-thinking blocks are concatenated per kind in
    the order they appear; the last non-empty thinking signature wins.

    Args:
        msg_response: Complete (non-streaming) Anthropic message.

    Returns:
        ``<thinking>..</thinking>`` and a newline, then the optional
        ``<redacted_thinking>`` and ``<signature>`` lines, then the text.
    """
    thinking_parts: List[str] = []
    text_parts: List[str] = []
    redacted_parts: List[str] = []
    signature = ""

    for block in msg_response.content:
        kind = block.type
        if kind == "thinking":
            thinking_parts.append(block.thinking)
            if block.signature:
                signature = block.signature
        elif kind == "text":
            text_parts.append(block.text)
        elif kind == "redacted_thinking":
            redacted_parts.append(block.data)

    thinking = "".join(thinking_parts)
    redacted_thinking = "".join(redacted_parts)

    pieces = [f"<thinking>{thinking}</thinking>\n"]
    if redacted_thinking:
        pieces.append(f"<redacted_thinking>{redacted_thinking}</redacted_thinking>\n")
    if signature:
        pieces.append(f"<signature>{signature}</signature>\n")
    pieces.append("".join(text_parts))
    return "".join(pieces)
|
453
|
-
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
454
598
|
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
yield chunk.delta.text
|
464
|
-
elif chunk.type == "message_stop":
|
465
|
-
yield None
|
599
|
+
def _handle_non_streaming_response(
    self, response_untyped: Any, thinking_enabled: bool
) -> str:
    """Extract the reply text from a complete (non-streaming) response.

    Args:
        response_untyped: Object returned by the client, treated as an
            Anthropic ``Message``.
        thinking_enabled: When true the reply is serialised with thinking
            tags; otherwise the text of the first content block is returned.

    Returns:
        The reply as a string.
    """
    message = cast(anthropic.types.Message, response_untyped)
    if not thinking_enabled:
        first_block = cast(anthropic.types.TextBlock, message.content[0])
        return first_block.text
    return self._format_thinking_response(message)
|
466
607
|
|
467
|
-
|
608
|
+
def chat(
    self,
    chat: Sequence[Message],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Send a conversation to the Anthropic messages API.

    Args:
        chat: Messages to send.
        **kwargs: Per-call options merged over the instance defaults.
            ``resize`` is used only for image encoding; everything else is
            forwarded to ``client.messages.create``.

    Returns:
        An iterator of fragments when the merged options enable ``stream``,
        otherwise the reply text.
    """
    tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
    messages = self._convert_messages_to_anthropic_format(
        chat, thinking_enabled, **kwargs
    )

    # "resize" is an image-preprocessing option consumed by the converter
    # above, not a request parameter, so it is dropped before the API call.
    # NOTE(review): assumes messages.create rejects unknown keywords -
    # confirm against the installed anthropic SDK.
    tmp_kwargs.pop("resize", None)

    response_untyped = self.client.messages.create(
        model=self.model_name, messages=messages, **tmp_kwargs
    )

    is_stream = bool(tmp_kwargs.get("stream", False))
    if is_stream:
        stream_response = cast(
            anthropic.Stream[anthropic.MessageStreamEvent], response_untyped
        )
        return self._handle_streaming_response(stream_response)
    else:
        return self._handle_non_streaming_response(
            response_untyped, thinking_enabled
        )
|
470
632
|
|
471
633
|
def generate(
|
472
634
|
self,
|
vision_agent/tools/__init__.py
CHANGED
@@ -21,7 +21,7 @@ from .tools import (
|
|
21
21
|
countgd_sam2_visual_instance_segmentation,
|
22
22
|
countgd_visual_object_detection,
|
23
23
|
custom_object_detection,
|
24
|
-
|
24
|
+
depth_pro,
|
25
25
|
detr_segmentation,
|
26
26
|
document_extraction,
|
27
27
|
document_qa,
|
@@ -42,7 +42,6 @@ from .tools import (
|
|
42
42
|
glee_sam2_video_tracking,
|
43
43
|
load_image,
|
44
44
|
minimum_distance,
|
45
|
-
ocr,
|
46
45
|
od_sam2_video_tracking,
|
47
46
|
overlay_bounding_boxes,
|
48
47
|
overlay_heat_map,
|
@@ -50,6 +49,7 @@ from .tools import (
|
|
50
49
|
owlv2_object_detection,
|
51
50
|
owlv2_sam2_instance_segmentation,
|
52
51
|
owlv2_sam2_video_tracking,
|
52
|
+
paddle_ocr,
|
53
53
|
qwen2_vl_images_vqa,
|
54
54
|
qwen2_vl_video_vqa,
|
55
55
|
qwen25_vl_images_vqa,
|
@@ -74,7 +74,7 @@ def register_tool(imports: Optional[List] = None) -> Callable:
|
|
74
74
|
def decorator(tool: Callable) -> Callable:
|
75
75
|
import inspect
|
76
76
|
|
77
|
-
global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
|
77
|
+
global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO # noqa: F824
|
78
78
|
from vision_agent.tools.tools import TOOLS
|
79
79
|
|
80
80
|
if tool not in TOOLS: # type: ignore
|