vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,278 @@
1
+ import copy
2
+ import re
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
6
+
7
+ from rich.console import Console
8
+ from rich.markup import escape
9
+
10
+ from vision_agent.agent import Agent
11
+ from vision_agent.agent.vision_agent_prompts_v3 import get_init_prompt
12
+ from vision_agent.configs import Config
13
+ from vision_agent.lmm import LMM, AnthropicLMM
14
+ from vision_agent.models import AgentMessage, Message
15
+ from vision_agent.utils.agent import (
16
+ add_media_to_chat,
17
+ capture_media_from_exec,
18
+ convert_message_to_agentmessage,
19
+ extract_tag,
20
+ print_code,
21
+ remove_installs_from_code,
22
+ )
23
+ from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
24
+
25
+ CONFIG = Config()
26
+ MAX_IMAGES = 10
27
+ _CONSOLE = Console()
28
+
29
+
30
class DefaultImports:
    """Import preamble placed in front of every code snippet the agent executes."""

    # Order matters: ``register_heif_opener()`` must come after its import line.
    imports = [
        "import os",
        "import numpy as np",
        "import cv2",
        "from typing import *",
        "from pillow_heif import register_heif_opener",
        "from vision_agent.tools import load_image",
        "from vision_agent.tools.planner_v3_tools import instance_segmentation, ocr, depth_estimation, visualize_bounding_boxes, visualize_segmentation_masks, get_crops, rotate_90, display_image, iou",
        "register_heif_opener()",
        "import matplotlib.pyplot as plt",
    ]

    @staticmethod
    def prepend_imports(code: str) -> str:
        """Return ``code`` preceded by the default imports and one blank line."""
        preamble = "\n".join(DefaultImports.imports)
        return f"{preamble}\n\n{code}"
46
+
47
+
48
def run_chat(
    model: LMM,
    chat: List[AgentMessage],
    kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    """Send the conversation to ``model`` and return its reply.

    Agent-specific roles are collapsed into the two roles passed to the model:
    user, observation, final_observation and error_observation become
    ``"user"``; every other role becomes ``"assistant"``.

    Args:
        model: Callable LMM invoked with the formatted message list.
        chat: Conversation so far; it is deep-copied and never mutated.
        kwargs: Optional extra keyword arguments forwarded to ``model``.

    Returns:
        The value returned by ``model``, typed as ``str``.
    """
    user_like_roles = ("user", "observation", "final_observation", "error_observation")
    payload = [
        {
            "role": "user" if message.role in user_like_roles else "assistant",
            "content": message.content,
            "media": message.media,
        }
        for message in copy.deepcopy(chat)
    ]
    extra_kwargs = kwargs if kwargs else {}
    return cast(str, model(payload, **extra_kwargs))  # type: ignore
63
+
64
+
65
def strip_signature(response: str) -> str:
    """Return ``response`` with its ``<signature>...</signature>`` block removed.

    The text is returned unchanged when ``extract_tag`` finds no signature.
    """
    signature = extract_tag(response, "signature")
    if signature is None:
        return response
    return response.replace(f"<signature>{signature}</signature>", "")
70
+
71
+
72
def strip_signature_from_agentmessage(
    response: AgentMessage,
) -> AgentMessage:
    """Build a new ``AgentMessage`` whose content has the signature block removed.

    Role and media are carried over from ``response`` as-is.
    """
    cleaned_content = strip_signature(response.content)
    return AgentMessage(
        role=response.role,
        content=cleaned_content,
        media=response.media,
    )
80
+
81
+
82
def fix_xml_code_tags(response: str) -> str:
    """Normalise the code/answer markup of a model response.

    The first complete markdown python fence is rewritten as a ``<code>``
    block. Afterwards a missing ``</answer>`` and then a missing ``</code>``
    are appended when the matching opening tag is present.
    """
    before, fence_open, after_open = response.partition("```python")
    if fence_open:
        body, fence_close, remainder = after_open.partition("```")
        # Only rewrite when the fence is actually closed.
        if fence_close:
            response = f"{before}<code>\n{body.strip()}\n</code>{remainder}"

    # Close dangling tags; the order (answer first, then code) is significant
    # because both closers may be appended to the same response.
    for tag in ("answer", "code"):
        if f"<{tag}>" in response and f"</{tag}>" not in response:
            response += f"</{tag}>"

    return response
108
+
109
+
110
def strip_extra_content(response: str) -> str:
    """Reduce a code-bearing response to its thinking, signature and code blocks.

    When the response contains a complete ``<code>...</code>`` block, the
    result is the first ``<thinking>`` block, the first ``<signature>`` block
    and the first ``<code>`` block concatenated in that order; everything else
    is dropped. A thinking or signature block that is absent or not closed
    contributes nothing. Responses without a complete code block are returned
    unchanged.

    Args:
        response: Raw model response text.

    Returns:
        The trimmed response, or ``response`` itself when there is no
        complete code block.
    """

    def _section(tag: str) -> str:
        # Return "<tag>...</tag>" for the first occurrence, or "" when either
        # tag is missing. Checking both indices avoids slicing with -1, which
        # would silently produce unrelated fragments of the response.
        opening, closing = f"<{tag}>", f"</{tag}>"
        start = response.find(opening)
        if start == -1:
            return ""
        end = response.find(closing, start)
        if end == -1:
            return ""
        return response[start : end + len(closing)]

    code_section = _section("code")
    if not code_section:
        return response
    return _section("thinking") + _section("signature") + code_section
129
+
130
+
131
def run_code(
    code: str,
    code_interpreter: CodeInterpreter,
) -> Tuple[str, List[str], float]:
    """Execute ``code`` in the interpreter and collect its output.

    Install commands are removed from the code and the default imports are
    prepended before execution. At most ``MAX_IMAGES`` captured images are
    returned; when any image is returned, a note with the count is appended
    to the textual observation.

    Args:
        code: Source code produced by the model.
        code_interpreter: Interpreter used to run the cell.

    Returns:
        A tuple ``(observation, images, seconds)`` where ``seconds`` is the
        wall-clock time spent in ``exec_cell``.
    """
    cleaned_code = remove_installs_from_code(code)
    started_at = time.time()
    execution = code_interpreter.exec_cell(DefaultImports.prepend_imports(cleaned_code))
    finished_at = time.time()
    elapsed = finished_at - started_at

    observation = execution.text(include_logs=True).strip()
    captured = capture_media_from_exec(execution)
    if not captured:
        return observation, [], elapsed

    # Slicing already caps the list at MAX_IMAGES when fewer images exist.
    kept_images = captured[:MAX_IMAGES]
    observation += f"\n\n[{len(kept_images)} images were generated by your code and are included with this message]"
    return observation, kept_images, elapsed
151
+
152
+
153
def format_obs_message(
    obs: str,
    turn: int,
    turns: int,
) -> str:
    """Wrap an execution result in the per-turn observation message.

    Args:
        obs: Text produced by the code execution.
        turn: Zero-based index of the current turn.
        turns: Total number of turns available.

    Returns:
        The observation prefixed with a 1-based turn counter. On the
        second-to-last turn a warning that the next turn is the final one is
        appended.
    """
    message = f"[Turn {turn + 1}/{turns}] Code execution result:\n{obs}"
    next_turn_is_last = turn == turns - 2
    if not next_turn_is_last:
        return message
    return (
        message
        + "\n\n⚠️CRITICAL: The next turn will be your FINAL turn. Please make sure to provide your final answer in <answer> tags in your next response, no need to incude <code> tags. Rember to print out final answers without any explaination, it could be a single word, number, price or a list of bounding boxes of object detection."
    )
163
+
164
+
165
class VisionAgentV3(Agent):
    """Agent that alternates LMM responses with code execution for a fixed number of turns.

    On every turn the model is queried with the conversation so far. A
    response containing an ``<answer>`` tag ends the run; a response
    containing a ``<code>`` tag is executed and its output is fed back as an
    observation message.
    """

    def __init__(
        self,
        agent: Optional[LMM] = None,
        hil: bool = False,
        verbose: bool = False,
        code_sandbox_runtime: Optional[str] = None,
        update_callback: Callable[[Dict[str, Any]], None] = lambda x: None,
    ) -> None:
        """Create the agent.

        Args:
            agent: LMM used to generate responses. When omitted, an
                ``AnthropicLMM`` with extended thinking and ``</code>`` /
                ``</answer>`` stop sequences is created.
            hil: Accepted for interface compatibility; not used by this class.
            verbose: When True, progress is printed to the console.
            code_sandbox_runtime: Passed to
                ``CodeInterpreterFactory.new_instance`` when ``chat`` is not
                given an interpreter.
            update_callback: Called with the ``model_dump()`` of every message
                produced during ``chat``.
        """
        self.kwargs: Dict[str, Any]
        if agent is None:
            self.agent: LMM = AnthropicLMM(
                model_name="claude-3-7-sonnet-20250219", max_tokens=8192
            )
            self.kwargs = {
                "thinking": {"type": "enabled", "budget_tokens": 4096},
                "stop_sequences": ["</code>", "</answer>"],
            }
        else:
            # A caller-supplied model must actually be stored and used. The
            # thinking/stop-sequence options above are specific to the default
            # model, so no extra call kwargs are forced onto a custom LMM.
            self.agent = agent
            self.kwargs = {}

        self.turns = 7
        self.verbose = verbose
        self.code_sandbox_runtime = code_sandbox_runtime
        self.update_callback = update_callback

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
    ) -> str:
        """Run ``chat`` on ``input`` and return the content of the last message."""
        msg = convert_message_to_agentmessage(input, media)
        return self.chat(msg)[-1].content

    def chat(
        self,
        chat: List[AgentMessage],
        code_interpreter: Optional[CodeInterpreter] = None,
    ) -> List[AgentMessage]:
        """Run the turn loop and return the messages it produced.

        Args:
            chat: Conversation whose last message must have the role ``user``
                or ``interaction_response``. The list is deep-copied.
            code_interpreter: Interpreter to execute code with. When omitted,
                a new one is created from ``code_sandbox_runtime``.

        Returns:
            The initial prompt message followed by the assistant and
            observation messages generated during the run.

        Raises:
            ValueError: If ``chat`` is empty or its last message has an
                unsupported role.
        """
        # Check emptiness separately: indexing chat[-1] to build the error
        # message would otherwise raise IndexError on an empty chat.
        if not chat:
            raise ValueError("chat must contain at least one message.")
        chat = copy.deepcopy(chat)
        if chat[-1].role not in {"user", "interaction_response"}:
            raise ValueError(
                f"Last chat message must be from the user or interaction_response, got {chat[-1].role}."
            )

        return_chat: List[AgentMessage] = []
        with (
            CodeInterpreterFactory.new_instance(self.code_sandbox_runtime)
            if code_interpreter is None
            else code_interpreter
        ) as code_interpreter:
            int_chat, _, _ = add_media_to_chat(
                chat, code_interpreter, append_to_prompt=False
            )
            # The prompt is built from the first message of the processed chat.
            init_prompt = get_init_prompt(
                model="",
                turns=self.turns,
                question=int_chat[0].content,
                category="",
                image_path=str(int_chat[0].media),
            )
            return_chat.append(
                AgentMessage(role="user", content=init_prompt, media=int_chat[0].media)
            )

            for turn in range(self.turns):
                response = run_chat(self.agent, return_chat, self.kwargs)
                response = fix_xml_code_tags(response)
                response = strip_extra_content(response)

                return_chat.append(AgentMessage(role="assistant", content=response))
                self.update_callback(
                    strip_signature_from_agentmessage(return_chat[-1]).model_dump()
                )

                code = extract_tag(response, "code")
                thoughts = extract_tag(response, "thinking")
                answer = extract_tag(response, "answer")

                if self.verbose:
                    # Model text is escaped so stray "[...]" sequences are not
                    # interpreted as console markup.
                    _CONSOLE.print(
                        f"[bold cyan]Step {turn + 1}/{self.turns}[/bold cyan]\n"
                        f"[green]{escape(thoughts or '')}[/green]\n"
                    )
                    if answer is not None:
                        _CONSOLE.print(
                            f"[magenta]Final answer: {escape(answer)}[/magenta]\n"
                        )
                    if code is not None:
                        print_code("Code:", code)

                if answer is not None:
                    # The final answer is already part of the assistant message
                    # appended above, so it is only reported via the callback.
                    self.update_callback(
                        AgentMessage(
                            role="final_observation",
                            content=f"<answer>{answer}</answer>",
                        ).model_dump()
                    )
                    # Stop here: further turns would query the model again
                    # after it has already answered.
                    break
                elif code is not None:
                    obs, images, latency = run_code(code, code_interpreter)
                    obs = format_obs_message(obs, turn, self.turns)
                    if self.verbose:
                        _CONSOLE.print(
                            f"[bold cyan]Code execution took {latency:.2f} seconds.[/bold cyan]\n"
                            f"[yellow]{escape(obs)}[/yellow]\n"
                        )
                    return_chat.append(
                        AgentMessage(role="observation", content=obs, media=images)
                    )
                    self.update_callback(
                        strip_signature_from_agentmessage(return_chat[-1]).model_dump()
                    )
                # With neither tag present nothing else is appended and the
                # next turn queries the model with this response in history.
        return return_chat

    def log_progress(self, data: Dict[str, Any]) -> None:
        """No-op; progress is reported through ``update_callback`` instead."""
        pass
vision_agent/lmm/lmm.py CHANGED
@@ -1,19 +1,33 @@
1
+ import base64
1
2
  import json
2
3
  import os
3
4
  from abc import ABC, abstractmethod
4
5
  from pathlib import Path
5
- from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
6
- import base64
6
+ from typing import (
7
+ Any,
8
+ Dict,
9
+ Iterator,
10
+ List,
11
+ Optional,
12
+ Sequence,
13
+ Union,
14
+ cast,
15
+ )
7
16
 
8
17
  import anthropic
9
18
  import requests
10
- from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
11
- from openai import AzureOpenAI, OpenAI
12
-
19
+ from anthropic.types import (
20
+ ImageBlockParam,
21
+ MessageParam,
22
+ TextBlockParam,
23
+ ThinkingBlockParam,
24
+ )
13
25
  from google import genai # type: ignore
14
26
  from google.genai import types # type: ignore
27
+ from openai import AzureOpenAI, OpenAI
15
28
 
16
29
  from vision_agent.models import Message
30
+ from vision_agent.utils.agent import extract_tag
17
31
  from vision_agent.utils.image_utils import encode_media
18
32
 
19
33
 
@@ -99,11 +113,15 @@ class OpenAILMM(LMM):
99
113
  [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
100
114
  """
101
115
  fixed_chat = []
102
- for c in chat:
103
- fixed_c = {"role": c["role"]}
104
- fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
105
- if "media" in c and self.model_name != "o3-mini":
106
- for media in c["media"]:
116
+ for msg in chat:
117
+ fixed_c = {"role": msg["role"]}
118
+ fixed_c["content"] = [{"type": "text", "text": msg["content"]}] # type: ignore
119
+ if (
120
+ "media" in msg
121
+ and msg["media"] is not None
122
+ and self.model_name != "o3-mini"
123
+ ):
124
+ for media in msg["media"]:
107
125
  resize = kwargs["resize"] if "resize" in kwargs else self.image_size
108
126
  image_detail = (
109
127
  kwargs["image_detail"]
@@ -297,14 +315,14 @@ class OllamaLMM(LMM):
297
315
  [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
298
316
  """
299
317
  fixed_chat = []
300
- for message in chat:
301
- if "media" in message:
318
+ for msg in chat:
319
+ if "media" in msg and msg["media"] is not None:
302
320
  resize = kwargs["resize"] if "resize" in kwargs else self.image_size
303
- message["images"] = [
304
- encode_media(cast(str, m), resize=resize) for m in message["media"]
321
+ msg["images"] = [
322
+ encode_media(cast(str, m), resize=resize) for m in msg["media"]
305
323
  ]
306
- del message["media"]
307
- fixed_chat.append(message)
324
+ del msg["media"]
325
+ fixed_chat.append(msg)
308
326
  url = f"{self.url}/chat"
309
327
  model = self.model_name
310
328
  messages = fixed_chat
@@ -410,63 +428,207 @@ class AnthropicLMM(LMM):
410
428
 
411
429
  def __call__(
412
430
  self,
413
- input: Union[str, Sequence[Dict[str, Any]]],
431
+ input: Union[str, Sequence[Message]],
414
432
  **kwargs: Any,
415
433
  ) -> Union[str, Iterator[Optional[str]]]:
416
434
  if isinstance(input, str):
417
435
  return self.generate(input, **kwargs)
418
436
  return self.chat(input, **kwargs)
419
437
 
420
- def chat(
438
+ def create_thinking_assistant_message(
421
439
  self,
422
- chat: Sequence[Dict[str, Any]],
423
- **kwargs: Any,
424
- ) -> Union[str, Iterator[Optional[str]]]:
440
+ msg_content: str,
441
+ ) -> MessageParam:
442
+ content: List[Union[TextBlockParam, ThinkingBlockParam]] = []
443
+ thinking_content = extract_tag(msg_content, "thinking")
444
+ signature = extract_tag(msg_content, "signature")
445
+ if thinking_content:
446
+ content.append(
447
+ ThinkingBlockParam(
448
+ type="thinking",
449
+ thinking=thinking_content.strip(),
450
+ signature=signature.strip() if signature else "",
451
+ )
452
+ )
453
+ signature_content = extract_tag(msg_content, "signature")
454
+ if signature_content:
455
+ text_content = msg_content.replace(
456
+ f"<thinking>{thinking_content}</thinking>", ""
457
+ ).replace(f"<signature>{signature_content}</signature>", "")
458
+ else:
459
+ text_content = msg_content.replace(
460
+ f"<thinking>{thinking_content}</thinking>", ""
461
+ )
462
+ if text_content.strip():
463
+ content.append(TextBlockParam(type="text", text=text_content.strip()))
464
+ return MessageParam(role="assistant", content=content)
465
+
466
+ def _setup_chat_kwargs(self, kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], bool]:
467
+ """Set up kwargs and determine if thinking mode is enabled."""
468
+ tmp_kwargs = self.kwargs | kwargs
469
+ thinking_enabled = (
470
+ "thinking" in tmp_kwargs
471
+ and "type" in tmp_kwargs["thinking"]
472
+ and tmp_kwargs["thinking"]["type"] == "enabled"
473
+ )
474
+ if thinking_enabled:
475
+ tmp_kwargs["temperature"] = 1.0
476
+ return tmp_kwargs, thinking_enabled
477
+
478
+ def _convert_messages_to_anthropic_format(
479
+ self, chat: Sequence[Message], thinking_enabled: bool, **kwargs: Any
480
+ ) -> List[MessageParam]:
481
+ """Convert chat messages to Anthropic format."""
425
482
  messages: List[MessageParam] = []
483
+
426
484
  for msg in chat:
427
- content: List[Union[TextBlockParam, ImageBlockParam]] = [
428
- TextBlockParam(type="text", text=msg["content"])
429
- ]
430
- if "media" in msg:
431
- for media_path in msg["media"]:
432
- resize = kwargs["resize"] if "resize" in kwargs else self.image_size
433
- encoded_media = encode_media(media_path, resize=resize)
434
- if encoded_media.startswith("data:image/png;base64,"):
435
- encoded_media = encoded_media[len("data:image/png;base64,") :]
436
- content.append(
437
- ImageBlockParam(
438
- type="image",
439
- source={
440
- "type": "base64",
441
- "media_type": "image/png",
442
- "data": encoded_media,
443
- },
485
+ if msg["role"] == "user":
486
+ content: List[Union[TextBlockParam, ImageBlockParam]] = [
487
+ TextBlockParam(type="text", text=cast(str, msg["content"]))
488
+ ]
489
+ if "media" in msg and msg["media"] is not None:
490
+ for media_path in msg["media"]:
491
+ resize = (
492
+ kwargs["resize"] if "resize" in kwargs else self.image_size
493
+ )
494
+ encoded_media = encode_media(
495
+ cast(str, media_path), resize=resize
496
+ )
497
+ if encoded_media.startswith("data:image/png;base64,"):
498
+ encoded_media = encoded_media[
499
+ len("data:image/png;base64,") :
500
+ ]
501
+ content.append(
502
+ ImageBlockParam(
503
+ type="image",
504
+ source={
505
+ "type": "base64",
506
+ "media_type": "image/png",
507
+ "data": encoded_media,
508
+ },
509
+ )
510
+ )
511
+ messages.append({"role": "user", "content": content})
512
+ elif msg["role"] == "assistant":
513
+ if thinking_enabled:
514
+ messages.append(
515
+ self.create_thinking_assistant_message(
516
+ cast(str, msg["content"]),
517
+ )
518
+ )
519
+ else:
520
+ messages.append(
521
+ MessageParam(
522
+ role="assistant",
523
+ content=[
524
+ {"type": "text", "text": cast(str, msg["content"])}
525
+ ],
444
526
  )
445
527
  )
446
- messages.append({"role": msg["role"], "content": content})
528
+ else:
529
+ raise ValueError(
530
+ f"Unsupported role {msg['role']}. Only 'user' and 'assistant' roles are supported."
531
+ )
447
532
 
448
- # prefers kwargs from second dictionary over first
449
- tmp_kwargs = self.kwargs | kwargs
450
- response = self.client.messages.create(
451
- model=self.model_name, messages=messages, **tmp_kwargs
533
+ return messages
534
+
535
def _handle_streaming_response(
    self, stream_response: anthropic.Stream[anthropic.MessageStreamEvent]
) -> Iterator[Optional[str]]:
    """Turn a streamed response into text chunks with thinking/signature tags.

    Thinking deltas are wrapped in ``<thinking>...</thinking>`` and signature
    deltas in ``<signature>...</signature>``. Whenever the kind of delta
    changes, the tag that is currently open is closed (followed by a newline)
    before the next section starts, so the tags never overlap. Any tag still
    open at ``message_stop`` is closed before the terminating ``None``.

    Args:
        stream_response: Iterable of stream events.

    Returns:
        An iterator of text chunks that yields ``None`` at ``message_stop``.
    """

    def f() -> Iterator[Optional[str]]:
        # Name of the wrapper tag currently open: "thinking", "signature" or None.
        open_tag: Optional[str] = None

        def switch_to(tag: Optional[str]) -> str:
            # Return the markup needed to move from the open section to
            # ``tag`` (None means plain text) and record the new state.
            nonlocal open_tag
            if tag == open_tag:
                return ""
            markup = f"</{open_tag}>\n" if open_tag is not None else ""
            if tag is not None:
                markup += f"<{tag}>"
            open_tag = tag
            return markup

        for chunk in stream_response:
            if chunk.type == "content_block_delta":
                delta = chunk.delta
                if delta.type == "text_delta":
                    yield switch_to(None) + delta.text
                elif delta.type == "thinking_delta":
                    yield switch_to("thinking") + delta.thinking
                elif delta.type == "signature_delta":
                    yield switch_to("signature") + delta.signature
            elif chunk.type == "message_stop":
                # Close a section left open because no text followed it.
                closing = switch_to(None)
                if closing:
                    yield closing
                yield None
            # All other event types carry no text and are ignored.

    return f()
572
+
573
def _format_thinking_response(self, msg_response: anthropic.types.Message) -> str:
    """Serialise a thinking-mode message into tagged text.

    The output is ``<thinking>...</thinking>``, then an optional
    ``<redacted_thinking>`` section, then an optional ``<signature>`` section
    (each followed by a newline), then the concatenated text blocks. When
    several thinking blocks carry a signature, the last one is used.
    """
    thinking_parts: List[str] = []
    redacted_parts: List[str] = []
    text_parts: List[str] = []
    signature = ""

    for block in msg_response.content:
        kind = block.type
        if kind == "thinking":
            thinking_parts.append(block.thinking)
            if block.signature:
                signature = block.signature
        elif kind == "text":
            text_parts.append(block.text)
        elif kind == "redacted_thinking":
            redacted_parts.append(block.data)

    thinking = "".join(thinking_parts)
    redacted_thinking = "".join(redacted_parts)

    # The thinking section is always emitted, even when it is empty.
    sections = [f"<thinking>{thinking}</thinking>\n"]
    if redacted_thinking:
        sections.append(f"<redacted_thinking>{redacted_thinking}</redacted_thinking>\n")
    if signature:
        sections.append(f"<signature>{signature}</signature>\n")
    sections.append("".join(text_parts))
    return "".join(sections)
453
- if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
454
598
 
455
- def f() -> Iterator[Optional[str]]:
456
- for chunk in response:
457
- if (
458
- chunk.type == "message_start"
459
- or chunk.type == "content_block_start"
460
- ):
461
- continue
462
- elif chunk.type == "content_block_delta":
463
- yield chunk.delta.text
464
- elif chunk.type == "message_stop":
465
- yield None
599
def _handle_non_streaming_response(
    self, response_untyped: Any, thinking_enabled: bool
) -> str:
    """Extract the reply text from a complete (non-streamed) response.

    In thinking mode the whole message is formatted with thinking/signature
    tags; otherwise the text of the first content block is returned.
    """
    message = cast(anthropic.types.Message, response_untyped)
    if not thinking_enabled:
        first_block = cast(anthropic.types.TextBlock, message.content[0])
        return first_block.text
    return self._format_thinking_response(message)
466
607
 
467
- return f()
608
+ def chat(
609
+ self,
610
+ chat: Sequence[Message],
611
+ **kwargs: Any,
612
+ ) -> Union[str, Iterator[Optional[str]]]:
613
+ tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
614
+ messages = self._convert_messages_to_anthropic_format(
615
+ chat, thinking_enabled, **kwargs
616
+ )
617
+
618
+ response_untyped = self.client.messages.create(
619
+ model=self.model_name, messages=messages, **tmp_kwargs
620
+ )
621
+
622
+ is_stream = bool(tmp_kwargs.get("stream", False))
623
+ if is_stream:
624
+ stream_response = cast(
625
+ anthropic.Stream[anthropic.MessageStreamEvent], response_untyped
626
+ )
627
+ return self._handle_streaming_response(stream_response)
468
628
  else:
469
- return cast(str, response.content[0].text)
629
+ return self._handle_non_streaming_response(
630
+ response_untyped, thinking_enabled
631
+ )
470
632
 
471
633
  def generate(
472
634
  self,
@@ -21,7 +21,7 @@ from .tools import (
21
21
  countgd_sam2_visual_instance_segmentation,
22
22
  countgd_visual_object_detection,
23
23
  custom_object_detection,
24
- depth_anything_v2,
24
+ depth_pro,
25
25
  detr_segmentation,
26
26
  document_extraction,
27
27
  document_qa,
@@ -42,7 +42,6 @@ from .tools import (
42
42
  glee_sam2_video_tracking,
43
43
  load_image,
44
44
  minimum_distance,
45
- ocr,
46
45
  od_sam2_video_tracking,
47
46
  overlay_bounding_boxes,
48
47
  overlay_heat_map,
@@ -50,6 +49,7 @@ from .tools import (
50
49
  owlv2_object_detection,
51
50
  owlv2_sam2_instance_segmentation,
52
51
  owlv2_sam2_video_tracking,
52
+ paddle_ocr,
53
53
  qwen2_vl_images_vqa,
54
54
  qwen2_vl_video_vqa,
55
55
  qwen25_vl_images_vqa,
@@ -74,7 +74,7 @@ def register_tool(imports: Optional[List] = None) -> Callable:
74
74
  def decorator(tool: Callable) -> Callable:
75
75
  import inspect
76
76
 
77
- global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
77
+ global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO # noqa: F824
78
78
  from vision_agent.tools.tools import TOOLS
79
79
 
80
80
  if tool not in TOOLS: # type: ignore