vision-agent 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/lmm/lmm.py CHANGED
@@ -1,19 +1,33 @@
1
+ import base64
1
2
  import json
2
3
  import os
3
4
  from abc import ABC, abstractmethod
4
5
  from pathlib import Path
5
- from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast
6
- import base64
6
+ from typing import (
7
+ Any,
8
+ Dict,
9
+ Iterator,
10
+ List,
11
+ Optional,
12
+ Sequence,
13
+ Union,
14
+ cast,
15
+ )
7
16
 
8
17
  import anthropic
9
18
  import requests
10
- from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
11
- from openai import AzureOpenAI, OpenAI
12
-
19
+ from anthropic.types import (
20
+ ImageBlockParam,
21
+ MessageParam,
22
+ TextBlockParam,
23
+ ThinkingBlockParam,
24
+ )
13
25
  from google import genai # type: ignore
14
26
  from google.genai import types # type: ignore
27
+ from openai import AzureOpenAI, OpenAI
15
28
 
16
29
  from vision_agent.models import Message
30
+ from vision_agent.utils.agent import extract_tag
17
31
  from vision_agent.utils.image_utils import encode_media
18
32
 
19
33
 
@@ -99,11 +113,15 @@ class OpenAILMM(LMM):
99
113
  [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
100
114
  """
101
115
  fixed_chat = []
102
- for c in chat:
103
- fixed_c = {"role": c["role"]}
104
- fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
105
- if "media" in c and self.model_name != "o3-mini":
106
- for media in c["media"]:
116
+ for msg in chat:
117
+ fixed_c = {"role": msg["role"]}
118
+ fixed_c["content"] = [{"type": "text", "text": msg["content"]}] # type: ignore
119
+ if (
120
+ "media" in msg
121
+ and msg["media"] is not None
122
+ and self.model_name != "o3-mini"
123
+ ):
124
+ for media in msg["media"]:
107
125
  resize = kwargs["resize"] if "resize" in kwargs else self.image_size
108
126
  image_detail = (
109
127
  kwargs["image_detail"]
@@ -297,14 +315,14 @@ class OllamaLMM(LMM):
297
315
  [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
298
316
  """
299
317
  fixed_chat = []
300
- for message in chat:
301
- if "media" in message:
318
+ for msg in chat:
319
+ if "media" in msg and msg["media"] is not None:
302
320
  resize = kwargs["resize"] if "resize" in kwargs else self.image_size
303
- message["images"] = [
304
- encode_media(cast(str, m), resize=resize) for m in message["media"]
321
+ msg["images"] = [
322
+ encode_media(cast(str, m), resize=resize) for m in msg["media"]
305
323
  ]
306
- del message["media"]
307
- fixed_chat.append(message)
324
+ del msg["media"]
325
+ fixed_chat.append(msg)
308
326
  url = f"{self.url}/chat"
309
327
  model = self.model_name
310
328
  messages = fixed_chat
@@ -410,63 +428,207 @@ class AnthropicLMM(LMM):
410
428
 
411
429
  def __call__(
412
430
  self,
413
- input: Union[str, Sequence[Dict[str, Any]]],
431
+ input: Union[str, Sequence[Message]],
414
432
  **kwargs: Any,
415
433
  ) -> Union[str, Iterator[Optional[str]]]:
416
434
  if isinstance(input, str):
417
435
  return self.generate(input, **kwargs)
418
436
  return self.chat(input, **kwargs)
419
437
 
420
- def chat(
438
+ def create_thinking_assistant_message(
421
439
  self,
422
- chat: Sequence[Dict[str, Any]],
423
- **kwargs: Any,
424
- ) -> Union[str, Iterator[Optional[str]]]:
440
+ msg_content: str,
441
+ ) -> MessageParam:
442
+ content: List[Union[TextBlockParam, ThinkingBlockParam]] = []
443
+ thinking_content = extract_tag(msg_content, "thinking")
444
+ signature = extract_tag(msg_content, "signature")
445
+ if thinking_content:
446
+ content.append(
447
+ ThinkingBlockParam(
448
+ type="thinking",
449
+ thinking=thinking_content.strip(),
450
+ signature=signature.strip() if signature else "",
451
+ )
452
+ )
453
+ signature_content = extract_tag(msg_content, "signature")
454
+ if signature_content:
455
+ text_content = msg_content.replace(
456
+ f"<thinking>{thinking_content}</thinking>", ""
457
+ ).replace(f"<signature>{signature_content}</signature>", "")
458
+ else:
459
+ text_content = msg_content.replace(
460
+ f"<thinking>{thinking_content}</thinking>", ""
461
+ )
462
+ if text_content.strip():
463
+ content.append(TextBlockParam(type="text", text=text_content.strip()))
464
+ return MessageParam(role="assistant", content=content)
465
+
466
+ def _setup_chat_kwargs(self, kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], bool]:
467
+ """Set up kwargs and determine if thinking mode is enabled."""
468
+ tmp_kwargs = self.kwargs | kwargs
469
+ thinking_enabled = (
470
+ "thinking" in tmp_kwargs
471
+ and "type" in tmp_kwargs["thinking"]
472
+ and tmp_kwargs["thinking"]["type"] == "enabled"
473
+ )
474
+ if thinking_enabled:
475
+ tmp_kwargs["temperature"] = 1.0
476
+ return tmp_kwargs, thinking_enabled
477
+
478
+ def _convert_messages_to_anthropic_format(
479
+ self, chat: Sequence[Message], thinking_enabled: bool, **kwargs: Any
480
+ ) -> List[MessageParam]:
481
+ """Convert chat messages to Anthropic format."""
425
482
  messages: List[MessageParam] = []
483
+
426
484
  for msg in chat:
427
- content: List[Union[TextBlockParam, ImageBlockParam]] = [
428
- TextBlockParam(type="text", text=msg["content"])
429
- ]
430
- if "media" in msg:
431
- for media_path in msg["media"]:
432
- resize = kwargs["resize"] if "resize" in kwargs else self.image_size
433
- encoded_media = encode_media(media_path, resize=resize)
434
- if encoded_media.startswith("data:image/png;base64,"):
435
- encoded_media = encoded_media[len("data:image/png;base64,") :]
436
- content.append(
437
- ImageBlockParam(
438
- type="image",
439
- source={
440
- "type": "base64",
441
- "media_type": "image/png",
442
- "data": encoded_media,
443
- },
485
+ if msg["role"] == "user":
486
+ content: List[Union[TextBlockParam, ImageBlockParam]] = [
487
+ TextBlockParam(type="text", text=cast(str, msg["content"]))
488
+ ]
489
+ if "media" in msg and msg["media"] is not None:
490
+ for media_path in msg["media"]:
491
+ resize = (
492
+ kwargs["resize"] if "resize" in kwargs else self.image_size
493
+ )
494
+ encoded_media = encode_media(
495
+ cast(str, media_path), resize=resize
496
+ )
497
+ if encoded_media.startswith("data:image/png;base64,"):
498
+ encoded_media = encoded_media[
499
+ len("data:image/png;base64,") :
500
+ ]
501
+ content.append(
502
+ ImageBlockParam(
503
+ type="image",
504
+ source={
505
+ "type": "base64",
506
+ "media_type": "image/png",
507
+ "data": encoded_media,
508
+ },
509
+ )
510
+ )
511
+ messages.append({"role": "user", "content": content})
512
+ elif msg["role"] == "assistant":
513
+ if thinking_enabled:
514
+ messages.append(
515
+ self.create_thinking_assistant_message(
516
+ cast(str, msg["content"]),
517
+ )
518
+ )
519
+ else:
520
+ messages.append(
521
+ MessageParam(
522
+ role="assistant",
523
+ content=[
524
+ {"type": "text", "text": cast(str, msg["content"])}
525
+ ],
444
526
  )
445
527
  )
446
- messages.append({"role": msg["role"], "content": content})
528
+ else:
529
+ raise ValueError(
530
+ f"Unsupported role {msg['role']}. Only 'user' and 'assistant' roles are supported."
531
+ )
447
532
 
448
- # prefers kwargs from second dictionary over first
449
- tmp_kwargs = self.kwargs | kwargs
450
- response = self.client.messages.create(
451
- model=self.model_name, messages=messages, **tmp_kwargs
533
+ return messages
534
+
535
+ def _handle_streaming_response(
536
+ self, stream_response: anthropic.Stream[anthropic.MessageStreamEvent]
537
+ ) -> Iterator[Optional[str]]:
538
+ """Handle streaming response from Anthropic API."""
539
+
540
+ def f() -> Iterator[Optional[str]]:
541
+ thinking_start = False
542
+ signature_start = False
543
+ for chunk in stream_response:
544
+ if chunk.type == "message_start" or chunk.type == "content_block_start":
545
+ continue
546
+ elif chunk.type == "content_block_delta":
547
+ if chunk.delta.type == "text_delta":
548
+ if thinking_start:
549
+ thinking_start = False
550
+ yield f"</thinking>\n{chunk.delta.text}"
551
+ elif signature_start:
552
+ signature_start = False
553
+ yield f"</signature>\n{chunk.delta.text}"
554
+ else:
555
+ yield chunk.delta.text
556
+ elif chunk.delta.type == "thinking_delta":
557
+ if not thinking_start:
558
+ thinking_start = True
559
+ yield f"<thinking>{chunk.delta.thinking}"
560
+ else:
561
+ yield chunk.delta.thinking
562
+ elif chunk.delta.type == "signature_delta":
563
+ if not signature_start:
564
+ signature_start = True
565
+ yield f"<signature>{chunk.delta.signature}"
566
+ else:
567
+ yield chunk.delta.signature
568
+ elif chunk.type == "message_stop":
569
+ yield None
570
+
571
+ return f()
572
+
573
+ def _format_thinking_response(self, msg_response: anthropic.types.Message) -> str:
574
+ """Format thinking mode response with proper tags."""
575
+ thinking = ""
576
+ signature = ""
577
+ redacted_thinking = ""
578
+ text = ""
579
+ for block in msg_response.content:
580
+ if block.type == "thinking":
581
+ thinking += block.thinking
582
+ if block.signature:
583
+ signature = block.signature
584
+ elif block.type == "text":
585
+ text += block.text
586
+ elif block.type == "redacted_thinking":
587
+ redacted_thinking += block.data
588
+ return (
589
+ f"<thinking>{thinking}</thinking>\n"
590
+ + (
591
+ f"<redacted_thinking>{redacted_thinking}</redacted_thinking>\n"
592
+ if redacted_thinking
593
+ else ""
594
+ )
595
+ + (f"<signature>{signature}</signature>\n" if signature else "")
596
+ + text
452
597
  )
453
- if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
454
598
 
455
- def f() -> Iterator[Optional[str]]:
456
- for chunk in response:
457
- if (
458
- chunk.type == "message_start"
459
- or chunk.type == "content_block_start"
460
- ):
461
- continue
462
- elif chunk.type == "content_block_delta":
463
- yield chunk.delta.text
464
- elif chunk.type == "message_stop":
465
- yield None
599
+ def _handle_non_streaming_response(
600
+ self, response_untyped: Any, thinking_enabled: bool
601
+ ) -> str:
602
+ """Handle non-streaming response from Anthropic API."""
603
+ msg_response = cast(anthropic.types.Message, response_untyped)
604
+ if thinking_enabled:
605
+ return self._format_thinking_response(msg_response)
606
+ return cast(anthropic.types.TextBlock, msg_response.content[0]).text
466
607
 
467
- return f()
608
+ def chat(
609
+ self,
610
+ chat: Sequence[Message],
611
+ **kwargs: Any,
612
+ ) -> Union[str, Iterator[Optional[str]]]:
613
+ tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
614
+ messages = self._convert_messages_to_anthropic_format(
615
+ chat, thinking_enabled, **kwargs
616
+ )
617
+
618
+ response_untyped = self.client.messages.create(
619
+ model=self.model_name, messages=messages, **tmp_kwargs
620
+ )
621
+
622
+ is_stream = bool(tmp_kwargs.get("stream", False))
623
+ if is_stream:
624
+ stream_response = cast(
625
+ anthropic.Stream[anthropic.MessageStreamEvent], response_untyped
626
+ )
627
+ return self._handle_streaming_response(stream_response)
468
628
  else:
469
- return cast(str, response.content[0].text)
629
+ return self._handle_non_streaming_response(
630
+ response_untyped, thinking_enabled
631
+ )
470
632
 
471
633
  def generate(
472
634
  self,
@@ -42,7 +42,6 @@ from .tools import (
42
42
  glee_sam2_video_tracking,
43
43
  load_image,
44
44
  minimum_distance,
45
- paddle_ocr,
46
45
  od_sam2_video_tracking,
47
46
  overlay_bounding_boxes,
48
47
  overlay_heat_map,
@@ -50,6 +49,7 @@ from .tools import (
50
49
  owlv2_object_detection,
51
50
  owlv2_sam2_instance_segmentation,
52
51
  owlv2_sam2_video_tracking,
52
+ paddle_ocr,
53
53
  qwen2_vl_images_vqa,
54
54
  qwen2_vl_video_vqa,
55
55
  qwen25_vl_images_vqa,
@@ -74,7 +74,7 @@ def register_tool(imports: Optional[List] = None) -> Callable:
74
74
  def decorator(tool: Callable) -> Callable:
75
75
  import inspect
76
76
 
77
- global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
77
+ global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO # noqa: F824
78
78
  from vision_agent.tools.tools import TOOLS
79
79
 
80
80
  if tool not in TOOLS: # type: ignore
@@ -0,0 +1,206 @@
1
+ import base64
2
+ import copy
3
+ import io
4
+ from typing import Dict, List, Optional, Tuple, Union, cast
5
+
6
+ import cv2
7
+ import matplotlib.figure
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ from PIL import Image
11
+ from PIL.Image import Image as PILImageType
12
+
13
+ from vision_agent.utils.image_utils import (
14
+ denormalize_bbox,
15
+ normalize_bbox,
16
+ numpy_to_bytes,
17
+ rle_decode_array,
18
+ )
19
+ from vision_agent.utils.tools import send_inference_request
20
+
21
+
22
+ def maybe_denormalize_bbox(
23
+ bbox: List[Union[int, float]], image_size: Tuple[int, ...]
24
+ ) -> List[float]:
25
+ if all([0 <= c <= 1 for c in bbox]):
26
+ return denormalize_bbox(bbox, image_size)
27
+ return bbox
28
+
29
+
30
+ def maybe_normalize_bbox(
31
+ bbox: List[Union[int, float]], image_size: Tuple[int, ...]
32
+ ) -> List[float]:
33
+ if any([1 <= c for c in bbox]):
34
+ return normalize_bbox(bbox, image_size)
35
+ return bbox
36
+
37
+
38
+ def instance_segmentation(
39
+ prompt: str, image: np.ndarray, threshold: float = 0.23, nms_threshold: float = 0.5
40
+ ) -> List[Dict[str, Union[str, float, List[float], np.ndarray]]]:
41
+ image_bytes = numpy_to_bytes(image)
42
+ files = [("image", image_bytes)]
43
+ data = {"prompts": [prompt], "threshold": threshold, "nms_threshold": nms_threshold}
44
+ results = send_inference_request(
45
+ data,
46
+ "glee",
47
+ files=files,
48
+ v2=True,
49
+ )
50
+ results = results[0]
51
+ results_formatted = [
52
+ {
53
+ "label": elt["label"],
54
+ "score": elt["score"],
55
+ "bbox": normalize_bbox(elt["bounding_box"], image.shape[:2]),
56
+ "mask": np.array(rle_decode_array(elt["mask"])),
57
+ }
58
+ for elt in results
59
+ ]
60
+ return results_formatted
61
+
62
+
63
+ def ocr(image: np.ndarray) -> List[Dict[str, Union[str, float, List[float]]]]:
64
+ image_bytes = numpy_to_bytes(image)
65
+ files = [("image", image_bytes)]
66
+ results = send_inference_request(
67
+ {},
68
+ "paddle-ocr",
69
+ files=files,
70
+ v2=True,
71
+ )
72
+ results_formatted = [
73
+ {
74
+ "label": elt["label"],
75
+ "score": elt["score"],
76
+ "bbox": normalize_bbox(elt["bbox"], image.shape[:2]),
77
+ }
78
+ for elt in results
79
+ ]
80
+ return results_formatted
81
+
82
+
83
+ def depth_estimation(image: np.ndarray) -> np.ndarray:
84
+ shape = image.shape[:2]
85
+ image_bytes = numpy_to_bytes(image)
86
+ files = [("image", image_bytes)]
87
+ results = send_inference_request(
88
+ {},
89
+ "depth-pro",
90
+ files=files,
91
+ v2=True,
92
+ )
93
+ depth = np.frombuffer(base64.b64decode(results["depth"]), dtype=np.float32).reshape(
94
+ shape
95
+ )
96
+ return depth
97
+
98
+
99
+ def visualize_bounding_boxes(
100
+ image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
101
+ ) -> np.ndarray:
102
+ image = image.copy()
103
+ image_size = image.shape[:2]
104
+ bounding_boxes = copy.deepcopy(bounding_boxes)
105
+
106
+ for bbox in bounding_boxes:
107
+ bbox["bbox"] = maybe_denormalize_bbox(
108
+ cast(List[float], bbox["bbox"]), image_size
109
+ )
110
+ for bbox in bounding_boxes:
111
+ x1, y1, x2, y2 = bbox["bbox"] # type: ignore
112
+ cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 2)
113
+ return image
114
+
115
+
116
+ def visualize_segmentation_masks(
117
+ image: np.ndarray,
118
+ segmentation_masks: List[Dict[str, Union[str, float, np.ndarray]]],
119
+ ) -> np.ndarray:
120
+ alpha = 0.5
121
+ overlay = image.copy()
122
+ color_mask = np.zeros_like(image)
123
+ color_mask[:, :] = (0, 100, 255)
124
+ for elt in segmentation_masks:
125
+ mask = cast(np.ndarray, elt["mask"])
126
+ overlay[mask == 1] = (1 - alpha) * overlay[mask == 1] + alpha * color_mask[
127
+ mask == 1
128
+ ]
129
+
130
+ # draw outline on the mask so it doesn't just think the color of the object changed
131
+ mask_uint8 = (mask * 255).astype(np.uint8)
132
+ contours, _ = cv2.findContours(
133
+ mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
134
+ )
135
+ cv2.drawContours(overlay, contours, -1, (0, 0, 255), 2, lineType=cv2.LINE_AA)
136
+ overlay = np.clip(overlay, 0, 255).astype(np.uint8)
137
+ return overlay
138
+
139
+
140
+ def get_crops(
141
+ image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
142
+ ) -> List[np.ndarray]:
143
+ image = image.copy()
144
+ bounding_boxes = copy.deepcopy(bounding_boxes)
145
+
146
+ for bbox in bounding_boxes:
147
+ bbox["bbox"] = maybe_denormalize_bbox(
148
+ cast(List[float], bbox["bbox"]), image.shape[:2]
149
+ )
150
+ crops = []
151
+ for bbox in bounding_boxes:
152
+ x1, y1, x2, y2 = bbox["bbox"] # type: ignore
153
+ crops.append(image[int(y1) : int(y2), int(x1) : int(x2)])
154
+ return crops
155
+
156
+
157
+ def rotate_90(image: np.ndarray, k: int = 1) -> np.ndarray:
158
+ return np.rot90(image, k=k, axes=(0, 1))
159
+
160
+
161
+ def iou(
162
+ pred1: Union[List[float], np.ndarray], pred2: Union[List[float], np.ndarray]
163
+ ) -> float:
164
+ if isinstance(pred1, list) and isinstance(pred2, list):
165
+ x1, y1, x2, y2 = pred1
166
+ x1_, y1_, x2_, y2_ = pred2
167
+ intersection = max(0, min(x2, x2_) - max(x1, x1_)) * max(
168
+ 0, min(y2, y2_) - max(y1, y1_)
169
+ )
170
+ union = (x2 - x1) * (y2 - y1) + (x2_ - x1_) * (y2_ - y1_) - intersection
171
+ return intersection / union
172
+ elif isinstance(pred1, np.ndarray) and isinstance(pred2, np.ndarray):
173
+ pred1 = np.clip(pred1, 0, 1)
174
+ pred2 = np.clip(pred2, 0, 1)
175
+ intersection = np.sum(pred1 * pred2)
176
+ union = np.sum(pred1) + np.sum(pred2) - intersection
177
+ return intersection / union
178
+ raise ValueError("Unsupported input types for IoU calculation.")
179
+
180
+
181
+ def display_image(
182
+ image: Union[np.ndarray, PILImageType, matplotlib.figure.Figure, str],
183
+ ) -> None:
184
+ display_img: Optional[PILImageType] = None
185
+ if isinstance(image, np.ndarray):
186
+ display_img = Image.fromarray(image)
187
+ elif isinstance(image, matplotlib.figure.Figure):
188
+ # Render the figure to a BytesIO buffer
189
+ buf = io.BytesIO()
190
+ image.savefig(buf, format="png")
191
+ buf.seek(0)
192
+ # Load the buffer as a PIL Image
193
+ display_img = Image.open(buf)
194
+ plt.close(image) # type: ignore
195
+ elif isinstance(image, PILImageType):
196
+ display_img = image # Already a PIL Image
197
+ elif isinstance(image, str):
198
+ display_img = Image.open(image)
199
+
200
+ if display_img is not None:
201
+ plt.imshow(display_img) # type: ignore
202
+ plt.axis("off") # type: ignore
203
+ plt.show()
204
+ else:
205
+ # Handle cases where image type is not supported or conversion failed
206
+ print("Unsupported image type or conversion failed.")
@@ -247,7 +247,9 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
247
247
 
248
248
 
249
249
  def add_media_to_chat(
250
- chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
250
+ chat: List[AgentMessage],
251
+ code_interpreter: Optional[CodeInterpreter] = None,
252
+ append_to_prompt: bool = True,
251
253
  ) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
252
254
  orig_chat = copy.deepcopy(chat)
253
255
  int_chat = copy.deepcopy(chat)
@@ -278,6 +280,7 @@ def add_media_to_chat(
278
280
  if (
279
281
  not str(chat_i.content).endswith(f" Media name {media}")
280
282
  and chat_i.role == "user"
283
+ and append_to_prompt
281
284
  ):
282
285
  chat_i.content += f" Media name {media}"
283
286
  chat_i.media = media_list_i if len(media_list_i) > 0 else None
@@ -304,13 +307,26 @@ def add_media_to_chat(
304
307
  def capture_media_from_exec(execution: Execution) -> List[str]:
305
308
  images = []
306
309
  for result in execution.results:
307
- for format in result.formats():
308
- if format in ["png", "jpeg"]:
309
- # converts the image to png and then to base64
310
- images.append(
311
- "data:image/png;base64,"
312
- + convert_to_b64(b64_to_pil(result[format]))
313
- )
310
+ if hasattr(result, "formats"):
311
+ for format in result.formats():
312
+ if format in ["png", "jpeg"]:
313
+ # converts the image to png and then to base64
314
+ images.append(
315
+ "data:image/png;base64,"
316
+ + convert_to_b64(b64_to_pil(result[format]))
317
+ )
318
+ elif hasattr(result, "savefig"):
319
+ pass
320
+ elif hasattr(result, "_repr_png_") and result._repr_png_():
321
+ images.append(
322
+ "data:image/png;base64,"
323
+ + convert_to_b64(b64_to_pil(result._repr_png_())) # type: ignore
324
+ )
325
+ elif hasattr(result, "_repr_jpeg_") and result._repr_jpeg_():
326
+ images.append(
327
+ "data:image/jpeg;base64,"
328
+ + convert_to_b64(b64_to_pil(result._repr_jpeg_())) # type: ignore
329
+ )
314
330
  return images
315
331
 
316
332
 
@@ -106,7 +106,7 @@ def send_task_inference_request(
106
106
  if metadata is not None and "function_name" in metadata:
107
107
  function_name = metadata["function_name"]
108
108
  response = _call_post(url, payload, session, files, function_name, is_form)
109
- return response["data"]
109
+ return response["data"] if "data" in response else response
110
110
 
111
111
 
112
112
  def _create_requests_session(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vision-agent
3
- Version: 1.1.17
3
+ Version: 1.1.18
4
4
  Summary: Toolset for Vision Agent
5
5
  Project-URL: Homepage, https://landing.ai
6
6
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -8,7 +8,7 @@ Project-URL: documentation, https://github.com/landing-ai/vision-agent
8
8
  Author-email: Landing AI <dev@landing.ai>
9
9
  License-File: LICENSE
10
10
  Requires-Python: <4.0,>=3.9
11
- Requires-Dist: anthropic<0.32,>=0.31.0
11
+ Requires-Dist: anthropic>=0.54.0
12
12
  Requires-Dist: av<12,>=11.0.0
13
13
  Requires-Dist: dotenv<0.10,>=0.9.9
14
14
  Requires-Dist: flake8<8,>=7.0.0
@@ -20,7 +20,7 @@ Requires-Dist: matplotlib<4,>=3.9.2
20
20
  Requires-Dist: nbclient<0.11,>=0.10.0
21
21
  Requires-Dist: nbformat<6,>=5.10.4
22
22
  Requires-Dist: numpy<2.0.0,>=1.21.0
23
- Requires-Dist: openai==1.55.3
23
+ Requires-Dist: openai>=1.86.0
24
24
  Requires-Dist: opencv-python==4.*
25
25
  Requires-Dist: opentelemetry-api<2,>=1.29.0
26
26
  Requires-Dist: pandas==2.*
@@ -36,7 +36,7 @@ Requires-Dist: tabulate<0.10,>=0.9.0
36
36
  Requires-Dist: tenacity<9,>=8.3.0
37
37
  Requires-Dist: tqdm<5.0.0,>=4.64.0
38
38
  Requires-Dist: typing-extensions==4.*
39
- Requires-Dist: yt-dlp>=2025.3.31
39
+ Requires-Dist: yt-dlp>=2025.6.9
40
40
  Description-Content-Type: text/markdown
41
41
 
42
42
  <div align="center">