vision-agent 1.1.17__py3-none-any.whl → 1.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +1 -1
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/vision_agent_prompts_v3.py +372 -0
- vision_agent/agent/vision_agent_v3.py +278 -0
- vision_agent/lmm/lmm.py +219 -57
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/planner_v3_tools.py +206 -0
- vision_agent/utils/agent.py +24 -8
- vision_agent/utils/tools.py +1 -1
- {vision_agent-1.1.17.dist-info → vision_agent-1.1.18.dist-info}/METADATA +4 -4
- {vision_agent-1.1.17.dist-info → vision_agent-1.1.18.dist-info}/RECORD +13 -10
- {vision_agent-1.1.17.dist-info → vision_agent-1.1.18.dist-info}/WHEEL +0 -0
- {vision_agent-1.1.17.dist-info → vision_agent-1.1.18.dist-info}/licenses/LICENSE +0 -0
vision_agent/lmm/lmm.py
CHANGED
@@ -1,19 +1,33 @@
|
|
1
|
+
import base64
|
1
2
|
import json
|
2
3
|
import os
|
3
4
|
from abc import ABC, abstractmethod
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
-
|
6
|
+
from typing import (
|
7
|
+
Any,
|
8
|
+
Dict,
|
9
|
+
Iterator,
|
10
|
+
List,
|
11
|
+
Optional,
|
12
|
+
Sequence,
|
13
|
+
Union,
|
14
|
+
cast,
|
15
|
+
)
|
7
16
|
|
8
17
|
import anthropic
|
9
18
|
import requests
|
10
|
-
from anthropic.types import
|
11
|
-
|
12
|
-
|
19
|
+
from anthropic.types import (
|
20
|
+
ImageBlockParam,
|
21
|
+
MessageParam,
|
22
|
+
TextBlockParam,
|
23
|
+
ThinkingBlockParam,
|
24
|
+
)
|
13
25
|
from google import genai # type: ignore
|
14
26
|
from google.genai import types # type: ignore
|
27
|
+
from openai import AzureOpenAI, OpenAI
|
15
28
|
|
16
29
|
from vision_agent.models import Message
|
30
|
+
from vision_agent.utils.agent import extract_tag
|
17
31
|
from vision_agent.utils.image_utils import encode_media
|
18
32
|
|
19
33
|
|
@@ -99,11 +113,15 @@ class OpenAILMM(LMM):
|
|
99
113
|
[{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
|
100
114
|
"""
|
101
115
|
fixed_chat = []
|
102
|
-
for
|
103
|
-
fixed_c = {"role":
|
104
|
-
fixed_c["content"] = [{"type": "text", "text":
|
105
|
-
if
|
106
|
-
|
116
|
+
for msg in chat:
|
117
|
+
fixed_c = {"role": msg["role"]}
|
118
|
+
fixed_c["content"] = [{"type": "text", "text": msg["content"]}] # type: ignore
|
119
|
+
if (
|
120
|
+
"media" in msg
|
121
|
+
and msg["media"] is not None
|
122
|
+
and self.model_name != "o3-mini"
|
123
|
+
):
|
124
|
+
for media in msg["media"]:
|
107
125
|
resize = kwargs["resize"] if "resize" in kwargs else self.image_size
|
108
126
|
image_detail = (
|
109
127
|
kwargs["image_detail"]
|
@@ -297,14 +315,14 @@ class OllamaLMM(LMM):
|
|
297
315
|
[{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
|
298
316
|
"""
|
299
317
|
fixed_chat = []
|
300
|
-
for
|
301
|
-
if "media" in
|
318
|
+
for msg in chat:
|
319
|
+
if "media" in msg and msg["media"] is not None:
|
302
320
|
resize = kwargs["resize"] if "resize" in kwargs else self.image_size
|
303
|
-
|
304
|
-
encode_media(cast(str, m), resize=resize) for m in
|
321
|
+
msg["images"] = [
|
322
|
+
encode_media(cast(str, m), resize=resize) for m in msg["media"]
|
305
323
|
]
|
306
|
-
del
|
307
|
-
fixed_chat.append(
|
324
|
+
del msg["media"]
|
325
|
+
fixed_chat.append(msg)
|
308
326
|
url = f"{self.url}/chat"
|
309
327
|
model = self.model_name
|
310
328
|
messages = fixed_chat
|
@@ -410,63 +428,207 @@ class AnthropicLMM(LMM):
|
|
410
428
|
|
411
429
|
def __call__(
    self,
    input: Union[str, Sequence[Message]],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Route a request to ``generate`` or ``chat`` based on the input type.

    A plain string is forwarded to ``self.generate``; any other input is
    treated as a message sequence and forwarded to ``self.chat``. Keyword
    arguments are passed through unchanged.
    """
    handler = self.generate if isinstance(input, str) else self.chat
    return handler(input, **kwargs)
|
419
437
|
|
420
|
-
def
|
438
|
+
def create_thinking_assistant_message(
    self,
    msg_content: str,
) -> MessageParam:
    """Rebuild an assistant message that may carry inline thinking tags.

    ``msg_content`` is searched (via ``extract_tag``) for a ``<thinking>``
    and a ``<signature>`` section. A non-empty thinking section becomes a
    ``ThinkingBlockParam`` (with the stripped signature, or ``""`` when no
    signature was found). Whatever text remains once those tagged sections
    are removed becomes a ``TextBlockParam`` if it is not blank.

    Args:
        msg_content: Raw assistant text, possibly containing the tags.

    Returns:
        A ``MessageParam`` with role ``"assistant"`` whose content holds the
        thinking block (if any) followed by the text block (if any).
    """
    # Each tag is extracted exactly once and reused below.
    thinking_content = extract_tag(msg_content, "thinking")
    signature_content = extract_tag(msg_content, "signature")

    content: List[Union[TextBlockParam, ThinkingBlockParam]] = []
    if thinking_content:
        content.append(
            ThinkingBlockParam(
                type="thinking",
                thinking=thinking_content.strip(),
                signature=signature_content.strip() if signature_content else "",
            )
        )

    text_content = msg_content
    # Only strip a tag that was actually found; formatting ``None`` into the
    # pattern would search for the literal text "<thinking>None</thinking>".
    if thinking_content is not None:
        text_content = text_content.replace(
            f"<thinking>{thinking_content}</thinking>", ""
        )
    if signature_content:
        text_content = text_content.replace(
            f"<signature>{signature_content}</signature>", ""
        )

    if text_content.strip():
        content.append(TextBlockParam(type="text", text=text_content.strip()))
    return MessageParam(role="assistant", content=content)
|
465
|
+
|
466
|
+
def _setup_chat_kwargs(self, kwargs: Dict[str, Any]) -> tuple[Dict[str, Any], bool]:
    """Merge call-time kwargs over the instance defaults and detect thinking mode.

    Thinking mode counts as enabled when the merged kwargs contain a
    ``"thinking"`` entry whose ``"type"`` is ``"enabled"``. In that case the
    merged ``"temperature"`` is forced to ``1.0``.

    Returns:
        ``(merged_kwargs, thinking_enabled)``; ``merged_kwargs`` is a new dict,
        so neither ``self.kwargs`` nor ``kwargs`` is modified.
    """
    merged = self.kwargs | kwargs

    thinking_enabled = False
    if "thinking" in merged:
        thinking_cfg = merged["thinking"]
        thinking_enabled = (
            "type" in thinking_cfg and thinking_cfg["type"] == "enabled"
        )

    if thinking_enabled:
        merged["temperature"] = 1.0
    return merged, thinking_enabled
|
477
|
+
|
478
|
+
def _convert_messages_to_anthropic_format(
    self, chat: Sequence[Message], thinking_enabled: bool, **kwargs: Any
) -> List[MessageParam]:
    """Translate internal chat messages into Anthropic ``MessageParam`` items.

    User messages become a text block followed by one base64 image block per
    media entry (encoded with ``encode_media`` and tagged ``image/png``).
    Assistant messages become either a thinking-aware message (when
    ``thinking_enabled``) or a single text block.

    Raises:
        ValueError: If a message role is neither ``"user"`` nor ``"assistant"``.
    """
    png_prefix = "data:image/png;base64,"
    converted: List[MessageParam] = []

    for msg in chat:
        if msg["role"] == "user":
            blocks: List[Union[TextBlockParam, ImageBlockParam]] = [
                TextBlockParam(type="text", text=cast(str, msg["content"]))
            ]
            has_media = "media" in msg and msg["media"] is not None
            for media_path in msg["media"] if has_media else []:
                # A per-call ``resize`` wins over the instance-level image size.
                target_size = (
                    kwargs["resize"] if "resize" in kwargs else self.image_size
                )
                encoded = encode_media(cast(str, media_path), resize=target_size)
                blocks.append(
                    ImageBlockParam(
                        type="image",
                        source={
                            "type": "base64",
                            "media_type": "image/png",
                            # Only the raw base64 payload is sent, so a PNG
                            # data-URI header is dropped when present.
                            "data": encoded.removeprefix(png_prefix),
                        },
                    )
                )
            converted.append({"role": "user", "content": blocks})
            continue

        if msg["role"] != "assistant":
            raise ValueError(
                f"Unsupported role {msg['role']}. Only 'user' and 'assistant' roles are supported."
            )

        assistant_text = cast(str, msg["content"])
        if thinking_enabled:
            converted.append(self.create_thinking_assistant_message(assistant_text))
        else:
            converted.append(
                MessageParam(
                    role="assistant",
                    content=[{"type": "text", "text": assistant_text}],
                )
            )

    return converted
|
534
|
+
|
535
|
+
def _handle_streaming_response(
    self, stream_response: anthropic.Stream[anthropic.MessageStreamEvent]
) -> Iterator[Optional[str]]:
    """Turn an Anthropic event stream into tagged text chunks.

    Text deltas are yielded as-is. Thinking and signature deltas are wrapped
    in ``<thinking>...</thinking>`` and ``<signature>...</signature>``. An
    open tag is closed (followed by a newline) as soon as a delta of a
    different kind arrives, or when the stream finishes, so the joined
    chunks form ``<thinking>T</thinking>\\n<signature>S</signature>\\ntext``.
    ``None`` is yielded on ``message_stop``.

    Args:
        stream_response: Iterable of streaming events exposing ``type`` and,
            for ``content_block_delta`` events, a ``delta`` attribute.

    Returns:
        A generator of text chunks, with ``None`` marking the end of the
        message.
    """

    def f() -> Iterator[Optional[str]]:
        # Name of the tag currently open in the emitted text, if any.
        open_tag: Optional[str] = None

        for chunk in stream_response:
            if chunk.type == "content_block_delta":
                delta = chunk.delta
                if delta.type == "text_delta":
                    target, payload = None, delta.text
                elif delta.type == "thinking_delta":
                    target, payload = "thinking", delta.thinking
                elif delta.type == "signature_delta":
                    target, payload = "signature", delta.signature
                else:
                    # Other delta kinds carry nothing to emit here.
                    continue

                prefix = ""
                if target != open_tag:
                    # Close the previous section before opening the next, so
                    # tags never nest and never stay unclosed.
                    if open_tag is not None:
                        prefix += f"</{open_tag}>\n"
                    if target is not None:
                        prefix += f"<{target}>"
                    open_tag = target
                yield f"{prefix}{payload}"
            elif chunk.type == "message_stop":
                if open_tag is not None:
                    yield f"</{open_tag}>\n"
                    open_tag = None
                yield None

        # The stream ended without ``message_stop`` while a tag was open.
        if open_tag is not None:
            yield f"</{open_tag}>\n"

    return f()
|
572
|
+
|
573
|
+
def _format_thinking_response(self, msg_response: anthropic.types.Message) -> str:
    """Flatten a thinking-mode response into a single tagged string.

    The content blocks are walked in order: ``thinking`` blocks are
    concatenated (the last non-empty signature is kept), ``text`` blocks are
    concatenated, and ``redacted_thinking`` data is concatenated. The result
    always starts with a ``<thinking>`` section; the ``<redacted_thinking>``
    and ``<signature>`` sections appear only when non-empty, and the text
    comes last.
    """
    thinking_parts: List[str] = []
    redacted_parts: List[str] = []
    text_parts: List[str] = []
    signature = ""

    for block in msg_response.content:
        kind = block.type
        if kind == "thinking":
            thinking_parts.append(block.thinking)
            signature = block.signature or signature
        elif kind == "text":
            text_parts.append(block.text)
        elif kind == "redacted_thinking":
            redacted_parts.append(block.data)

    redacted_thinking = "".join(redacted_parts)
    sections = [f"<thinking>{''.join(thinking_parts)}</thinking>\n"]
    if redacted_thinking:
        sections.append(f"<redacted_thinking>{redacted_thinking}</redacted_thinking>\n")
    if signature:
        sections.append(f"<signature>{signature}</signature>\n")
    sections.append("".join(text_parts))
    return "".join(sections)
|
453
|
-
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
454
598
|
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
yield chunk.delta.text
|
464
|
-
elif chunk.type == "message_stop":
|
465
|
-
yield None
|
599
|
+
def _handle_non_streaming_response(
    self, response_untyped: Any, thinking_enabled: bool
) -> str:
    """Extract the reply text from a complete (non-streamed) response.

    Without thinking mode, the text of the first content block is returned.
    With thinking mode, the response is rendered by
    ``_format_thinking_response``.
    """
    message = cast(anthropic.types.Message, response_untyped)
    if not thinking_enabled:
        first_block = cast(anthropic.types.TextBlock, message.content[0])
        return first_block.text
    return self._format_thinking_response(message)
|
466
607
|
|
467
|
-
|
608
|
+
def chat(
    self,
    chat: Sequence[Message],
    **kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
    """Send a chat conversation to the Anthropic messages API.

    Args:
        chat: Conversation messages to convert and send.
        **kwargs: Per-call overrides merged over the instance defaults.
            ``resize`` is used only while encoding images and is not
            forwarded to the API call.

    Returns:
        The reply text, or a chunk iterator when ``stream`` is truthy in the
        merged kwargs.
    """
    tmp_kwargs, thinking_enabled = self._setup_chat_kwargs(kwargs)
    messages = self._convert_messages_to_anthropic_format(
        chat, thinking_enabled, **kwargs
    )

    # ``resize`` is consumed by the message conversion above. It is not a
    # request parameter, so it is filtered out before the client call.
    # NOTE(review): ``messages.create`` is expected to reject unknown
    # keyword arguments - confirm against the installed SDK.
    request_kwargs = {
        key: value for key, value in tmp_kwargs.items() if key != "resize"
    }

    response_untyped = self.client.messages.create(
        model=self.model_name, messages=messages, **request_kwargs
    )

    if not request_kwargs.get("stream", False):
        return self._handle_non_streaming_response(
            response_untyped, thinking_enabled
        )

    stream_response = cast(
        anthropic.Stream[anthropic.MessageStreamEvent], response_untyped
    )
    return self._handle_streaming_response(stream_response)
|
470
632
|
|
471
633
|
def generate(
|
472
634
|
self,
|
vision_agent/tools/__init__.py
CHANGED
@@ -42,7 +42,6 @@ from .tools import (
|
|
42
42
|
glee_sam2_video_tracking,
|
43
43
|
load_image,
|
44
44
|
minimum_distance,
|
45
|
-
paddle_ocr,
|
46
45
|
od_sam2_video_tracking,
|
47
46
|
overlay_bounding_boxes,
|
48
47
|
overlay_heat_map,
|
@@ -50,6 +49,7 @@ from .tools import (
|
|
50
49
|
owlv2_object_detection,
|
51
50
|
owlv2_sam2_instance_segmentation,
|
52
51
|
owlv2_sam2_video_tracking,
|
52
|
+
paddle_ocr,
|
53
53
|
qwen2_vl_images_vqa,
|
54
54
|
qwen2_vl_video_vqa,
|
55
55
|
qwen25_vl_images_vqa,
|
@@ -74,7 +74,7 @@ def register_tool(imports: Optional[List] = None) -> Callable:
|
|
74
74
|
def decorator(tool: Callable) -> Callable:
|
75
75
|
import inspect
|
76
76
|
|
77
|
-
global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
|
77
|
+
global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO # noqa: F824
|
78
78
|
from vision_agent.tools.tools import TOOLS
|
79
79
|
|
80
80
|
if tool not in TOOLS: # type: ignore
|
@@ -0,0 +1,206 @@
|
|
1
|
+
import base64
|
2
|
+
import copy
|
3
|
+
import io
|
4
|
+
from typing import Dict, List, Optional, Tuple, Union, cast
|
5
|
+
|
6
|
+
import cv2
|
7
|
+
import matplotlib.figure
|
8
|
+
import matplotlib.pyplot as plt
|
9
|
+
import numpy as np
|
10
|
+
from PIL import Image
|
11
|
+
from PIL.Image import Image as PILImageType
|
12
|
+
|
13
|
+
from vision_agent.utils.image_utils import (
|
14
|
+
denormalize_bbox,
|
15
|
+
normalize_bbox,
|
16
|
+
numpy_to_bytes,
|
17
|
+
rle_decode_array,
|
18
|
+
)
|
19
|
+
from vision_agent.utils.tools import send_inference_request
|
20
|
+
|
21
|
+
|
22
|
+
def maybe_denormalize_bbox(
    bbox: List[Union[int, float]], image_size: Tuple[int, ...]
) -> List[float]:
    """Convert a bounding box to pixel coordinates if it looks normalized.

    A box counts as normalized when every coordinate lies in the closed
    interval [0, 1]; it is then passed to ``denormalize_bbox`` together with
    ``image_size``. Any other box is returned unchanged (the same list).

    NOTE(review): a genuine pixel box such as ``[0, 0, 1, 1]`` cannot be
    told apart from a normalized one by this heuristic.
    """
    # The generator form stops at the first out-of-range coordinate instead
    # of building a throwaway list.
    if all(0 <= coord <= 1 for coord in bbox):
        return denormalize_bbox(bbox, image_size)
    return bbox
|
28
|
+
|
29
|
+
|
30
|
+
def maybe_normalize_bbox(
    bbox: List[Union[int, float]], image_size: Tuple[int, ...]
) -> List[float]:
    """Convert a bounding box to normalized coordinates if it looks like pixels.

    A box counts as pixel-based when at least one coordinate is strictly
    greater than 1; it is then passed to ``normalize_bbox`` together with
    ``image_size``. Any other box is returned unchanged (the same list).

    The comparison is strict so that a normalized box touching the image
    edge (a coordinate equal to 1.0) is not normalized a second time. This
    matches ``maybe_denormalize_bbox``, which treats the closed interval
    [0, 1] as normalized.
    """
    if any(coord > 1 for coord in bbox):
        return normalize_bbox(bbox, image_size)
    return bbox
|
36
|
+
|
37
|
+
|
38
|
+
def instance_segmentation(
    prompt: str, image: np.ndarray, threshold: float = 0.23, nms_threshold: float = 0.5
) -> List[Dict[str, Union[str, float, List[float], np.ndarray]]]:
    """Run prompt-based instance segmentation on ``image`` via the "glee" service.

    The image is serialized with ``numpy_to_bytes`` and sent together with
    the prompt and both thresholds. Each detection in the first element of
    the response is reformatted into a dict with ``label``, ``score``, a
    ``bbox`` produced by ``normalize_bbox`` and a ``mask`` array decoded from
    its run-length encoding.
    """
    encoded_image = numpy_to_bytes(image)
    payload = {
        "prompts": [prompt],
        "threshold": threshold,
        "nms_threshold": nms_threshold,
    }
    response = send_inference_request(
        payload,
        "glee",
        files=[("image", encoded_image)],
        v2=True,
    )

    detections = response[0]
    formatted = []
    for det in detections:
        formatted.append(
            {
                "label": det["label"],
                "score": det["score"],
                "bbox": normalize_bbox(det["bounding_box"], image.shape[:2]),
                "mask": np.array(rle_decode_array(det["mask"])),
            }
        )
    return formatted
|
61
|
+
|
62
|
+
|
63
|
+
def ocr(image: np.ndarray) -> List[Dict[str, Union[str, float, List[float]]]]:
    """Run OCR on ``image`` via the "paddle-ocr" service.

    The image is serialized with ``numpy_to_bytes`` and sent with an empty
    payload. Each returned detection is reformatted into a dict with
    ``label``, ``score`` and a ``bbox`` produced by ``normalize_bbox``.
    """
    encoded_image = numpy_to_bytes(image)
    detections = send_inference_request(
        {},
        "paddle-ocr",
        files=[("image", encoded_image)],
        v2=True,
    )

    formatted = []
    for det in detections:
        formatted.append(
            {
                "label": det["label"],
                "score": det["score"],
                "bbox": normalize_bbox(det["bbox"], image.shape[:2]),
            }
        )
    return formatted
|
81
|
+
|
82
|
+
|
83
|
+
def depth_estimation(image: np.ndarray) -> np.ndarray:
    """Estimate a depth map for ``image`` via the "depth-pro" service.

    The response's ``"depth"`` field is base64-decoded, read as float32
    values and reshaped to the first two dimensions of ``image``.
    """
    height_width = image.shape[:2]
    encoded_image = numpy_to_bytes(image)
    response = send_inference_request(
        {},
        "depth-pro",
        files=[("image", encoded_image)],
        v2=True,
    )
    raw_depth = base64.b64decode(response["depth"])
    return np.frombuffer(raw_depth, dtype=np.float32).reshape(height_width)
|
97
|
+
|
98
|
+
|
99
|
+
def visualize_bounding_boxes(
    image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
) -> np.ndarray:
    """Draw every bounding box onto a copy of ``image`` and return the copy.

    Boxes whose coordinates all lie in [0, 1] are first converted to pixels
    by ``maybe_denormalize_bbox``. Both the image and the box list are
    copied, so the caller's objects are left untouched.
    """
    canvas = image.copy()
    height_width = canvas.shape[:2]
    boxes = copy.deepcopy(bounding_boxes)

    for entry in boxes:
        entry["bbox"] = maybe_denormalize_bbox(
            cast(List[float], entry["bbox"]), height_width
        )

    outline_color = (0, 0, 255)
    for entry in boxes:
        x1, y1, x2, y2 = entry["bbox"]  # type: ignore
        top_left = (int(x1), int(y1))
        bottom_right = (int(x2), int(y2))
        cv2.rectangle(canvas, top_left, bottom_right, outline_color, 2)
    return canvas
|
114
|
+
|
115
|
+
|
116
|
+
def visualize_segmentation_masks(
    image: np.ndarray,
    segmentation_masks: List[Dict[str, Union[str, float, np.ndarray]]],
) -> np.ndarray:
    """Overlay every segmentation mask on a copy of ``image``.

    Pixels where a mask equals 1 are blended 50/50 with a fixed tint, and
    the mask's external contours are drawn on top. The result is clipped to
    [0, 255] and returned as ``uint8``.
    """
    blend = 0.5
    canvas = image.copy()
    tint = np.zeros_like(image)
    tint[:, :] = (0, 100, 255)

    for entry in segmentation_masks:
        mask = cast(np.ndarray, entry["mask"])
        selected = mask == 1
        canvas[selected] = (1 - blend) * canvas[selected] + blend * tint[selected]

        # Outline the mask so the overlay does not read as a mere change in
        # the object's color.
        outline_source = (mask * 255).astype(np.uint8)
        contours, _ = cv2.findContours(
            outline_source, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        cv2.drawContours(canvas, contours, -1, (0, 0, 255), 2, lineType=cv2.LINE_AA)

    canvas = np.clip(canvas, 0, 255).astype(np.uint8)
    return canvas
|
138
|
+
|
139
|
+
|
140
|
+
def get_crops(
    image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
) -> List[np.ndarray]:
    """Cut one crop out of ``image`` per bounding box.

    Boxes whose coordinates all lie in [0, 1] are first converted to pixels
    by ``maybe_denormalize_bbox``. The crops are slices of a private copy of
    ``image``; the input box list is deep-copied and not modified.
    """
    source = image.copy()
    boxes = copy.deepcopy(bounding_boxes)

    for entry in boxes:
        entry["bbox"] = maybe_denormalize_bbox(
            cast(List[float], entry["bbox"]), source.shape[:2]
        )

    def _crop(entry: Dict) -> np.ndarray:
        x1, y1, x2, y2 = entry["bbox"]  # type: ignore
        return source[int(y1) : int(y2), int(x1) : int(x2)]

    return [_crop(entry) for entry in boxes]
|
155
|
+
|
156
|
+
|
157
|
+
def rotate_90(image: np.ndarray, k: int = 1) -> np.ndarray:
    """Rotate ``image`` by ``k`` quarter turns in the plane of its first two axes."""
    rotated = np.rot90(image, k=k, axes=(0, 1))
    return rotated
|
159
|
+
|
160
|
+
|
161
|
+
def iou(
    pred1: Union[List[float], np.ndarray], pred2: Union[List[float], np.ndarray]
) -> float:
    """Compute the intersection-over-union of two boxes or two masks.

    Args:
        pred1: Either a ``[x1, y1, x2, y2]`` list or a mask array.
        pred2: A prediction of the same kind as ``pred1``.

    Returns:
        Intersection area divided by union area as a Python ``float``;
        ``0.0`` when the union is empty (degenerate boxes or all-zero masks).

    Raises:
        ValueError: If the inputs are not both lists or both arrays.
    """
    if isinstance(pred1, list) and isinstance(pred2, list):
        x1, y1, x2, y2 = pred1
        x1_, y1_, x2_, y2_ = pred2
        overlap_w = max(0, min(x2, x2_) - max(x1, x1_))
        overlap_h = max(0, min(y2, y2_) - max(y1, y1_))
        intersection = overlap_w * overlap_h
        union = (x2 - x1) * (y2 - y1) + (x2_ - x1_) * (y2_ - y1_) - intersection
    elif isinstance(pred1, np.ndarray) and isinstance(pred2, np.ndarray):
        # Masks are clamped to [0, 1] so that sums count covered pixels.
        mask1 = np.clip(pred1, 0, 1)
        mask2 = np.clip(pred2, 0, 1)
        intersection = np.sum(mask1 * mask2)
        union = np.sum(mask1) + np.sum(mask2) - intersection
    else:
        raise ValueError("Unsupported input types for IoU calculation.")

    # Nothing to overlap with: avoid dividing by zero (or producing nan).
    if union <= 0:
        return 0.0
    return float(intersection / union)
|
179
|
+
|
180
|
+
|
181
|
+
def display_image(
    image: Union[np.ndarray, PILImageType, matplotlib.figure.Figure, str],
) -> None:
    """Show ``image`` with matplotlib, converting it to a PIL image first.

    Arrays are converted with ``Image.fromarray``, figures are rendered to an
    in-memory PNG (and then closed), PIL images are used directly, and
    strings are opened with ``Image.open``. Any other input only prints a
    message.
    """
    rendered: Optional[PILImageType] = None

    if isinstance(image, np.ndarray):
        rendered = Image.fromarray(image)
    elif isinstance(image, matplotlib.figure.Figure):
        # Render the figure into memory and read it back as a PIL image.
        png_buffer = io.BytesIO()
        image.savefig(png_buffer, format="png")
        png_buffer.seek(0)
        rendered = Image.open(png_buffer)
        plt.close(image)  # type: ignore
    elif isinstance(image, PILImageType):
        rendered = image
    elif isinstance(image, str):
        rendered = Image.open(image)

    if rendered is None:
        # Input type not handled above.
        print("Unsupported image type or conversion failed.")
        return

    plt.imshow(rendered)  # type: ignore
    plt.axis("off")  # type: ignore
    plt.show()
|
vision_agent/utils/agent.py
CHANGED
@@ -247,7 +247,9 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
|
|
247
247
|
|
248
248
|
|
249
249
|
def add_media_to_chat(
|
250
|
-
chat: List[AgentMessage],
|
250
|
+
chat: List[AgentMessage],
|
251
|
+
code_interpreter: Optional[CodeInterpreter] = None,
|
252
|
+
append_to_prompt: bool = True,
|
251
253
|
) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
|
252
254
|
orig_chat = copy.deepcopy(chat)
|
253
255
|
int_chat = copy.deepcopy(chat)
|
@@ -278,6 +280,7 @@ def add_media_to_chat(
|
|
278
280
|
if (
|
279
281
|
not str(chat_i.content).endswith(f" Media name {media}")
|
280
282
|
and chat_i.role == "user"
|
283
|
+
and append_to_prompt
|
281
284
|
):
|
282
285
|
chat_i.content += f" Media name {media}"
|
283
286
|
chat_i.media = media_list_i if len(media_list_i) > 0 else None
|
@@ -304,13 +307,26 @@ def add_media_to_chat(
|
|
304
307
|
def capture_media_from_exec(execution: Execution) -> List[str]:
    """Collect images produced by an execution as base64 data URIs.

    For each entry in ``execution.results``:

    * a result exposing ``formats()`` contributes one image per ``"png"`` or
      ``"jpeg"`` format it reports;
    * a result exposing ``savefig`` is skipped;
    * otherwise the first non-empty of ``_repr_png_()`` / ``_repr_jpeg_()``
      is used.

    Every image is passed through ``convert_to_b64(b64_to_pil(...))`` and is
    therefore labelled ``image/png``.
    NOTE(review): this relies on that helper chain re-encoding to PNG, as the
    original in-line comment states - confirm against ``convert_to_b64``.

    Returns:
        The data-URI strings in the order the results were visited.
    """
    png_prefix = "data:image/png;base64,"
    images: List[str] = []

    for result in execution.results:
        if hasattr(result, "formats"):
            for fmt in result.formats():
                if fmt in ("png", "jpeg"):
                    # The image is converted to PNG, then base64-encoded.
                    images.append(
                        png_prefix + convert_to_b64(b64_to_pil(result[fmt]))
                    )
        elif hasattr(result, "savefig"):
            # Figure-like results contribute no image here.
            continue
        else:
            for repr_name in ("_repr_png_", "_repr_jpeg_"):
                if not hasattr(result, repr_name):
                    continue
                # The repr is called once and its value reused.
                encoded = getattr(result, repr_name)()
                if encoded:
                    images.append(
                        png_prefix + convert_to_b64(b64_to_pil(encoded))  # type: ignore
                    )
                    break
    return images
|
315
331
|
|
316
332
|
|
vision_agent/utils/tools.py
CHANGED
@@ -106,7 +106,7 @@ def send_task_inference_request(
|
|
106
106
|
if metadata is not None and "function_name" in metadata:
|
107
107
|
function_name = metadata["function_name"]
|
108
108
|
response = _call_post(url, payload, session, files, function_name, is_form)
|
109
|
-
return response["data"]
|
109
|
+
return response["data"] if "data" in response else response
|
110
110
|
|
111
111
|
|
112
112
|
def _create_requests_session(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 1.1.
|
3
|
+
Version: 1.1.18
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Project-URL: Homepage, https://landing.ai
|
6
6
|
Project-URL: repository, https://github.com/landing-ai/vision-agent
|
@@ -8,7 +8,7 @@ Project-URL: documentation, https://github.com/landing-ai/vision-agent
|
|
8
8
|
Author-email: Landing AI <dev@landing.ai>
|
9
9
|
License-File: LICENSE
|
10
10
|
Requires-Python: <4.0,>=3.9
|
11
|
-
Requires-Dist: anthropic
|
11
|
+
Requires-Dist: anthropic>=0.54.0
|
12
12
|
Requires-Dist: av<12,>=11.0.0
|
13
13
|
Requires-Dist: dotenv<0.10,>=0.9.9
|
14
14
|
Requires-Dist: flake8<8,>=7.0.0
|
@@ -20,7 +20,7 @@ Requires-Dist: matplotlib<4,>=3.9.2
|
|
20
20
|
Requires-Dist: nbclient<0.11,>=0.10.0
|
21
21
|
Requires-Dist: nbformat<6,>=5.10.4
|
22
22
|
Requires-Dist: numpy<2.0.0,>=1.21.0
|
23
|
-
Requires-Dist: openai
|
23
|
+
Requires-Dist: openai>=1.86.0
|
24
24
|
Requires-Dist: opencv-python==4.*
|
25
25
|
Requires-Dist: opentelemetry-api<2,>=1.29.0
|
26
26
|
Requires-Dist: pandas==2.*
|
@@ -36,7 +36,7 @@ Requires-Dist: tabulate<0.10,>=0.9.0
|
|
36
36
|
Requires-Dist: tenacity<9,>=8.3.0
|
37
37
|
Requires-Dist: tqdm<5.0.0,>=4.64.0
|
38
38
|
Requires-Dist: typing-extensions==4.*
|
39
|
-
Requires-Dist: yt-dlp>=2025.
|
39
|
+
Requires-Dist: yt-dlp>=2025.6.9
|
40
40
|
Description-Content-Type: text/markdown
|
41
41
|
|
42
42
|
<div align="center">
|