vision-agent 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +25 -13
- vision_agent/tools/__init__.py +2 -0
- vision_agent/tools/tools.py +95 -19
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.12.dist-info}/METADATA +4 -2
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.12.dist-info}/RECORD +7 -7
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.12.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.12.dist-info}/WHEEL +0 -0
@@ -489,6 +489,7 @@ class VisionAgent(Agent):
|
|
489
489
|
image: Optional[Union[str, Path]] = None,
|
490
490
|
reference_data: Optional[Dict[str, str]] = None,
|
491
491
|
visualize_output: Optional[bool] = False,
|
492
|
+
self_reflection: Optional[bool] = True,
|
492
493
|
) -> str:
|
493
494
|
"""Invoke the vision agent.
|
494
495
|
|
@@ -501,6 +502,7 @@ class VisionAgent(Agent):
|
|
501
502
|
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
502
503
|
where the bounding box coordinates are normalized.
|
503
504
|
visualize_output: Whether to visualize the output.
|
505
|
+
self_reflection: Whether to run self reflection; when False, reflection is skipped.
|
504
506
|
|
505
507
|
Returns:
|
506
508
|
The result of the vision agent in text.
|
@@ -512,6 +514,7 @@ class VisionAgent(Agent):
|
|
512
514
|
image=image,
|
513
515
|
visualize_output=visualize_output,
|
514
516
|
reference_data=reference_data,
|
517
|
+
self_reflection=self_reflection,
|
515
518
|
)
|
516
519
|
|
517
520
|
def log_progress(self, description: str) -> None:
|
@@ -538,6 +541,7 @@ class VisionAgent(Agent):
|
|
538
541
|
image: Optional[Union[str, Path]] = None,
|
539
542
|
reference_data: Optional[Dict[str, str]] = None,
|
540
543
|
visualize_output: Optional[bool] = False,
|
544
|
+
self_reflection: Optional[bool] = True,
|
541
545
|
) -> Tuple[str, List[Dict]]:
|
542
546
|
"""Chat with the vision agent and return the final answer and all tool results.
|
543
547
|
|
@@ -550,6 +554,7 @@ class VisionAgent(Agent):
|
|
550
554
|
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
551
555
|
where the bounding box coordinates are normalized.
|
552
556
|
visualize_output: Whether to visualize the output.
|
557
|
+
self_reflection: Whether to run self reflection; when False, reflection is skipped.
|
553
558
|
|
554
559
|
Returns:
|
555
560
|
A tuple where the first item is the final answer and the second item is a
|
@@ -625,20 +630,25 @@ class VisionAgent(Agent):
|
|
625
630
|
reflection_images = [image]
|
626
631
|
else:
|
627
632
|
reflection_images = None
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
633
|
+
|
634
|
+
if self_reflection:
|
635
|
+
reflection = self_reflect(
|
636
|
+
self.reflect_model,
|
637
|
+
question,
|
638
|
+
self.tools,
|
639
|
+
all_tool_results,
|
640
|
+
final_answer,
|
641
|
+
reflection_images,
|
642
|
+
)
|
643
|
+
self.log_progress(f"Reflection: {reflection}")
|
644
|
+
parsed_reflection = parse_reflect(reflection)
|
645
|
+
if parsed_reflection["Finish"]:
|
646
|
+
break
|
647
|
+
else:
|
648
|
+
reflections += "\n" + parsed_reflection["Reflection"]
|
640
649
|
else:
|
641
|
-
|
650
|
+
self.log_progress("Self Reflection skipped based on user request.")
|
651
|
+
break
|
642
652
|
# '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
|
643
653
|
self.log_progress(
|
644
654
|
f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
|
@@ -660,12 +670,14 @@ class VisionAgent(Agent):
|
|
660
670
|
image: Optional[Union[str, Path]] = None,
|
661
671
|
reference_data: Optional[Dict[str, str]] = None,
|
662
672
|
visualize_output: Optional[bool] = False,
|
673
|
+
self_reflection: Optional[bool] = True,
|
663
674
|
) -> str:
|
664
675
|
answer, _ = self.chat_with_workflow(
|
665
676
|
chat,
|
666
677
|
image=image,
|
667
678
|
visualize_output=visualize_output,
|
668
679
|
reference_data=reference_data,
|
680
|
+
self_reflection=self_reflection,
|
669
681
|
)
|
670
682
|
return answer
|
671
683
|
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -9,6 +9,7 @@ import numpy as np
|
|
9
9
|
import requests
|
10
10
|
from PIL import Image
|
11
11
|
from PIL.Image import Image as ImageType
|
12
|
+
from scipy.spatial import distance # type: ignore
|
12
13
|
|
13
14
|
from vision_agent.image_utils import (
|
14
15
|
b64_to_pil,
|
@@ -544,7 +545,7 @@ class VisualPromptCounting(Tool):
|
|
544
545
|
-------
|
545
546
|
>>> import vision_agent as va
|
546
547
|
>>> prompt_count = va.tools.VisualPromptCounting()
|
547
|
-
>>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42
|
548
|
+
>>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
|
548
549
|
{'count': 23}
|
549
550
|
"""
|
550
551
|
|
@@ -554,52 +555,60 @@ class VisualPromptCounting(Tool):
|
|
554
555
|
usage = {
|
555
556
|
"required_parameters": [
|
556
557
|
{"name": "image", "type": "str"},
|
557
|
-
{"name": "prompt", "type": "str"},
|
558
|
+
{"name": "prompt", "type": "Dict[str, List[float]"},
|
558
559
|
],
|
559
560
|
"examples": [
|
560
561
|
{
|
561
562
|
"scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
|
562
|
-
"parameters": {
|
563
|
+
"parameters": {
|
564
|
+
"image": "lids.jpg",
|
565
|
+
"prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
|
566
|
+
},
|
563
567
|
},
|
564
568
|
{
|
565
|
-
"scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
|
566
|
-
"parameters": {
|
569
|
+
"scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
|
570
|
+
"parameters": {
|
571
|
+
"image": "tray.jpg",
|
572
|
+
"prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
|
573
|
+
},
|
567
574
|
},
|
568
575
|
{
|
569
|
-
"scenario": "Can you count this item based on an example, reference_data: '
|
576
|
+
"scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
|
570
577
|
"parameters": {
|
571
578
|
"image": "shirts.jpg",
|
572
|
-
"prompt": "
|
579
|
+
"prompt": {"bbox": [100, 115, 200, 200]},
|
573
580
|
},
|
574
581
|
},
|
575
582
|
{
|
576
|
-
"scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
|
583
|
+
"scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
|
577
584
|
"parameters": {
|
578
585
|
"image": "shoes.jpg",
|
579
|
-
"prompt": "0.1, 0.1, 0.6, 0.65
|
586
|
+
"prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
|
580
587
|
},
|
581
588
|
},
|
582
589
|
],
|
583
590
|
}
|
584
591
|
|
585
|
-
|
586
|
-
|
592
|
+
def __call__(
|
593
|
+
self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
|
594
|
+
) -> Dict:
|
587
595
|
"""Invoke the few shot counting model.
|
588
596
|
|
589
597
|
Parameters:
|
590
598
|
image: the input image.
|
599
|
+
prompt: the visual prompt which is a bounding box describing the object.
|
591
600
|
|
592
601
|
Returns:
|
593
602
|
A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
|
594
603
|
"""
|
595
604
|
image_size = get_image_size(image)
|
596
|
-
bbox = [
|
597
|
-
|
605
|
+
bbox = prompt["bbox"]
|
606
|
+
bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
|
598
607
|
image_b64 = convert_to_b64(image)
|
599
608
|
|
600
609
|
data = {
|
601
610
|
"image": image_b64,
|
602
|
-
"prompt":
|
611
|
+
"prompt": bbox_str,
|
603
612
|
"tool": "few_shot_counting",
|
604
613
|
}
|
605
614
|
resp_data = _send_inference_request(data, "tools")
|
@@ -878,7 +887,7 @@ class SegIoU(Tool):
|
|
878
887
|
],
|
879
888
|
"examples": [
|
880
889
|
{
|
881
|
-
"scenario": "
|
890
|
+
"scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
|
882
891
|
"parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
|
883
892
|
}
|
884
893
|
],
|
@@ -947,6 +956,46 @@ class BboxContains(Tool):
|
|
947
956
|
}
|
948
957
|
|
949
958
|
|
959
|
+
class ObjectDistance(Tool):
|
960
|
+
name = "object_distance_"
|
961
|
+
description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
|
962
|
+
usage = {
|
963
|
+
"required_parameters": [
|
964
|
+
{"name": "object1", "type": "Dict[str, Any]"},
|
965
|
+
{"name": "object2", "type": "Dict[str, Any]"},
|
966
|
+
],
|
967
|
+
"examples": [
|
968
|
+
{
|
969
|
+
"scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
|
970
|
+
"parameters": {
|
971
|
+
"object1": {
|
972
|
+
"bboxes": [0.2, 0.21, 0.34, 0.42],
|
973
|
+
"scores": 0.54,
|
974
|
+
"masks": "mask_file1.png",
|
975
|
+
},
|
976
|
+
"object2": {
|
977
|
+
"bboxes": [0.3, 0.31, 0.44, 0.52],
|
978
|
+
"scores": 0.66,
|
979
|
+
"masks": "mask_file2.png",
|
980
|
+
},
|
981
|
+
},
|
982
|
+
}
|
983
|
+
],
|
984
|
+
}
|
985
|
+
|
986
|
+
def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
|
987
|
+
if "masks" in object1 and "masks" in object2:
|
988
|
+
mask1 = object1["masks"]
|
989
|
+
mask2 = object2["masks"]
|
990
|
+
return MaskDistance()(mask1, mask2)
|
991
|
+
elif "bboxes" in object1 and "bboxes" in object2:
|
992
|
+
bbox1 = object1["bboxes"]
|
993
|
+
bbox2 = object2["bboxes"]
|
994
|
+
return BoxDistance()(bbox1, bbox2)
|
995
|
+
else:
|
996
|
+
raise ValueError("Either of the objects should have masks or bboxes")
|
997
|
+
|
998
|
+
|
950
999
|
class BoxDistance(Tool):
|
951
1000
|
name = "box_distance_"
|
952
1001
|
description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
|
@@ -957,7 +1006,7 @@ class BoxDistance(Tool):
|
|
957
1006
|
],
|
958
1007
|
"examples": [
|
959
1008
|
{
|
960
|
-
"scenario": "Calculate the distance between
|
1009
|
+
"scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
|
961
1010
|
"parameters": {
|
962
1011
|
"bbox1": [0.2, 0.21, 0.34, 0.42],
|
963
1012
|
"bbox2": [0.3, 0.31, 0.44, 0.52],
|
@@ -976,6 +1025,34 @@ class BoxDistance(Tool):
|
|
976
1025
|
return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
|
977
1026
|
|
978
1027
|
|
1028
|
+
class MaskDistance(Tool):
|
1029
|
+
name = "mask_distance_"
|
1030
|
+
description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
|
1031
|
+
usage = {
|
1032
|
+
"required_parameters": [
|
1033
|
+
{"name": "mask1", "type": "str"},
|
1034
|
+
{"name": "mask2", "type": "str"},
|
1035
|
+
],
|
1036
|
+
"examples": [
|
1037
|
+
{
|
1038
|
+
"scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
|
1039
|
+
"parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
|
1040
|
+
}
|
1041
|
+
],
|
1042
|
+
}
|
1043
|
+
|
1044
|
+
def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
|
1045
|
+
pil_mask1 = Image.open(str(mask1))
|
1046
|
+
pil_mask2 = Image.open(str(mask2))
|
1047
|
+
np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
|
1048
|
+
np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
|
1049
|
+
|
1050
|
+
mask1_points = np.transpose(np.nonzero(np_mask1))
|
1051
|
+
mask2_points = np.transpose(np.nonzero(np_mask2))
|
1052
|
+
dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
|
1053
|
+
return cast(float, np.round(np.min(dist_matrix), 2))
|
1054
|
+
|
1055
|
+
|
979
1056
|
class ExtractFrames(Tool):
|
980
1057
|
r"""Extract frames from a video."""
|
981
1058
|
|
@@ -1110,10 +1187,9 @@ TOOLS = {
|
|
1110
1187
|
Crop,
|
1111
1188
|
BboxArea,
|
1112
1189
|
SegArea,
|
1113
|
-
|
1114
|
-
SegIoU,
|
1190
|
+
ObjectDistance,
|
1115
1191
|
BboxContains,
|
1116
|
-
|
1192
|
+
SegIoU,
|
1117
1193
|
OCR,
|
1118
1194
|
Calculator,
|
1119
1195
|
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.12
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -17,6 +17,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
17
17
|
Requires-Dist: pillow (>=10.0.0,<11.0.0)
|
18
18
|
Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
19
19
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
20
|
+
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
20
21
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
21
22
|
Requires-Dist: tqdm (>=4.64.0,<5.0.0)
|
22
23
|
Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
|
@@ -149,7 +150,7 @@ you. For example:
|
|
149
150
|
|
150
151
|
#### Custom Tools
|
151
152
|
You can also add your own custom tools for your vision agent to use:
|
152
|
-
|
153
|
+
|
153
154
|
```python
|
154
155
|
from vision_agent.tools import Tool, register_tool
|
155
156
|
@register_tool
|
@@ -187,6 +188,7 @@ find an example that creates a custom tool for template matching [here](examples
|
|
187
188
|
| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
|
188
189
|
| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
|
189
190
|
| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
|
191
|
+
| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. |
|
190
192
|
| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
|
191
193
|
| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
|
192
194
|
| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
|
@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
|
|
5
5
|
vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
|
6
6
|
vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
|
7
7
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
8
|
-
vision_agent/agent/vision_agent.py,sha256=
|
8
|
+
vision_agent/agent/vision_agent.py,sha256=5W5Xr_h4yDMsFvIk2JWcfMlYoPYmTv3JZnrDDumuZgM,26842
|
9
9
|
vision_agent/agent/vision_agent_prompts.py,sha256=moihXFhEzFw8xnf2sUSgd_k9eoxQam3T6XUkB0fyp5o,8570
|
10
10
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
@@ -14,12 +14,12 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
|
|
14
14
|
vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
|
15
15
|
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
16
16
|
vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
|
17
|
-
vision_agent/tools/__init__.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=uWySwcIeQMH57PVN6lVIknTx-SFmN_J0mvn_HbGlXcQ,451
|
18
18
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
19
|
-
vision_agent/tools/tools.py,sha256=
|
19
|
+
vision_agent/tools/tools.py,sha256=kqwmKPbuSAGOWjzv2LCjsvUAp2mfRk8X5a1DrP2B4i8,47007
|
20
20
|
vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
|
21
21
|
vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
|
22
|
-
vision_agent-0.2.
|
23
|
-
vision_agent-0.2.
|
24
|
-
vision_agent-0.2.
|
25
|
-
vision_agent-0.2.
|
22
|
+
vision_agent-0.2.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
23
|
+
vision_agent-0.2.12.dist-info/METADATA,sha256=IWJjflG4JW4ZuMzyTw1Rq6IHK-YuO_YCfp_nJ-J0LiY,9073
|
24
|
+
vision_agent-0.2.12.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
25
|
+
vision_agent-0.2.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|