vision-agent 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -585,7 +585,7 @@ class VisionAgent(Agent):
585
585
  self.task_model, question, self.tools, reflections
586
586
  )
587
587
 
588
- task_depend = {"Original Quesiton": question}
588
+ task_depend = {"Original Question": question}
589
589
  previous_log = ""
590
590
  answers = []
591
591
  for task in task_list:
@@ -5,7 +5,9 @@ from .tools import ( # Counter,
5
5
  TOOLS,
6
6
  BboxArea,
7
7
  BboxIoU,
8
+ ObjectDistance,
8
9
  BoxDistance,
10
+ MaskDistance,
9
11
  Crop,
10
12
  DINOv,
11
13
  ExtractFrames,
@@ -9,6 +9,7 @@ import numpy as np
9
9
  import requests
10
10
  from PIL import Image
11
11
  from PIL.Image import Image as ImageType
12
+ from scipy.spatial import distance # type: ignore
12
13
 
13
14
  from vision_agent.image_utils import (
14
15
  b64_to_pil,
@@ -544,7 +545,7 @@ class VisualPromptCounting(Tool):
544
545
  -------
545
546
  >>> import vision_agent as va
546
547
  >>> prompt_count = va.tools.VisualPromptCounting()
547
- >>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")
548
+ >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
548
549
  {'count': 23}
549
550
  """
550
551
 
@@ -554,52 +555,60 @@ class VisualPromptCounting(Tool):
554
555
  usage = {
555
556
  "required_parameters": [
556
557
  {"name": "image", "type": "str"},
557
- {"name": "prompt", "type": "str"},
558
+ {"name": "prompt", "type": "Dict[str, List[float]"},
558
559
  ],
559
560
  "examples": [
560
561
  {
561
562
  "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
562
- "parameters": {"image": "lids.jpg", "prompt": "0.1, 0.1, 0.14, 0.2"},
563
+ "parameters": {
564
+ "image": "lids.jpg",
565
+ "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
566
+ },
563
567
  },
564
568
  {
565
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
566
- "parameters": {"image": "tray.jpg", "prompt": "0.1, 0.1, 0.2, 0.25"},
569
+ "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
570
+ "parameters": {
571
+ "image": "tray.jpg",
572
+ "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
573
+ },
567
574
  },
568
575
  {
569
- "scenario": "Can you count this item based on an example, reference_data: '0.1, 0.15, 0.2, 0.2' ? Image name: shirts.jpg",
576
+ "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
570
577
  "parameters": {
571
578
  "image": "shirts.jpg",
572
- "prompt": "0.1, 0.15, 0.2, 0.2",
579
+ "prompt": {"bbox": [100, 115, 200, 200]},
573
580
  },
574
581
  },
575
582
  {
576
- "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
583
+ "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
577
584
  "parameters": {
578
585
  "image": "shoes.jpg",
579
- "prompt": "0.1, 0.1, 0.6, 0.65",
586
+ "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
580
587
  },
581
588
  },
582
589
  ],
583
590
  }
584
591
 
585
- # TODO: Add support for input multiple images, which aligns with the output type.
586
- def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
592
+ def __call__(
593
+ self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
594
+ ) -> Dict:
587
595
  """Invoke the few shot counting model.
588
596
 
589
597
  Parameters:
590
598
  image: the input image.
599
+ prompt: the visual prompt which is a bounding box describing the object.
591
600
 
592
601
  Returns:
593
602
  A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
594
603
  """
595
604
  image_size = get_image_size(image)
596
- bbox = [float(x) for x in prompt.split(",")]
597
- prompt = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
605
+ bbox = prompt["bbox"]
606
+ bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
598
607
  image_b64 = convert_to_b64(image)
599
608
 
600
609
  data = {
601
610
  "image": image_b64,
602
- "prompt": prompt,
611
+ "prompt": bbox_str,
603
612
  "tool": "few_shot_counting",
604
613
  }
605
614
  resp_data = _send_inference_request(data, "tools")
@@ -878,7 +887,7 @@ class SegIoU(Tool):
878
887
  ],
879
888
  "examples": [
880
889
  {
881
- "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
890
+ "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
882
891
  "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
883
892
  }
884
893
  ],
@@ -947,6 +956,46 @@ class BboxContains(Tool):
947
956
  }
948
957
 
949
958
 
959
+ class ObjectDistance(Tool):
960
+ name = "object_distance_"
961
+ description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects."
962
+ usage = {
963
+ "required_parameters": [
964
+ {"name": "object1", "type": "Dict[str, Any]"},
965
+ {"name": "object2", "type": "Dict[str, Any]"},
966
+ ],
967
+ "examples": [
968
+ {
969
+ "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}",
970
+ "parameters": {
971
+ "object1": {
972
+ "bboxes": [0.2, 0.21, 0.34, 0.42],
973
+ "scores": 0.54,
974
+ "masks": "mask_file1.png",
975
+ },
976
+ "object2": {
977
+ "bboxes": [0.3, 0.31, 0.44, 0.52],
978
+ "scores": 0.66,
979
+ "masks": "mask_file2.png",
980
+ },
981
+ },
982
+ }
983
+ ],
984
+ }
985
+
986
+ def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float:
987
+ if "masks" in object1 and "masks" in object2:
988
+ mask1 = object1["masks"]
989
+ mask2 = object2["masks"]
990
+ return MaskDistance()(mask1, mask2)
991
+ elif "bboxes" in object1 and "bboxes" in object2:
992
+ bbox1 = object1["bboxes"]
993
+ bbox2 = object2["bboxes"]
994
+ return BoxDistance()(bbox1, bbox2)
995
+ else:
996
+ raise ValueError("Either of the objects should have masks or bboxes")
997
+
998
+
950
999
  class BoxDistance(Tool):
951
1000
  name = "box_distance_"
952
1001
  description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes"
@@ -957,7 +1006,7 @@ class BoxDistance(Tool):
957
1006
  ],
958
1007
  "examples": [
959
1008
  {
960
- "scenario": "Calculate the distance between the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
1009
+ "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
961
1010
  "parameters": {
962
1011
  "bbox1": [0.2, 0.21, 0.34, 0.42],
963
1012
  "bbox2": [0.3, 0.31, 0.44, 0.52],
@@ -976,6 +1025,34 @@ class BoxDistance(Tool):
976
1025
  return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
977
1026
 
978
1027
 
1028
+ class MaskDistance(Tool):
1029
+ name = "mask_distance_"
1030
+ description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. It returns the minumum distance between the given masks"
1031
+ usage = {
1032
+ "required_parameters": [
1033
+ {"name": "mask1", "type": "str"},
1034
+ {"name": "mask2", "type": "str"},
1035
+ ],
1036
+ "examples": [
1037
+ {
1038
+ "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
1039
+ "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
1040
+ }
1041
+ ],
1042
+ }
1043
+
1044
+ def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
1045
+ pil_mask1 = Image.open(str(mask1))
1046
+ pil_mask2 = Image.open(str(mask2))
1047
+ np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
1048
+ np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
1049
+
1050
+ mask1_points = np.transpose(np.nonzero(np_mask1))
1051
+ mask2_points = np.transpose(np.nonzero(np_mask2))
1052
+ dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
1053
+ return cast(float, np.round(np.min(dist_matrix), 2))
1054
+
1055
+
979
1056
  class ExtractFrames(Tool):
980
1057
  r"""Extract frames from a video."""
981
1058
 
@@ -1110,10 +1187,9 @@ TOOLS = {
1110
1187
  Crop,
1111
1188
  BboxArea,
1112
1189
  SegArea,
1113
- BboxIoU,
1114
- SegIoU,
1190
+ ObjectDistance,
1115
1191
  BboxContains,
1116
- BoxDistance,
1192
+ SegIoU,
1117
1193
  OCR,
1118
1194
  Calculator,
1119
1195
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.9
3
+ Version: 0.2.11
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -17,6 +17,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
17
17
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
18
18
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
19
19
  Requires-Dist: requests (>=2.0.0,<3.0.0)
20
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
20
21
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
21
22
  Requires-Dist: tqdm (>=4.64.0,<5.0.0)
22
23
  Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
@@ -105,6 +106,30 @@ the individual steps and tools to get the answer:
105
106
  {"visualize_output": "final_output.png"}]
106
107
  ```
107
108
 
109
+ You can also provide reference data for the model to utilize. For example, if you want
110
+ to utilize VisualPromptCounting:
111
+
112
+ ```python
113
+ agent(
114
+ "How many apples are in this image?",
115
+ image="apples.jpg",
116
+ reference_data={"bbox": [0.1, 0.11, 0.24, 0.25]},
117
+ )
118
+ ```
119
+ Here, `[0.1, 0.11, 0.24, 0.25]` are the normalized bounding box coordinates of an apple.
120
+ Similarly for DINOv you can provide a reference image and mask:
121
+
122
+ ```python
123
+ agent(
124
+ "Can you detect all of the objects similar to the mask I've provided?",
125
+ image="image.jpg",
126
+ reference_data={"mask": "reference_mask.png", "image": "reference_image.png"},
127
+ )
128
+ ```
129
+ Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any
130
+ image with its corresponding mask that is the object you want to detect in `image.jpg`.
131
+ You can find a demo app to generate masks for DINOv [here](examples/mask_app/).
132
+
108
133
  ### Tools
109
134
  There are a variety of tools for the model or the user to use. Some are executed locally
110
135
  while others are hosted for you. You can also ask an LLM directly to build a tool for
@@ -127,25 +152,26 @@ you. For example:
127
152
  You can also add your own custom tools for your vision agent to use:
128
153
 
129
154
  ```python
130
- >>> from vision_agent.tools import Tool, register_tool
131
- >>> @register_tool
132
- >>> class NumItems(Tool):
133
- >>> name = "num_items_"
134
- >>> description = "Returns the number of items in a list."
135
- >>> usage = {
136
- >>> "required_parameters": [{"name": "prompt", "type": "list"}],
137
- >>> "examples": [
138
- >>> {
139
- >>> "scenario": "How many items are in this list? ['a', 'b', 'c']",
140
- >>> "parameters": {"prompt": "['a', 'b', 'c']"},
141
- >>> }
142
- >>> ],
143
- >>> }
144
- >>> def __call__(self, prompt: list[str]) -> int:
145
- >>> return len(prompt)
155
+ from vision_agent.tools import Tool, register_tool
156
+ @register_tool
157
+ class NumItems(Tool):
158
+ name = "num_items_"
159
+ description = "Returns the number of items in a list."
160
+ usage = {
161
+ "required_parameters": [{"name": "prompt", "type": "list"}],
162
+ "examples": [
163
+ {
164
+ "scenario": "How many items are in this list? ['a', 'b', 'c']",
165
+ "parameters": {"prompt": "['a', 'b', 'c']"},
166
+ }
167
+ ],
168
+ }
169
+ def __call__(self, prompt: list[str]) -> int:
170
+ return len(prompt)
146
171
  ```
147
172
  This will register it with the list of tools Vision Agent has access to. It will be able
148
- to pick it based on the tool description and use it based on the usage provided.
173
+ to pick it based on the tool description and use it based on the usage provided. You can
174
+ find an example that creates a custom tool for template matching [here](examples/custom_tools/).
149
175
 
150
176
  #### Tool List
151
177
  | Tool | Description |
@@ -164,8 +190,10 @@ to pick it based on the tool description and use it based on the usage provided.
164
190
  | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes, rounded to 2 decimal places. |
165
191
  | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
166
192
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
167
- | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
168
- | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
193
+ | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
194
+ | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt. |
195
+ | VisualQuestionAnswering | VisualQuestionAnswering is a tool that can explain the contents of an image and answer questions about the image. |
196
+ | ImageQuestionAnswering | ImageQuestionAnswering is similar to VisualQuestionAnswering but does not rely on OpenAI and instead uses a dedicated model for the task. |
169
197
  | OCR | OCR returns the text detected in an image along with the location. |
170
198
 
171
199
 
@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
5
5
  vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=PyAtzDl5h1Uasd-Fjzdl-NK9gdZ2ARxoF9y3tvap7PU,26243
8
+ vision_agent/agent/vision_agent.py,sha256=DVcvT02GjY85mCjhHgJGrhI_dpUvjZhoYzYik9bkHQA,26243
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=moihXFhEzFw8xnf2sUSgd_k9eoxQam3T6XUkB0fyp5o,8570
10
10
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -14,12 +14,12 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
14
14
  vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
15
15
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
16
16
  vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
17
- vision_agent/tools/__init__.py,sha256=HfUr0JQUwk0Kyieen93df9lMbbdpVf9Q6CcVFmKv_q4,413
17
+ vision_agent/tools/__init__.py,sha256=uWySwcIeQMH57PVN6lVIknTx-SFmN_J0mvn_HbGlXcQ,451
18
18
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
19
- vision_agent/tools/tools.py,sha256=EvNDLUxe-Ed8-meHInTIiX3aySLUXFBsAWwL0Is5S1o,43823
19
+ vision_agent/tools/tools.py,sha256=kqwmKPbuSAGOWjzv2LCjsvUAp2mfRk8X5a1DrP2B4i8,47007
20
20
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
21
21
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
22
- vision_agent-0.2.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- vision_agent-0.2.9.dist-info/METADATA,sha256=jyfAwSfDnObeILoLyfB8ijuLLpZUWd-Fvg-xncEMCYc,7697
24
- vision_agent-0.2.9.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
- vision_agent-0.2.9.dist-info/RECORD,,
22
+ vision_agent-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
+ vision_agent-0.2.11.dist-info/METADATA,sha256=kg0CzT1ncFoXAg4ayP2ppStbFwHnzKAygH_t6XmKTxQ,8970
24
+ vision_agent-0.2.11.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
+ vision_agent-0.2.11.dist-info/RECORD,,