vision-agent 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -428,7 +428,7 @@ class VisionAgent(Agent):
428
428
  ):
429
429
  """VisionAgent constructor.
430
430
 
431
- Parameters
431
+ Parameters:
432
432
  task_model: the model to use for task decomposition.
433
433
  answer_model: the model to use for reasoning and concluding the answer.
434
434
  reflect_model: the model to use for self reflection.
@@ -504,6 +504,21 @@ class VisionAgent(Agent):
504
504
  reference_data: Optional[Dict[str, str]] = None,
505
505
  visualize_output: Optional[bool] = False,
506
506
  ) -> Tuple[str, List[Dict]]:
507
+ """Chat with the vision agent and return the final answer and all tool results.
508
+
509
+ Parameters:
510
+ chat: a conversation in the format of
511
+ [{"role": "user", "content": "describe your task here..."}].
512
+ image: the input image referenced in the chat parameter.
513
+ reference_data: a dictionary containing the reference image and mask, in the
514
+ format of {"image": "image.jpg", "mask": "mask.jpg"}.
515
+ visualize_output: whether to visualize the output.
516
+
517
+ Returns:
518
+ A tuple where the first item is the final answer and the second item is a
519
+ list of all the tool results. The last item in the tool results also
520
+ contains the visualized output.
521
+ """
507
522
  question = chat[0]["content"]
508
523
  if image:
509
524
  question += f" Image name: {image}"
@@ -108,8 +108,7 @@ class CLIP(Tool):
108
108
 
109
109
 
110
110
  class ImageCaption(Tool):
111
- r"""ImageCaption is a tool that can caption an image based on its contents
112
- or tags.
111
+ r"""ImageCaption is a tool that can caption an image based on its contents or tags.
113
112
 
114
113
  Example
115
114
  -------
@@ -120,26 +119,20 @@ class ImageCaption(Tool):
120
119
  """
121
120
 
122
121
  name = "image_caption_"
123
- description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
122
+ description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
124
123
  usage = {
125
124
  "required_parameters": [
126
125
  {"name": "image", "type": "str"},
127
126
  ],
128
127
  "examples": [
129
128
  {
130
- "scenario": "Can you describe this image ? Image name: cat.jpg",
129
+ "scenario": "Can you describe this image? Image name: cat.jpg",
131
130
  "parameters": {"image": "cat.jpg"},
132
131
  },
133
132
  {
134
- "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
133
+ "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
135
134
  "parameters": {"image": "cat_dog.jpg"},
136
135
  },
137
- {
138
- "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
139
- "parameters": {
140
- "image": "shirts.jpg",
141
- },
142
- },
143
136
  ],
144
137
  }
145
138
 
@@ -487,15 +480,15 @@ class ZeroShotCounting(Tool):
487
480
  ],
488
481
  "examples": [
489
482
  {
490
- "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
483
+ "scenario": "Can you count the lids in the image? Image name: lids.jpg",
491
484
  "parameters": {"image": "lids.jpg"},
492
485
  },
493
486
  {
494
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
487
+ "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
495
488
  "parameters": {"image": "tray.jpg"},
496
489
  },
497
490
  {
498
- "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
491
+ "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
499
492
  "parameters": {
500
493
  "image": "shirts.jpg",
501
494
  },
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -126,15 +126,18 @@ you. For example:
126
126
  | Tool | Description |
127
127
  | --- | --- |
128
128
  | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
129
+ | ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
129
130
  | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
130
131
  | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
131
- | Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
132
+ | DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
133
+ | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
132
134
  | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
133
135
  | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
134
136
  | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
135
137
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
136
138
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
137
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
+ | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
140
+ | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for checking whether one box is contained within another box. |
138
141
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
142
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
140
143
  | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt. |
@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
5
5
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=2VUMRVI6KAnmaUK-34wrgyfSQ2DAUm4g4QQcpqa2zao,24235
8
+ vision_agent/agent/vision_agent.py,sha256=6AtVaEQL0ksg1QkUBn_YhytYjRfH7-M4q7G6pnds9Ds,25002
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
10
10
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -16,10 +16,10 @@ vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,
16
16
  vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
17
17
  vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
18
18
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
19
- vision_agent/tools/tools.py,sha256=gCjHs5vJuGNBFsnJWFT7PX3wTyfHgtrgX1Eq9vqknN0,34979
19
+ vision_agent/tools/tools.py,sha256=Cwh7GNSnCYxyKKgusHlf-Cqd9NBjlbZG7d-GauQJCwI,34751
20
20
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
21
21
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
22
- vision_agent-0.2.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- vision_agent-0.2.2.dist-info/METADATA,sha256=dOZ9KWmhuVb5wvschxYBis8x79HwgOD3MmTKqyupggg,6434
24
- vision_agent-0.2.2.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
- vision_agent-0.2.2.dist-info/RECORD,,
22
+ vision_agent-0.2.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
+ vision_agent-0.2.3.dist-info/METADATA,sha256=cQnQTRlWBxf0aVwsMoJS4TiiAtN3SbU00nlCrbNNb9w,6748
24
+ vision_agent-0.2.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
+ vision_agent-0.2.3.dist-info/RECORD,,