vision-agent 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,12 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
8
8
  from PIL import Image
9
9
  from tabulate import tabulate
10
10
 
11
- from vision_agent.image_utils import overlay_bboxes, overlay_masks, overlay_heat_map
11
+ from vision_agent.image_utils import (
12
+ convert_to_b64,
13
+ overlay_bboxes,
14
+ overlay_heat_map,
15
+ overlay_masks,
16
+ )
12
17
  from vision_agent.llm import LLM, OpenAILLM
13
18
  from vision_agent.lmm import LMM, OpenAILMM
14
19
  from vision_agent.tools import TOOLS
@@ -423,7 +428,7 @@ class VisionAgent(Agent):
423
428
  ):
424
429
  """VisionAgent constructor.
425
430
 
426
- Parameters
431
+ Parameters:
427
432
  task_model: the model to use for task decomposition.
428
433
  answer_model: the model to use for reasoning and concluding the answer.
429
434
  reflect_model: the model to use for self reflection.
@@ -481,6 +486,17 @@ class VisionAgent(Agent):
481
486
  if self.report_progress_callback:
482
487
  self.report_progress_callback(description)
483
488
 
489
+ def _report_visualization_via_callback(
490
+ self, images: Sequence[Union[str, Path]]
491
+ ) -> None:
492
+ """This is intended for streaming the visualization images via the callback to the client side."""
493
+ if self.report_progress_callback:
494
+ self.report_progress_callback("<VIZ>")
495
+ if images:
496
+ for img in images:
497
+ self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
498
+ self.report_progress_callback("</VIZ>")
499
+
484
500
  def chat_with_workflow(
485
501
  self,
486
502
  chat: List[Dict[str, str]],
@@ -488,6 +504,21 @@ class VisionAgent(Agent):
488
504
  reference_data: Optional[Dict[str, str]] = None,
489
505
  visualize_output: Optional[bool] = False,
490
506
  ) -> Tuple[str, List[Dict]]:
507
+ """Chat with the vision agent and return the final answer and all tool results.
508
+
509
+ Parameters:
510
+ chat: a conversation in the format of
511
+ [{"role": "user", "content": "describe your task here..."}].
512
+ image: the input image referenced in the chat parameter.
513
+ reference_data: a dictionary containing the reference image and mask. In the
514
+ format of {"image": "image.jpg", "mask": "mask.jpg"}
515
+ visualize_output: whether to visualize the output.
516
+
517
+ Returns:
518
+ A tuple where the first item is the final answer and the second item is a
519
+ list of all the tool results. The last item in the tool results also
520
+ contains the visualized output.
521
+ """
491
522
  question = chat[0]["content"]
492
523
  if image:
493
524
  question += f" Image name: {image}"
@@ -577,9 +608,12 @@ class VisionAgent(Agent):
577
608
  )
578
609
 
579
610
  if visualize_output:
580
- visualized_output = all_tool_results[-1]["visualized_output"]
581
- for image in visualized_output:
582
- Image.open(image).show()
611
+ viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][
612
+ "visualized_output"
613
+ ]
614
+ self._report_visualization_via_callback(viz_images)
615
+ for img in viz_images:
616
+ Image.open(img).show()
583
617
 
584
618
  return final_answer, all_tool_results
585
619
 
@@ -4,7 +4,7 @@ import base64
4
4
  from importlib import resources
5
5
  from io import BytesIO
6
6
  from pathlib import Path
7
- from typing import Dict, Tuple, Union, List
7
+ from typing import Dict, List, Tuple, Union
8
8
 
9
9
  import numpy as np
10
10
  from PIL import Image, ImageDraw, ImageFont
@@ -108,7 +108,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
108
108
  data = Image.open(data)
109
109
  if isinstance(data, Image.Image):
110
110
  buffer = BytesIO()
111
- data.convert("RGB").save(buffer, format="JPEG")
111
+ data.convert("RGB").save(buffer, format="PNG")
112
112
  return base64.b64encode(buffer.getvalue()).decode("utf-8")
113
113
  else:
114
114
  arr_bytes = data.tobytes()
@@ -108,8 +108,7 @@ class CLIP(Tool):
108
108
 
109
109
 
110
110
  class ImageCaption(Tool):
111
- r"""ImageCaption is a tool that can caption an image based on its contents
112
- or tags.
111
+ r"""ImageCaption is a tool that can caption an image based on its contents or tags.
113
112
 
114
113
  Example
115
114
  -------
@@ -120,26 +119,20 @@ class ImageCaption(Tool):
120
119
  """
121
120
 
122
121
  name = "image_caption_"
123
- description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
122
+ description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
124
123
  usage = {
125
124
  "required_parameters": [
126
125
  {"name": "image", "type": "str"},
127
126
  ],
128
127
  "examples": [
129
128
  {
130
- "scenario": "Can you describe this image ? Image name: cat.jpg",
129
+ "scenario": "Can you describe this image? Image name: cat.jpg",
131
130
  "parameters": {"image": "cat.jpg"},
132
131
  },
133
132
  {
134
- "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
133
+ "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
135
134
  "parameters": {"image": "cat_dog.jpg"},
136
135
  },
137
- {
138
- "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
139
- "parameters": {
140
- "image": "shirts.jpg",
141
- },
142
- },
143
136
  ],
144
137
  }
145
138
 
@@ -487,15 +480,15 @@ class ZeroShotCounting(Tool):
487
480
  ],
488
481
  "examples": [
489
482
  {
490
- "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
483
+ "scenario": "Can you count the lids in the image? Image name: lids.jpg",
491
484
  "parameters": {"image": "lids.jpg"},
492
485
  },
493
486
  {
494
- "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
487
+ "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
495
488
  "parameters": {"image": "tray.jpg"},
496
489
  },
497
490
  {
498
- "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
491
+ "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
499
492
  "parameters": {
500
493
  "image": "shirts.jpg",
501
494
  },
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -126,15 +126,18 @@ you. For example:
126
126
  | Tool | Description |
127
127
  | --- | --- |
128
128
  | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
129
+ | ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
129
130
  | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
130
131
  | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
131
- | Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
132
+ | DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
133
+ | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
132
134
  | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
133
135
  | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
134
136
  | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
135
137
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
136
138
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
137
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
+ | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
140
+ | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for checking whether one box is contained within another box. |
138
141
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
139
142
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
140
143
  | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
5
5
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=MTxeV5_Sghqoe2aOW9EbNgiq61sVCcF3ZndJ7BZl6x0,23588
8
+ vision_agent/agent/vision_agent.py,sha256=6AtVaEQL0ksg1QkUBn_YhytYjRfH7-M4q7G6pnds9Ds,25002
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
10
10
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
12
- vision_agent/image_utils.py,sha256=Cg4aKO1tQiETT1gdsZ50XzORBtJnBFfMG2cKJyjaY6Q,7555
12
+ vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
13
13
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
14
14
  vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
15
15
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
16
16
  vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
17
17
  vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
18
18
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
19
- vision_agent/tools/tools.py,sha256=gCjHs5vJuGNBFsnJWFT7PX3wTyfHgtrgX1Eq9vqknN0,34979
19
+ vision_agent/tools/tools.py,sha256=Cwh7GNSnCYxyKKgusHlf-Cqd9NBjlbZG7d-GauQJCwI,34751
20
20
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
21
21
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
22
- vision_agent-0.2.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- vision_agent-0.2.1.dist-info/METADATA,sha256=RAD8NCAo5N12sccgSC5Q0j4hKwU_rVKg5p_eLE-Njdc,6434
24
- vision_agent-0.2.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
- vision_agent-0.2.1.dist-info/RECORD,,
22
+ vision_agent-0.2.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
+ vision_agent-0.2.3.dist-info/METADATA,sha256=cQnQTRlWBxf0aVwsMoJS4TiiAtN3SbU00nlCrbNNb9w,6748
24
+ vision_agent-0.2.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
+ vision_agent-0.2.3.dist-info/RECORD,,