vision-agent 0.0.52__tar.gz → 0.0.53__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {vision_agent-0.0.52 → vision_agent-0.0.53}/PKG-INFO +1 -1
  2. {vision_agent-0.0.52 → vision_agent-0.0.53}/pyproject.toml +1 -1
  3. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/vision_agent.py +1 -1
  4. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/__init__.py +1 -0
  5. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/tools.py +69 -0
  6. {vision_agent-0.0.52 → vision_agent-0.0.53}/LICENSE +0 -0
  7. {vision_agent-0.0.52 → vision_agent-0.0.53}/README.md +0 -0
  8. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/__init__.py +0 -0
  9. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/__init__.py +0 -0
  10. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/agent.py +0 -0
  11. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/easytool.py +0 -0
  12. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/easytool_prompts.py +0 -0
  13. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/reflexion.py +0 -0
  14. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/reflexion_prompts.py +0 -0
  15. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/vision_agent_prompts.py +0 -0
  16. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/data/__init__.py +0 -0
  17. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/data/data.py +0 -0
  18. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/emb/__init__.py +0 -0
  19. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/emb/emb.py +0 -0
  20. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/fonts/__init__.py +0 -0
  21. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  22. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/image_utils.py +0 -0
  23. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/llm/__init__.py +0 -0
  24. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/llm/llm.py +0 -0
  25. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/lmm/__init__.py +0 -0
  26. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/lmm/lmm.py +0 -0
  27. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/prompts.py +0 -0
  28. {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.0.52
3
+ Version: 0.0.53
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.0.52"
7
+ version = "0.0.53"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -351,7 +351,7 @@ class VisionAgent(Agent):
351
351
  task_model: Optional[Union[LLM, LMM]] = None,
352
352
  answer_model: Optional[Union[LLM, LMM]] = None,
353
353
  reflect_model: Optional[Union[LLM, LMM]] = None,
354
- max_retries: int = 3,
354
+ max_retries: int = 2,
355
355
  verbose: bool = False,
356
356
  report_progress_callback: Optional[Callable[[str], None]] = None,
357
357
  ):
@@ -9,6 +9,7 @@ from .tools import (
9
9
  ExtractFrames,
10
10
  GroundingDINO,
11
11
  GroundingSAM,
12
+ ImageCaption,
12
13
  SegArea,
13
14
  SegIoU,
14
15
  Tool,
@@ -144,6 +144,74 @@ class CLIP(Tool):
144
144
  return resp_json["data"] # type: ignore
145
145
 
146
146
 
147
+ class ImageCaption(Tool):
148
+ r"""ImageCaption is a tool that can caption an image based on its contents
149
+ or tags.
150
+
151
+ Example
152
+ -------
153
+ >>> import vision_agent as va
154
+ >>> caption = va.tools.ImageCaption()
155
+ >>> caption("image1.jpg")
156
+ {'text': ['a box of orange and white socks']}
157
+ """
158
+
159
+ _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
160
+
161
+ name = "image_caption_"
162
+ description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
163
+ usage = {
164
+ "required_parameters": [
165
+ {"name": "image", "type": "str"},
166
+ ],
167
+ "examples": [
168
+ {
169
+ "scenario": "Can you describe this image ? Image name: cat.jpg",
170
+ "parameters": {"image": "cat.jpg"},
171
+ },
172
+ {
173
+ "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
174
+ "parameters": {"image": "cat_dog.jpg"},
175
+ },
176
+ {
177
+ "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
178
+ "parameters": {
179
+ "image": "shirts.jpg",
180
+ },
181
+ },
182
+ ],
183
+ }
184
+
185
+ # TODO: Add support for input multiple images, which aligns with the output type.
186
+ def __call__(self, image: Union[str, ImageType]) -> Dict:
187
+ """Invoke the Image captioning model.
188
+
189
+ Parameters:
190
+ image: the input image to caption.
191
+
192
+ Returns:
193
+ A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
194
+ """
195
+ image_b64 = convert_to_b64(image)
196
+ data = {
197
+ "image": image_b64,
198
+ "tool": "image_captioning",
199
+ }
200
+ res = requests.post(
201
+ self._ENDPOINT,
202
+ headers={"Content-Type": "application/json"},
203
+ json=data,
204
+ )
205
+ resp_json: Dict[str, Any] = res.json()
206
+ if (
207
+ "statusCode" in resp_json and resp_json["statusCode"] != 200
208
+ ) or "statusCode" not in resp_json:
209
+ _LOGGER.error(f"Request failed: {resp_json}")
210
+ raise ValueError(f"Request failed: {resp_json}")
211
+
212
+ return resp_json["data"] # type: ignore
213
+
214
+
147
215
  class GroundingDINO(Tool):
148
216
  r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
149
217
  category names or referring expressions.
@@ -631,6 +699,7 @@ TOOLS = {
631
699
  [
632
700
  NoOp,
633
701
  CLIP,
702
+ ImageCaption,
634
703
  GroundingDINO,
635
704
  AgentGroundingSAM,
636
705
  ExtractFrames,
File without changes
File without changes