vision-agent 0.0.52__tar.gz → 0.0.53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.0.52 → vision_agent-0.0.53}/PKG-INFO +1 -1
- {vision_agent-0.0.52 → vision_agent-0.0.53}/pyproject.toml +1 -1
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/vision_agent.py +1 -1
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/__init__.py +1 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/tools.py +69 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/LICENSE +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/README.md +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/easytool.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/reflexion.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/data/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/data/data.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/emb/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/emb/emb.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/image_utils.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.0.52 → vision_agent-0.0.53}/vision_agent/tools/video.py +0 -0
@@ -351,7 +351,7 @@ class VisionAgent(Agent):
|
|
351
351
|
task_model: Optional[Union[LLM, LMM]] = None,
|
352
352
|
answer_model: Optional[Union[LLM, LMM]] = None,
|
353
353
|
reflect_model: Optional[Union[LLM, LMM]] = None,
|
354
|
-
max_retries: int =
|
354
|
+
max_retries: int = 2,
|
355
355
|
verbose: bool = False,
|
356
356
|
report_progress_callback: Optional[Callable[[str], None]] = None,
|
357
357
|
):
|
@@ -144,6 +144,74 @@ class CLIP(Tool):
|
|
144
144
|
return resp_json["data"] # type: ignore
|
145
145
|
|
146
146
|
|
147
|
+
class ImageCaption(Tool):
|
148
|
+
r"""ImageCaption is a tool that can caption an image based on its contents
|
149
|
+
or tags.
|
150
|
+
|
151
|
+
Example
|
152
|
+
-------
|
153
|
+
>>> import vision_agent as va
|
154
|
+
>>> caption = va.tools.ImageCaption()
|
155
|
+
>>> caption("image1.jpg")
|
156
|
+
{'text': ['a box of orange and white socks']}
|
157
|
+
"""
|
158
|
+
|
159
|
+
_ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
|
160
|
+
|
161
|
+
name = "image_caption_"
|
162
|
+
description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
|
163
|
+
usage = {
|
164
|
+
"required_parameters": [
|
165
|
+
{"name": "image", "type": "str"},
|
166
|
+
],
|
167
|
+
"examples": [
|
168
|
+
{
|
169
|
+
"scenario": "Can you describe this image ? Image name: cat.jpg",
|
170
|
+
"parameters": {"image": "cat.jpg"},
|
171
|
+
},
|
172
|
+
{
|
173
|
+
"scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
|
174
|
+
"parameters": {"image": "cat_dog.jpg"},
|
175
|
+
},
|
176
|
+
{
|
177
|
+
"scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
|
178
|
+
"parameters": {
|
179
|
+
"image": "shirts.jpg",
|
180
|
+
},
|
181
|
+
},
|
182
|
+
],
|
183
|
+
}
|
184
|
+
|
185
|
+
# TODO: Add support for multiple input images, which would align with the list-valued output type.
|
186
|
+
def __call__(self, image: Union[str, ImageType]) -> Dict:
|
187
|
+
"""Invoke the Image captioning model.
|
188
|
+
|
189
|
+
Parameters:
|
190
|
+
image: the input image to caption.
|
191
|
+
|
192
|
+
Returns:
|
193
|
+
A dictionary whose "text" key holds a list of caption strings describing the image. E.g. {"text": ["a box of orange and white socks"]}
|
194
|
+
"""
|
195
|
+
image_b64 = convert_to_b64(image)
|
196
|
+
data = {
|
197
|
+
"image": image_b64,
|
198
|
+
"tool": "image_captioning",
|
199
|
+
}
|
200
|
+
res = requests.post(
|
201
|
+
self._ENDPOINT,
|
202
|
+
headers={"Content-Type": "application/json"},
|
203
|
+
json=data,
|
204
|
+
)
|
205
|
+
resp_json: Dict[str, Any] = res.json()
|
206
|
+
if (
|
207
|
+
"statusCode" in resp_json and resp_json["statusCode"] != 200
|
208
|
+
) or "statusCode" not in resp_json:
|
209
|
+
_LOGGER.error(f"Request failed: {resp_json}")
|
210
|
+
raise ValueError(f"Request failed: {resp_json}")
|
211
|
+
|
212
|
+
return resp_json["data"] # type: ignore
|
213
|
+
|
214
|
+
|
147
215
|
class GroundingDINO(Tool):
|
148
216
|
r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
|
149
217
|
category names or referring expressions.
|
@@ -631,6 +699,7 @@ TOOLS = {
|
|
631
699
|
[
|
632
700
|
NoOp,
|
633
701
|
CLIP,
|
702
|
+
ImageCaption,
|
634
703
|
GroundingDINO,
|
635
704
|
AgentGroundingSAM,
|
636
705
|
ExtractFrames,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|