vision-agent 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.5 → vision_agent-0.2.7}/PKG-INFO +1 -1
- {vision_agent-0.2.5 → vision_agent-0.2.7}/pyproject.toml +1 -1
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent.py +26 -10
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/image_utils.py +5 -7
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/tools.py +7 -2
- {vision_agent-0.2.5 → vision_agent-0.2.7}/LICENSE +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/README.md +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/tools/video.py +0 -0
- {vision_agent-0.2.5 → vision_agent-0.2.7}/vision_agent/type_defs.py +0 -0
@@ -314,6 +314,7 @@ def _handle_extract_frames(
|
|
314
314
|
image_to_data[image] = {
|
315
315
|
"bboxes": [],
|
316
316
|
"masks": [],
|
317
|
+
"heat_map": [],
|
317
318
|
"labels": [],
|
318
319
|
"scores": [],
|
319
320
|
}
|
@@ -340,9 +341,12 @@ def _handle_viz_tools(
|
|
340
341
|
return image_to_data
|
341
342
|
|
342
343
|
for param, call_result in zip(parameters, tool_result["call_results"]):
|
343
|
-
#
|
344
|
+
# Calls can fail, so we need to check if the call was successful. It can either:
|
345
|
+
# 1. return a str or some error that's not a dictionary
|
346
|
+
# 2. return a dictionary but not have the necessary keys
|
347
|
+
|
344
348
|
if not isinstance(call_result, dict) or (
|
345
|
-
"bboxes" not in call_result and "
|
349
|
+
"bboxes" not in call_result and "heat_map" not in call_result
|
346
350
|
):
|
347
351
|
return image_to_data
|
348
352
|
|
@@ -352,6 +356,7 @@ def _handle_viz_tools(
|
|
352
356
|
image_to_data[image] = {
|
353
357
|
"bboxes": [],
|
354
358
|
"masks": [],
|
359
|
+
"heat_map": [],
|
355
360
|
"labels": [],
|
356
361
|
"scores": [],
|
357
362
|
}
|
@@ -360,6 +365,8 @@ def _handle_viz_tools(
|
|
360
365
|
image_to_data[image]["labels"].extend(call_result.get("labels", []))
|
361
366
|
image_to_data[image]["scores"].extend(call_result.get("scores", []))
|
362
367
|
image_to_data[image]["masks"].extend(call_result.get("masks", []))
|
368
|
+
# only single heatmap is returned
|
369
|
+
image_to_data[image]["heat_map"].append(call_result.get("heat_map", []))
|
363
370
|
if "mask_shape" in call_result:
|
364
371
|
image_to_data[image]["mask_shape"] = call_result["mask_shape"]
|
365
372
|
|
@@ -480,9 +487,14 @@ class VisionAgent(Agent):
|
|
480
487
|
"""Invoke the vision agent.
|
481
488
|
|
482
489
|
Parameters:
|
483
|
-
|
490
|
+
chat: A conversation in the format of
|
484
491
|
[{"role": "user", "content": "describe your task here..."}].
|
485
|
-
image:
|
492
|
+
image: The input image referenced in the chat parameter.
|
493
|
+
reference_data: A dictionary containing the reference image, mask or bounding
|
494
|
+
box in the format of:
|
495
|
+
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
496
|
+
where the bounding box coordinates are normalized.
|
497
|
+
visualize_output: Whether to visualize the output.
|
486
498
|
|
487
499
|
Returns:
|
488
500
|
The result of the vision agent in text.
|
@@ -509,7 +521,9 @@ class VisionAgent(Agent):
|
|
509
521
|
self.report_progress_callback("<VIZ>")
|
510
522
|
if images:
|
511
523
|
for img in images:
|
512
|
-
self.report_progress_callback(
|
524
|
+
self.report_progress_callback(
|
525
|
+
f"<IMG>base:64{convert_to_b64(img)}</IMG>"
|
526
|
+
)
|
513
527
|
self.report_progress_callback("</VIZ>")
|
514
528
|
|
515
529
|
def chat_with_workflow(
|
@@ -522,12 +536,14 @@ class VisionAgent(Agent):
|
|
522
536
|
"""Chat with the vision agent and return the final answer and all tool results.
|
523
537
|
|
524
538
|
Parameters:
|
525
|
-
chat:
|
539
|
+
chat: A conversation in the format of
|
526
540
|
[{"role": "user", "content": "describe your task here..."}].
|
527
|
-
image:
|
528
|
-
reference_data:
|
529
|
-
|
530
|
-
|
541
|
+
image: The input image referenced in the chat parameter.
|
542
|
+
reference_data: A dictionary containing the reference image, mask or bounding
|
543
|
+
box in the format of:
|
544
|
+
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
545
|
+
where the bounding box coordinates are normalized.
|
546
|
+
visualize_output: Whether to visualize the output.
|
531
547
|
|
532
548
|
Returns:
|
533
549
|
A tuple where the first item is the final answer and the second item is a
|
@@ -211,7 +211,7 @@ def overlay_masks(
|
|
211
211
|
}
|
212
212
|
|
213
213
|
for label, mask in zip(masks["labels"], masks["masks"]):
|
214
|
-
if isinstance(mask, str):
|
214
|
+
if isinstance(mask, str) or isinstance(mask, Path):
|
215
215
|
mask = np.array(Image.open(mask))
|
216
216
|
np_mask = np.zeros((image.size[1], image.size[0], 4))
|
217
217
|
np_mask[mask > 0, :] = color[label] + (255 * alpha,)
|
@@ -221,7 +221,7 @@ def overlay_masks(
|
|
221
221
|
|
222
222
|
|
223
223
|
def overlay_heat_map(
|
224
|
-
image: Union[str, Path, np.ndarray, ImageType],
|
224
|
+
image: Union[str, Path, np.ndarray, ImageType], heat_map: Dict, alpha: float = 0.8
|
225
225
|
) -> ImageType:
|
226
226
|
r"""Plots heat map on to an image.
|
227
227
|
|
@@ -238,14 +238,12 @@ def overlay_heat_map(
|
|
238
238
|
elif isinstance(image, np.ndarray):
|
239
239
|
image = Image.fromarray(image)
|
240
240
|
|
241
|
-
if "masks" not in masks:
|
241
|
+
if "heat_map" not in heat_map:
|
242
242
|
return image.convert("RGB")
|
243
243
|
|
244
|
-
# Only one heat map per image, so no need to loop through masks
|
245
244
|
image = image.convert("L")
|
246
|
-
|
247
|
-
|
248
|
-
mask = b64_to_pil(masks["masks"][0])
|
245
|
+
# Only one heat map per image, so no need to loop through masks
|
246
|
+
mask = Image.fromarray(heat_map["heat_map"][0])
|
249
247
|
|
250
248
|
overlay = Image.new("RGBA", mask.size)
|
251
249
|
odraw = ImageDraw.Draw(overlay)
|
@@ -11,6 +11,7 @@ from PIL import Image
|
|
11
11
|
from PIL.Image import Image as ImageType
|
12
12
|
|
13
13
|
from vision_agent.image_utils import (
|
14
|
+
b64_to_pil,
|
14
15
|
convert_to_b64,
|
15
16
|
denormalize_bbox,
|
16
17
|
get_image_size,
|
@@ -516,7 +517,9 @@ class ZeroShotCounting(Tool):
|
|
516
517
|
"image": image_b64,
|
517
518
|
"tool": "zero_shot_counting",
|
518
519
|
}
|
519
|
-
|
520
|
+
resp_data = _send_inference_request(data, "tools")
|
521
|
+
resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
|
522
|
+
return resp_data
|
520
523
|
|
521
524
|
|
522
525
|
class VisualPromptCounting(Tool):
|
@@ -585,7 +588,9 @@ class VisualPromptCounting(Tool):
|
|
585
588
|
"prompt": prompt,
|
586
589
|
"tool": "few_shot_counting",
|
587
590
|
}
|
588
|
-
|
591
|
+
resp_data = _send_inference_request(data, "tools")
|
592
|
+
resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
|
593
|
+
return resp_data
|
589
594
|
|
590
595
|
|
591
596
|
class VisualQuestionAnswering(Tool):
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|