vision-agent 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +20 -2
- vision_agent/image_utils.py +3 -1
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +99 -0
- {vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/METADATA +1 -1
- {vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/RECORD +8 -8
- {vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/LICENSE +0 -0
- {vision_agent-0.1.3.dist-info → vision_agent-0.1.4.dist-info}/WHEEL +0 -0
@@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
|
|
365
365
|
"grounding_sam_",
|
366
366
|
"grounding_dino_",
|
367
367
|
"extract_frames_",
|
368
|
+
"dinov_",
|
368
369
|
]:
|
369
370
|
continue
|
370
371
|
|
@@ -444,6 +445,7 @@ class VisionAgent(Agent):
|
|
444
445
|
self,
|
445
446
|
input: Union[List[Dict[str, str]], str],
|
446
447
|
image: Optional[Union[str, Path]] = None,
|
448
|
+
reference_data: Optional[Dict[str, str]] = None,
|
447
449
|
visualize_output: Optional[bool] = False,
|
448
450
|
) -> str:
|
449
451
|
"""Invoke the vision agent.
|
@@ -458,7 +460,12 @@ class VisionAgent(Agent):
|
|
458
460
|
"""
|
459
461
|
if isinstance(input, str):
|
460
462
|
input = [{"role": "user", "content": input}]
|
461
|
-
return self.chat(
|
463
|
+
return self.chat(
|
464
|
+
input,
|
465
|
+
image=image,
|
466
|
+
visualize_output=visualize_output,
|
467
|
+
reference_data=reference_data,
|
468
|
+
)
|
462
469
|
|
463
470
|
def log_progress(self, description: str) -> None:
|
464
471
|
_LOGGER.info(description)
|
@@ -469,11 +476,18 @@ class VisionAgent(Agent):
|
|
469
476
|
self,
|
470
477
|
chat: List[Dict[str, str]],
|
471
478
|
image: Optional[Union[str, Path]] = None,
|
479
|
+
reference_data: Optional[Dict[str, str]] = None,
|
472
480
|
visualize_output: Optional[bool] = False,
|
473
481
|
) -> Tuple[str, List[Dict]]:
|
474
482
|
question = chat[0]["content"]
|
475
483
|
if image:
|
476
484
|
question += f" Image name: {image}"
|
485
|
+
if reference_data:
|
486
|
+
if not ("image" in reference_data and "mask" in reference_data):
|
487
|
+
raise ValueError(
|
488
|
+
f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
|
489
|
+
)
|
490
|
+
question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"
|
477
491
|
|
478
492
|
reflections = ""
|
479
493
|
final_answer = ""
|
@@ -555,10 +569,14 @@ class VisionAgent(Agent):
|
|
555
569
|
self,
|
556
570
|
chat: List[Dict[str, str]],
|
557
571
|
image: Optional[Union[str, Path]] = None,
|
572
|
+
reference_data: Optional[Dict[str, str]] = None,
|
558
573
|
visualize_output: Optional[bool] = False,
|
559
574
|
) -> str:
|
560
575
|
answer, _ = self.chat_with_workflow(
|
561
|
-
chat,
|
576
|
+
chat,
|
577
|
+
image=image,
|
578
|
+
visualize_output=visualize_output,
|
579
|
+
reference_data=reference_data,
|
562
580
|
)
|
563
581
|
return answer
|
564
582
|
|
vision_agent/image_utils.py
CHANGED
@@ -103,7 +103,9 @@ def overlay_bboxes(
|
|
103
103
|
elif isinstance(image, np.ndarray):
|
104
104
|
image = Image.fromarray(image)
|
105
105
|
|
106
|
-
color = {
|
106
|
+
color = {
|
107
|
+
label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
|
108
|
+
}
|
107
109
|
|
108
110
|
width, height = image.size
|
109
111
|
fontsize = max(12, int(min(width, height) / 40))
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -372,6 +372,104 @@ class GroundingSAM(Tool):
|
|
372
372
|
return ret_pred
|
373
373
|
|
374
374
|
|
375
|
+
class DINOv(Tool):
    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.

    Example
    -------
        >>> import vision_agent as va
        >>> t = va.tools.DINOv()
        >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg")
        {'scores': [0.512, 0.212],
         'masks': [array([[0, 0, 0, ..., 0, 0, 0],
            ...,
            [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
         array([[0, 0, 0, ..., 0, 0, 0],
            ...,
            [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}
    """

    name = "dinov_"
    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
    usage = {
        "required_parameters": [
            {"name": "prompt", "type": "List[Dict[str, str]]"},
            {"name": "image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
                "parameters": {
                    "prompt": [
                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
                    ],
                    "image": "input.jpg",
                },
            },
            {
                # The scenario text must name the same files in the same roles
                # as "parameters" below; the original had image and mask swapped.
                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png",
                "parameters": {
                    "prompt": [
                        {"mask": "mask.png", "image": "background.png"},
                    ],
                    "image": "original.jpg",
                },
            },
        ],
    }

    def __call__(
        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
    ) -> Dict:
        """Invoke the DINOv model.

        Parameters:
            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
                The dictionaries passed in are not modified.
            image: the input image to segment.

        Returns:
            The dictionary returned by the inference request. When it contains
            'bboxes', each box is passed through normalize_bbox with 'mask_shape'.
            When it contains 'masks', each mask is decoded with rle_decode using
            'mask_shape' and a 'labels' list with one "visual prompt" entry per
            mask is added.
        """
        image_b64 = convert_to_b64(image)
        # Build fresh dicts instead of overwriting the caller's entries, so the
        # same prompt list (still holding file paths) can be reused afterwards.
        encoded_prompt = [
            {
                **p,
                "mask": convert_to_b64(p["mask"]),
                "image": convert_to_b64(p["image"]),
            }
            for p in prompt
        ]
        request_data = {
            "prompt": encoded_prompt,
            "image": image_b64,
            "tool": "dinov",
        }
        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
        if "bboxes" in data:
            data["bboxes"] = [
                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
            ]
        if "masks" in data:
            data["masks"] = [
                rle_decode(mask_rle=mask, shape=data["mask_shape"])
                for mask in data["masks"]
            ]
            data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
        return data
|
454
|
+
|
455
|
+
|
456
|
+
class AgentDINOv(DINOv):
    r"""AgentDINOv runs DINOv and replaces each returned mask array with the
    name of a PNG file the mask was written to.
    """

    def __call__(
        self,
        prompt: List[Dict[str, str]],
        image: Union[str, ImageType],
    ) -> Dict:
        """Invoke DINOv and save every returned mask to its own ``.mask.png`` file.

        Parameters:
            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
            image: the input image to segment.

        Returns:
            The dictionary produced by DINOv with 'masks' replaced by a list of
            file names (an empty list when DINOv returned no 'masks' key).
        """
        rets = super().__call__(prompt, image)
        mask_files = []
        # The parent only sets 'masks' when the response carries it, so do not
        # assume the key is present.
        for mask in rets.get("masks", []):
            # Write straight into the temp file that is created, rather than
            # creating an empty "*.png" and saving to a sibling "*.mask.png";
            # the former left one unused file behind per mask.
            # delete=False: the file must outlive this call so the returned
            # name stays usable.
            with tempfile.NamedTemporaryFile(suffix=".mask.png", delete=False) as tmp:
                Image.fromarray(mask * 255).save(tmp, format="PNG")
            mask_files.append(str(tmp.name))
        rets["masks"] = mask_files
        return rets
|
471
|
+
|
472
|
+
|
375
473
|
class AgentGroundingSAM(GroundingSAM):
|
376
474
|
r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
|
377
475
|
returns the file name. This makes it easier for agents to use.
|
@@ -652,6 +750,7 @@ TOOLS = {
|
|
652
750
|
ImageCaption,
|
653
751
|
GroundingDINO,
|
654
752
|
AgentGroundingSAM,
|
753
|
+
AgentDINOv,
|
655
754
|
ExtractFrames,
|
656
755
|
Crop,
|
657
756
|
BboxArea,
|
@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
|
|
5
5
|
vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
|
6
6
|
vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
|
7
7
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
8
|
-
vision_agent/agent/vision_agent.py,sha256=
|
8
|
+
vision_agent/agent/vision_agent.py,sha256=QWIirRBB3ZPg3figWcf8-g9ltFydM1BDn75LbXWbep0,22735
|
9
9
|
vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
|
10
10
|
vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
|
11
11
|
vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
|
@@ -13,17 +13,17 @@ vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,
|
|
13
13
|
vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
|
14
14
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
16
|
-
vision_agent/image_utils.py,sha256=
|
16
|
+
vision_agent/image_utils.py,sha256=qRN_Y1XXBm9EL6V53OZUq21h0spIa1J6X9YDbe6B87o,4805
|
17
17
|
vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
|
18
18
|
vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
|
19
19
|
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
20
20
|
vision_agent/lmm/lmm.py,sha256=1E7e_S_0fOKnf6mSsEdkXvsIjGmhBGl5XW4By2jvhbY,10045
|
21
|
-
vision_agent/tools/__init__.py,sha256=
|
21
|
+
vision_agent/tools/__init__.py,sha256=dkzk9amNzTEKULMB1xRJspqEGpzNPGuccWeXrv1xI0U,280
|
22
22
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=ybhCyutEGzHPKuR0Cu--Nb--KubjYvyzLEzVQYzIMTw,29148
|
24
24
|
vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
|
25
25
|
vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
|
26
|
-
vision_agent-0.1.
|
27
|
-
vision_agent-0.1.
|
28
|
-
vision_agent-0.1.
|
29
|
-
vision_agent-0.1.
|
26
|
+
vision_agent-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
27
|
+
vision_agent-0.1.4.dist-info/METADATA,sha256=FyBYGPHgC0uV7uy7wph8yvdQpEWSACnGR96y6Jt-E6A,6233
|
28
|
+
vision_agent-0.1.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
29
|
+
vision_agent-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|