vision-agent 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
365
365
  "grounding_sam_",
366
366
  "grounding_dino_",
367
367
  "extract_frames_",
368
+ "dinov_",
368
369
  ]:
369
370
  continue
370
371
 
@@ -444,6 +445,7 @@ class VisionAgent(Agent):
444
445
  self,
445
446
  input: Union[List[Dict[str, str]], str],
446
447
  image: Optional[Union[str, Path]] = None,
448
+ reference_data: Optional[Dict[str, str]] = None,
447
449
  visualize_output: Optional[bool] = False,
448
450
  ) -> str:
449
451
  """Invoke the vision agent.
@@ -458,7 +460,12 @@ class VisionAgent(Agent):
458
460
  """
459
461
  if isinstance(input, str):
460
462
  input = [{"role": "user", "content": input}]
461
- return self.chat(input, image=image, visualize_output=visualize_output)
463
+ return self.chat(
464
+ input,
465
+ image=image,
466
+ visualize_output=visualize_output,
467
+ reference_data=reference_data,
468
+ )
462
469
 
463
470
  def log_progress(self, description: str) -> None:
464
471
  _LOGGER.info(description)
@@ -469,11 +476,18 @@ class VisionAgent(Agent):
469
476
  self,
470
477
  chat: List[Dict[str, str]],
471
478
  image: Optional[Union[str, Path]] = None,
479
+ reference_data: Optional[Dict[str, str]] = None,
472
480
  visualize_output: Optional[bool] = False,
473
481
  ) -> Tuple[str, List[Dict]]:
474
482
  question = chat[0]["content"]
475
483
  if image:
476
484
  question += f" Image name: {image}"
485
+ if reference_data:
486
+ if not ("image" in reference_data and "mask" in reference_data):
487
+ raise ValueError(
488
+ f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
489
+ )
490
+ question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"
477
491
 
478
492
  reflections = ""
479
493
  final_answer = ""
@@ -555,10 +569,14 @@ class VisionAgent(Agent):
555
569
  self,
556
570
  chat: List[Dict[str, str]],
557
571
  image: Optional[Union[str, Path]] = None,
572
+ reference_data: Optional[Dict[str, str]] = None,
558
573
  visualize_output: Optional[bool] = False,
559
574
  ) -> str:
560
575
  answer, _ = self.chat_with_workflow(
561
- chat, image=image, visualize_output=visualize_output
576
+ chat,
577
+ image=image,
578
+ visualize_output=visualize_output,
579
+ reference_data=reference_data,
562
580
  )
563
581
  return answer
564
582
 
@@ -103,7 +103,9 @@ def overlay_bboxes(
103
103
  elif isinstance(image, np.ndarray):
104
104
  image = Image.fromarray(image)
105
105
 
106
- color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
106
+ color = {
107
+ label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
108
+ }
107
109
 
108
110
  width, height = image.size
109
111
  fontsize = max(12, int(min(width, height) / 40))
@@ -6,6 +6,7 @@ from .tools import ( # Counter,
6
6
  BboxIoU,
7
7
  BoxDistance,
8
8
  Crop,
9
+ DINOv,
9
10
  ExtractFrames,
10
11
  GroundingDINO,
11
12
  GroundingSAM,
@@ -372,6 +372,104 @@ class GroundingSAM(Tool):
372
372
  return ret_pred
373
373
 
374
374
 
375
class DINOv(Tool):
    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.

    Each visual prompt pairs a reference image with a mask marking the object
    of interest in that reference image; the tool then looks for similar
    objects in the target image.

    Example
    -------
        >>> import vision_agent as va
        >>> t = va.tools.DINOv()
        >>> t(prompt=[{"mask": "balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg")
        {'scores': [0.512, 0.212],
         'masks': [array([[0, 0, 0, ..., 0, 0, 0],
                          ...,
                          [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                   array([[0, 0, 0, ..., 0, 0, 0],
                          ...,
                          [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)],
         'labels': ['visual prompt', 'visual prompt']}
    """

    name = "dinov_"
    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
    usage = {
        "required_parameters": [
            {"name": "prompt", "type": "List[Dict[str, str]]"},
            {"name": "image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
                "parameters": {
                    "prompt": [
                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
                    ],
                    "image": "input.jpg",
                },
            },
            {
                # The "Reference image"/"Reference mask" labels in the scenario
                # must line up with the "image"/"mask" keys below; the scenario
                # previously had the two file names swapped.
                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png",
                "parameters": {
                    "prompt": [
                        {"mask": "mask.png", "image": "background.png"},
                    ],
                    "image": "original.jpg",
                },
            },
        ],
    }

    def __call__(
        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
    ) -> Dict:
        """Invoke the DINOv model.

        Parameters:
            prompt: a list of visual prompts in the form of
                {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
                The list and its dictionaries are not modified.
            image: the input image to segment.

        Returns:
            The inference response dictionary. When the response contains
            'masks' they are decoded with `rle_decode` using 'mask_shape';
            when it contains 'bboxes' they are passed through
            `normalize_bbox` with the same shape. A 'labels' list holding
            one "visual prompt" entry per mask is always added. 'scores' is
            expected to come from the service (not set here).

        Raises:
            KeyError: if a prompt entry lacks 'mask' or 'image', or if the
                response has 'masks'/'bboxes' but no 'mask_shape'.
        """
        image_b64 = convert_to_b64(image)
        # Build fresh dicts rather than overwriting the caller's entries, so
        # the same prompt list (still holding file paths) can be reused.
        encoded_prompt = [
            {
                **entry,
                "mask": convert_to_b64(entry["mask"]),
                "image": convert_to_b64(entry["image"]),
            }
            for entry in prompt
        ]
        request_data = {
            "prompt": encoded_prompt,
            "image": image_b64,
            "tool": "dinov",
        }
        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
        if "bboxes" in data:
            data["bboxes"] = [
                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
            ]
        if "masks" in data:
            data["masks"] = [
                rle_decode(mask_rle=mask, shape=data["mask_shape"])
                for mask in data["masks"]
            ]
        # Tolerate a response without masks instead of raising KeyError.
        data["labels"] = ["visual prompt"] * len(data.get("masks", []))
        return data
454
+
455
+
456
class AgentDINOv(DINOv):
    r"""AgentDINOv is the same as DINOv but it saves the masks as files and
    returns the file names instead of the mask arrays.
    """

    def __call__(
        self,
        prompt: List[Dict[str, str]],
        image: Union[str, ImageType],
    ) -> Dict:
        """Invoke DINOv and replace each returned mask with a PNG file path.

        Parameters:
            prompt: a list of visual prompts in the form of
                {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
            image: the input image to segment.

        Returns:
            The dictionary returned by `DINOv.__call__`, with 'masks'
            replaced by a list of paths to temporary '*.mask.png' files
            (one per mask, empty when no masks were returned). The files
            are not deleted by this tool.
        """
        rets = super().__call__(prompt, image)
        mask_files = []
        # The parent treats 'masks' as optional, so do not index it blindly.
        for mask in rets.get("masks", []):
            # Write straight into the temp file that was just created.
            # Deriving a second '<name>.mask.png' path from a '.png' temp file
            # left one empty, never-removed file behind for every mask.
            with tempfile.NamedTemporaryFile(
                suffix=".mask.png", delete=False
            ) as tmp:
                # Scaling by 255 assumes masks hold 0/1 values, per the
                # parent's example output — TODO confirm.
                Image.fromarray(mask * 255).save(tmp, format="PNG")
                mask_files.append(tmp.name)
        rets["masks"] = mask_files
        return rets
471
+
472
+
375
473
  class AgentGroundingSAM(GroundingSAM):
376
474
  r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
377
475
  returns the file name. This makes it easier for agents to use.
@@ -652,6 +750,7 @@ TOOLS = {
652
750
  ImageCaption,
653
751
  GroundingDINO,
654
752
  AgentGroundingSAM,
753
+ AgentDINOv,
655
754
  ExtractFrames,
656
755
  Crop,
657
756
  BboxArea,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
5
5
  vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
6
6
  vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
7
7
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
8
- vision_agent/agent/vision_agent.py,sha256=4-milD0iSY_vKdpAIctba04Ak_In5tMBE8gATdaGIr0,22019
8
+ vision_agent/agent/vision_agent.py,sha256=QWIirRBB3ZPg3figWcf8-g9ltFydM1BDn75LbXWbep0,22735
9
9
  vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
10
10
  vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
11
11
  vision_agent/data/data.py,sha256=Z2l76OrT0GgyuN52OeJqDitUcP0q1rhfdXd1of3GsVo,5128
@@ -13,17 +13,17 @@ vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,
13
13
  vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
14
14
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
16
- vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
16
+ vision_agent/image_utils.py,sha256=qRN_Y1XXBm9EL6V53OZUq21h0spIa1J6X9YDbe6B87o,4805
17
17
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
18
18
  vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
19
19
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
20
20
  vision_agent/lmm/lmm.py,sha256=1E7e_S_0fOKnf6mSsEdkXvsIjGmhBGl5XW4By2jvhbY,10045
21
- vision_agent/tools/__init__.py,sha256=lKv90gLu-mNp4uyGtJ8AUG-73xKwFEugZpe0atpsscA,269
21
+ vision_agent/tools/__init__.py,sha256=dkzk9amNzTEKULMB1xRJspqEGpzNPGuccWeXrv1xI0U,280
22
22
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
23
- vision_agent/tools/tools.py,sha256=EK9HauKZ1gq795wBZNER6-8PiDTNZwJ1sXYhDeplDZ0,25410
23
+ vision_agent/tools/tools.py,sha256=ybhCyutEGzHPKuR0Cu--Nb--KubjYvyzLEzVQYzIMTw,29148
24
24
  vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
25
25
  vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
26
- vision_agent-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
- vision_agent-0.1.3.dist-info/METADATA,sha256=iBoN2GBvALl6XxhxRo4o9WaqLgI-UAobSymuZ1RHd9o,6233
28
- vision_agent-0.1.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
- vision_agent-0.1.3.dist-info/RECORD,,
26
+ vision_agent-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ vision_agent-0.1.4.dist-info/METADATA,sha256=FyBYGPHgC0uV7uy7wph8yvdQpEWSACnGR96y6Jt-E6A,6233
28
+ vision_agent-0.1.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
+ vision_agent-0.1.4.dist-info/RECORD,,