vision_agent-0.2.63-py3-none-any.whl → vision_agent-0.2.65-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
vision_agent/agent/vision_agent.py CHANGED
@@ -7,8 +7,8 @@ import tempfile
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 
- from PIL import Image
  from langsmith import traceable
+ from PIL import Image
  from rich.console import Console
  from rich.style import Style
  from rich.syntax import Syntax
@@ -43,6 +43,8 @@ class DefaultImports:
 
  common_imports = [
  "from typing import *",
+ "from pillow_heif import register_heif_opener",
+ "register_heif_opener()",
  ]
 
  @staticmethod
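
Both the agent's generated-code imports here and vision_agent/tools/tools.py (below) now register pillow_heif's HEIF opener, so Pillow can decode .heic/.heif images. A minimal sketch of what that enables, assuming pillow-heif is installed; "photo.heic" is a hypothetical example file, not something from the package:

import numpy as np
from PIL import Image
from pillow_heif import register_heif_opener

register_heif_opener()  # after this call, PIL.Image.open can decode HEIF/HEIC files

frame = np.array(Image.open("photo.heic").convert("RGB"))  # hypothetical input file
print(frame.shape)
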
vision_agent/lmm/lmm.py CHANGED
@@ -224,10 +224,10 @@ class OpenAILMM(LMM):
  return lambda x: T.grounding_sam(params["prompt"], x)
 
  def generate_zero_shot_counter(self, question: str) -> Callable:
- return T.zero_shot_counting
+ return T.loca_zero_shot_counting
 
  def generate_image_qa_tool(self, question: str) -> Callable:
- return lambda x: T.image_question_answering(question, x)
+ return lambda x: T.git_vqa_v2(question, x)
 
 
  class AzureOpenAILMM(OpenAILMM):
vision_agent/tools/__init__.py CHANGED
@@ -7,25 +7,28 @@ from .tools import (
  TOOLS,
  TOOLS_DF,
  UTILITIES_DOCSTRING,
+ blip_image_caption,
  clip,
  closest_box_distance,
  closest_mask_distance,
  extract_frames,
  get_tool_documentation,
+ git_vqa_v2,
  grounding_dino,
  grounding_sam,
- image_caption,
- image_question_answering,
  load_image,
+ loca_visual_prompt_counting,
+ loca_zero_shot_counting,
  ocr,
  overlay_bounding_boxes,
  overlay_heat_map,
  overlay_segmentation_masks,
+ owl_v2,
  save_image,
  save_json,
  save_video,
- visual_prompt_counting,
- zero_shot_counting,
+ vit_image_classification,
+ vit_nsfw_classification,
  )
 
  __new_tools__ = [
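
Based only on the names in this hunk, downstream code that imports the renamed tools would need updates along these lines (a hedged migration sketch, not from the package docs):

from vision_agent.tools import (
    blip_image_caption,           # was image_caption in 0.2.63
    git_vqa_v2,                   # was image_question_answering
    loca_visual_prompt_counting,  # was visual_prompt_counting
    loca_zero_shot_counting,      # was zero_shot_counting
    owl_v2,                       # new in 0.2.65
    vit_image_classification,     # new in 0.2.65
    vit_nsfw_classification,      # new in 0.2.65
)
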
vision_agent/tools/tools.py CHANGED
@@ -13,6 +13,7 @@ import pandas as pd
  import requests
  from moviepy.editor import ImageSequenceClip
  from PIL import Image, ImageDraw, ImageFont
+ from pillow_heif import register_heif_opener  # type: ignore
 
  from vision_agent.tools.tool_utils import _send_inference_request
  from vision_agent.utils import extract_frames_from_video
@@ -26,6 +27,8 @@ from vision_agent.utils.image_utils import (
  rle_decode,
  )
 
+ register_heif_opener()
+
  COLORS = [
  (158, 218, 229),
  (219, 219, 141),
@@ -59,6 +62,7 @@ def grounding_dino(
  image: np.ndarray,
  box_threshold: float = 0.20,
  iou_threshold: float = 0.20,
+ model_size: str = "large",
  ) -> List[Dict[str, Any]]:
  """'grounding_dino' is a tool that can detect and count multiple objects given a text
  prompt such as category names or referring expressions. The categories in text prompt
@@ -72,6 +76,7 @@ def grounding_dino(
  to 0.20.
  iou_threshold (float, optional): The threshold for the Intersection over Union
  (IoU). Defaults to 0.20.
+ model_size (str, optional): The size of the model to use.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -90,10 +95,14 @@ def grounding_dino(
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
+ if model_size not in ["large", "tiny"]:
+ raise ValueError("model_size must be either 'large' or 'tiny'")
  request_data = {
  "prompt": prompt,
  "image": image_b64,
- "tool": "visual_grounding",
+ "tool": (
+ "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
+ ),
  "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
  }
  data: Dict[str, Any] = _send_inference_request(request_data, "tools")
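
grounding_dino now accepts a model_size argument that selects between the "visual_grounding" and "visual_grounding_tiny" backends and rejects any other value. A hedged usage sketch, not from the package docs; it calls the hosted tools endpoint, so network access and any required credentials are assumed, and the image is a placeholder:

import numpy as np
from vision_agent.tools import grounding_dino

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

# "large" is the default; any value other than "large" or "tiny" raises ValueError
detections = grounding_dino("car. person", image, model_size="tiny")
for det in detections:
    print(det["label"], det["score"], det["bbox"])  # bbox is normalized (xmin, ymin, xmax, ymax)
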
@@ -109,6 +118,62 @@ grounding_dino(
  return return_data
 
 
+ def owl_v2(
+ prompt: str,
+ image: np.ndarray,
+ box_threshold: float = 0.10,
+ iou_threshold: float = 0.10,
+ ) -> List[Dict[str, Any]]:
+ """'owl_v2' is a tool that can detect and count multiple objects given a text
+ prompt such as category names or referring expressions. The categories in text prompt
+ are separated by commas or periods. It returns a list of bounding boxes with
+ normalized coordinates, label names and associated probability scores.
+
+ Parameters:
+ prompt (str): The prompt to ground to the image.
+ image (np.ndarray): The image to ground the prompt to.
+ box_threshold (float, optional): The threshold for the box detection. Defaults
+ to 0.10.
+ iou_threshold (float, optional): The threshold for the Intersection over Union
+ (IoU). Defaults to 0.10.
+ model_size (str, optional): The size of the model to use.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box.
+
+ Example
+ -------
+ >>> owl_v2("car. dinosaur", image)
+ [
+ {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+ ]
+ """
+ image_size = image.shape[:2]
+ image_b64 = convert_to_b64(image)
+ request_data = {
+ "prompt": prompt,
+ "image": image_b64,
+ "tool": "open_vocab_detection",
+ "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+ }
+ data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+ return_data = []
+ for i in range(len(data["bboxes"])):
+ return_data.append(
+ {
+ "score": round(data["scores"][i], 2),
+ "label": data["labels"][i].strip(),
+ "bbox": normalize_bbox(data["bboxes"][i], image_size),
+ }
+ )
+ return return_data
+
+
  def grounding_sam(
  prompt: str,
  image: np.ndarray,
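
The new owl_v2 tool mirrors grounding_dino's interface but uses lower default thresholds (0.10) and the "open_vocab_detection" backend, and it replaces grounding_dino in the default TOOLS list (see the final tools.py hunk below). A hedged usage sketch under the same assumptions as the grounding_dino example above:

import numpy as np
from vision_agent.tools import owl_v2

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
detections = owl_v2("car. dinosaur", image)
# each entry: {"score": float, "label": str, "bbox": [xmin, ymin, xmax, ymax]}
# with coordinates normalized to [0, 1]
print(detections)
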
@@ -253,8 +318,8 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
  return ocr_results
 
 
- def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
- """'zero_shot_counting' is a tool that counts the dominant foreground object given
+ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+ """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
  an image and no other information about the content. It returns only the count of
  the objects in the image.
 
@@ -267,7 +332,7 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
  Example
  -------
- >>> zero_shot_counting(image)
+ >>> loca_zero_shot_counting(image)
  {'count': 45},
  """
 
@@ -281,10 +346,10 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
  return resp_data
 
 
- def visual_prompt_counting(
+ def loca_visual_prompt_counting(
  image: np.ndarray, visual_prompt: Dict[str, List[float]]
  ) -> Dict[str, Any]:
- """'visual_prompt_counting' is a tool that counts the dominant foreground object
+ """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
  given an image and a visual prompt which is a bounding box describing the object.
  It returns only the count of the objects in the image.
 
@@ -297,7 +362,7 @@ def visual_prompt_counting(
 
  Example
  -------
- >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+ >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
  {'count': 45},
  """
 
@@ -316,8 +381,8 @@ def visual_prompt_counting(
  return resp_data
 
 
- def image_question_answering(prompt: str, image: np.ndarray) -> str:
- """'image_question_answering_' is a tool that can answer questions about the visual
+ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
+ """'git_vqa_v2' is a tool that can answer questions about the visual
  contents of an image given a question and an image. It returns an answer to the
  question
 
@@ -331,7 +396,7 @@ def image_question_answering(prompt: str, image: np.ndarray) -> str:
 
  Example
  -------
- >>> image_question_answering('What is the cat doing ?', image)
+ >>> git_vqa_v2('What is the cat doing ?', image)
  'drinking milk'
  """
 
@@ -376,8 +441,62 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
  return resp_data
 
 
- def image_caption(image: np.ndarray) -> str:
- """'image_caption' is a tool that can caption an image based on its contents. It
+ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
+ """'vit_image_classification' is a tool that can classify an image. It returns a
+ list of classes and their probability scores based on image content.
+
+ Parameters:
+ image (np.ndarray): The image to classify or tag
+
+ Returns:
+ Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+ contains a list of labels and other a list of scores.
+
+ Example
+ -------
+ >>> vit_image_classification(image)
+ {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
+ """
+
+ image_b64 = convert_to_b64(image)
+ data = {
+ "image": image_b64,
+ "tool": "image_classification",
+ }
+ resp_data = _send_inference_request(data, "tools")
+ resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+ return resp_data
+
+
+ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
+ """'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+ It returns the predicted label and their probability scores based on image content.
+
+ Parameters:
+ image (np.ndarray): The image to classify or tag
+
+ Returns:
+ Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+ contains a list of labels and other a list of scores.
+
+ Example
+ -------
+ >>> vit_nsfw_classification(image)
+ {"labels": "normal", "scores": 0.68},
+ """
+
+ image_b64 = convert_to_b64(image)
+ data = {
+ "image": image_b64,
+ "tool": "nsfw_image_classification",
+ }
+ resp_data = _send_inference_request(data, "tools")
+ resp_data["scores"] = round(resp_data["scores"], 4)
+ return resp_data
+
+
+ def blip_image_caption(image: np.ndarray) -> str:
+ """'blip_image_caption' is a tool that can caption an image based on its contents. It
  returns a text describing the image.
 
  Parameters:
@@ -388,7 +507,7 @@ def image_caption(image: np.ndarray) -> str:
 
  Example
  -------
- >>> image_caption(image)
+ >>> blip_image_caption(image)
  'This image contains a cat sitting on a table with a bowl of milk.'
  """
 
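
The two new ViT classifiers and the renamed BLIP captioner all take a single np.ndarray image and return either a caption string or a labels/scores dictionary. A hedged usage sketch, with the same hosted-endpoint and placeholder-image assumptions as above:

import numpy as np
from vision_agent.tools import (
    blip_image_caption,
    vit_image_classification,
    vit_nsfw_classification,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

print(blip_image_caption(image))        # -> caption string
print(vit_image_classification(image))  # -> {"labels": [...], "scores": [...]}
print(vit_nsfw_classification(image))   # -> {"labels": "normal" or "nsfw", "scores": float}
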
@@ -543,7 +662,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
  """
  from IPython.display import display
 
- pil_image = Image.fromarray(image.astype(np.uint8))
+ pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
  display(pil_image)
  pil_image.save(file_path)
 
@@ -792,15 +911,17 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
 
 
  TOOLS = [
- grounding_dino,
+ owl_v2,
  grounding_sam,
  extract_frames,
  ocr,
  clip,
- zero_shot_counting,
- visual_prompt_counting,
- image_question_answering,
- image_caption,
+ vit_image_classification,
+ vit_nsfw_classification,
+ loca_zero_shot_counting,
+ loca_visual_prompt_counting,
+ git_vqa_v2,
+ blip_image_caption,
  closest_mask_distance,
  closest_box_distance,
  save_json,
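
The default registry now leads with owl_v2 instead of grounding_dino and exposes the renamed tools; grounding_dino itself remains importable, it is just no longer in TOOLS. A quick way to confirm what an agent can select, assuming TOOLS stays a plain list of functions as shown in this hunk:

from vision_agent.tools import TOOLS

print([tool.__name__ for tool in TOOLS])  # expect owl_v2, grounding_sam, ..., blip_image_caption, ...
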
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.63
+ Version: 0.2.65
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -21,6 +21,7 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
  Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
  Requires-Dist: rich (>=13.7.1,<14.0.0)
@@ -1,23 +1,23 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=IUwfbPMcT8X_rnXMLmI8gJ4ltsHy_XSs9eLiKURJxeY,81
  vision_agent/agent/agent.py,sha256=ZK-5lOtd9-eD9aWcXssJpnOyvZuO7_5hAmnb-6sWVe8,569
- vision_agent/agent/vision_agent.py,sha256=TVODnpLVlAtqnvSMUQ0wC5YyDxt2U9KRK5V13dxhUA4,25194
+ vision_agent/agent/vision_agent.py,sha256=HC63BP4jPiR4lJLEkKQ-zMV5C5JwjnuZvc7hVjjS2uk,25284
  vision_agent/agent/vision_agent_prompts.py,sha256=bMXdZYf6kbikHn__tCGrYE1QvXC88EmpMpM_97V6szA,8472
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/lmm/__init__.py,sha256=3ro5lCIoS3DgEghOy0SPFrEhYvFnWZpVC5S5kSnIx6A,57
- vision_agent/lmm/lmm.py,sha256=UDyGjMRG_CHhcyTnsmvowRE38zHJATy5cbg1UIbdIjs,8954
- vision_agent/tools/__init__.py,sha256=inKVLRUATQA9oi83l0NluC8Gm-LJU2-AjA6rL1j12Q8,1532
+ vision_agent/lmm/lmm.py,sha256=ihmLYL_291HnELyMtfFKTCnPWnmuoEH2DDFmc4ynMG8,8945
+ vision_agent/tools/__init__.py,sha256=aE1O8cMeLDPO50Sc-CuAQ_Akh0viz7vBxDcVeZNqsA0,1604
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
- vision_agent/tools/tools.py,sha256=o9ojTfhu8KCSXfW4UPUNOhmki6A-l3jtVi0rPEnELjc,26944
+ vision_agent/tools/tools.py,sha256=Qzwm_wu6KJh-3DSoNmZ4Lv8jCCNJMwKIPBFxxN6FmDo,31397
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
  vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
  vision_agent/utils/sim.py,sha256=ci6Eta73dDgLP1Ajtknbgmf1g8aAvBHqlVQvBuLMKXQ,4427
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
- vision_agent-0.2.63.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.63.dist-info/METADATA,sha256=clb-wEt_PcXS2I27fGICOau8hbsrkQLuhDVD0pnH1QQ,8317
- vision_agent-0.2.63.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.63.dist-info/RECORD,,
+ vision_agent-0.2.65.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.65.dist-info/METADATA,sha256=MnlqbmIs4PRO4Y1qaR2abmD0RueZnIYUEnGGcuJ1wHA,8363
+ vision_agent-0.2.65.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.65.dist-info/RECORD,,