vision-agent 0.2.24__py3-none-any.whl → 0.2.25__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -8,7 +8,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
+_LND_API_URL = "https://api.staging.landing.ai/v1/agent"
 
 
 def _send_inference_request(
@@ -53,7 +53,7 @@ class NoOp(Tool):
 
 
 class CLIP(Tool):
-    r"""CLIP is a tool that can classify or tag any image given a set if input classes
+    r"""CLIP is a tool that can classify or tag any image given a set of input classes
     or tags.
 
     Example
@@ -15,7 +15,14 @@ from scipy.spatial import distance  # type: ignore
 
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
-from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    normalize_bbox,
+    rle_decode,
+    b64_to_pil,
+    get_image_size,
+    denormalize_bbox,
+)
 
 COLORS = [
     (158, 218, 229),
@@ -49,7 +56,7 @@ def grounding_dino(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.75,
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_dino' is a tool that can detect and count objects given a text prompt
     such as category names or referring expressions. It returns a list and count of
@@ -61,12 +68,13 @@ def grounding_dino(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.75.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
             bounding box of the detected objects with normalized coordinates
-            (x1, y1, x2, y2).
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
 
     Example
     -------
@@ -77,7 +85,7 @@ def grounding_dino(
     ]
     """
    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(Image.fromarray(image))
+    image_b64 = convert_to_b64(image)
    request_data = {
        "prompt": prompt,
        "image": image_b64,
@@ -101,7 +109,7 @@ def grounding_sam(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.75,
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_sam' is a tool that can detect and segment objects given a text
     prompt such as category names or referring expressions. It returns a list of
@@ -113,12 +121,15 @@ def grounding_sam(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.75.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
             bounding box, and mask of the detected objects with normalized coordinates
-            (x1, y1, x2, y2).
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
 
     Example
     -------
@@ -137,7 +148,7 @@ def grounding_sam(
     ]
     """
    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(Image.fromarray(image))
+    image_b64 = convert_to_b64(image)
    request_data = {
        "prompt": prompt,
        "image": image_b64,
@@ -235,6 +246,152 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     return output
 
 
+def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+    """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> zero_shot_counting(image)
+    {'count': 45},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "zero_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def visual_prompt_counting(
+    image: np.ndarray, visual_prompt: Dict[str, List[float]]
+) -> Dict[str, Any]:
+    """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+    {'count': 45},
+
+    """
+
+    image_size = get_image_size(image)
+    bbox = visual_prompt["bbox"]
+    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
+    image_b64 = convert_to_b64(image)
+
+    data = {
+        "image": image_b64,
+        "prompt": bbox_str,
+        "tool": "few_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def image_question_answering(image: np.ndarray, prompt: str) -> str:
+    """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image.
+    It returns an answer to the question
+
+    Parameters:
+        image (np.ndarray): The reference image used for the question
+        prompt (str): The question about the image
+
+    Returns:
+        str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}.
+
+    Example
+    -------
+    >>> image_question_answering(image, 'What is the cat doing ?')
+    'drinking milk'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
+    """'clip' is a tool that can classify an image given a list of input classes or tags.
+    It returns the same list of the input classes along with their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        classes (List[str]): The list of classes or tags that is associated with the image
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores.
+
+    Example
+    -------
+    >>> clip(image, ['dog', 'cat', 'bird'])
+    {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "prompt": ",".join(classes),
+        "image": image_b64,
+        "tool": "closed_set_image_classification",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+    return resp_data
+
+
+def image_caption(image: np.ndarray) -> str:
+    """'image_caption' is a tool that can caption an image based on its contents.
+    It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> image_caption(image)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "image_captioning",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
 
@@ -504,6 +661,11 @@ TOOLS = [
     grounding_sam,
     extract_frames,
     ocr,
+    clip,
+    zero_shot_counting,
+    visual_prompt_counting,
+    image_question_answering,
+    image_caption,
     closest_mask_distance,
     closest_box_distance,
     save_json,
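
For orientation, here is a minimal sketch of how the tools added to tools_v2.py in this release might be called directly. It assumes an RGB image loaded as a numpy array and a valid LANDINGAI_API_KEY in the environment; the import path is inferred from the RECORD entries below, and the image path is hypothetical.

import numpy as np
from PIL import Image

from vision_agent.tools.tools_v2 import clip, image_caption, zero_shot_counting

# Load any RGB image as a numpy array (path is illustrative only).
image = np.array(Image.open("shelf.jpg"))

print(image_caption(image))                    # short text description of the image
print(clip(image, ["bottle", "can", "box"]))   # {"labels": [...], "scores": [...]}
print(zero_shot_counting(image)["count"])      # count of the dominant foreground object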
@@ -104,15 +104,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     """
     if data is None:
         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
+
     if isinstance(data, (str, Path)):
         data = Image.open(data)
+    elif isinstance(data, np.ndarray):
+        data = Image.fromarray(data)
+
     if isinstance(data, Image.Image):
         buffer = BytesIO()
         data.convert("RGB").save(buffer, format="PNG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
-        arr_bytes = data.tobytes()
-        return base64.b64encode(arr_bytes).decode("utf-8")
+        raise ValueError(
+            f"Invalid input image: {data}. Input image must be a PIL Image or a numpy array."
+        )
 
 
 def denormalize_bbox(
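
A short sketch of the revised convert_to_b64 behaviour, assuming it is imported from vision_agent.utils.image_utils as listed in the RECORD: numpy arrays are now routed through PIL and PNG-encoded like every other input, while unsupported types raise a ValueError instead of being base64-encoded as raw bytes.

import numpy as np

from vision_agent.utils.image_utils import convert_to_b64

frame = np.zeros((64, 64, 3), dtype=np.uint8)   # any HxWx3 uint8 array
b64_png = convert_to_b64(frame)                 # base64 of the PNG-encoded image

try:
    convert_to_b64(1234)  # type: ignore  # not a path, numpy array, or PIL image
except ValueError as err:
    print(err)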
@@ -12,7 +12,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="land_sk_PCRPYKqB3cq0JWGY83hjEk33SWSDOwdNoyUjTgCDMZO4NxeCXW",
+        default="land_sk_IJrojHarPXRjqDj1Fng76mX7yCbzVm1s5rZYxaNXu5v0cNLn0w",
         alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.24
+Version: 0.2.25
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -19,16 +19,16 @@ vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
 vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
-vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
-vision_agent/tools/tools_v2.py,sha256=iO-ochdLq73xdCRUY1MKixHyVAk6UIUrY648MtjlHno,16201
+vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
+vision_agent/tools/tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
+vision_agent/tools/tools_v2.py,sha256=Tdam-cWBI4ipXWwGyxim-SK07zP97_hcdUtYd1a4CnI,21404
 vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
-vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
-vision_agent/utils/type_defs.py,sha256=ijFAd7D0y8JOg0Ib063rqsDcrFtZfQbdqpaRPTmp2hY,1792
+vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.24.dist-info/METADATA,sha256=G4bq69V2-eRKNSWwx0skCfU60iiCUQf5l37B9O49Bkk,9212
-vision_agent-0.2.24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.24.dist-info/RECORD,,
+vision_agent-0.2.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.25.dist-info/METADATA,sha256=5bycdwOp0pnRpUBQo_JM1c1Abq2fmWJcVYE_7YgtoUY,9212
+vision_agent-0.2.25.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.25.dist-info/RECORD,,