vision-agent 0.0.52__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

vision_agent/agent/vision_agent.py

@@ -351,7 +351,7 @@ class VisionAgent(Agent):
         task_model: Optional[Union[LLM, LMM]] = None,
         answer_model: Optional[Union[LLM, LMM]] = None,
         reflect_model: Optional[Union[LLM, LMM]] = None,
-        max_retries: int = 3,
+        max_retries: int = 2,
         verbose: bool = False,
         report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
@@ -476,7 +476,7 @@ class VisionAgent(Agent):
             reflections += "\n" + reflection
         # '<END>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
-            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</<ANSWER>"
+            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
         )
 
         if visualize_output:

vision_agent/tools/__init__.py

@@ -9,6 +9,7 @@ from .tools import (
     ExtractFrames,
     GroundingDINO,
     GroundingSAM,
+    ImageCaption,
     SegArea,
     SegIoU,
     Tool,

vision_agent/tools/tools.py

@@ -12,8 +12,11 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import convert_to_b64, get_image_size
 from vision_agent.tools.video import extract_frames_from_video
+from vision_agent.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
+_LND_API_KEY = LandingaiAPIKey().api_key
+_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
 
 
 def normalize_bbox(
@@ -80,8 +83,6 @@ class CLIP(Tool):
     [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "clip_"
     description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
     usage = {
@@ -125,23 +126,63 @@ class CLIP(Tool):
             "image": image_b64,
             "tool": "closed_set_image_classification",
         }
-        res = requests.post(
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-
-        resp_json["data"]["scores"] = [
-            round(prob, 4) for prob in resp_json["data"]["scores"]
-        ]
+        resp_data = _send_inference_request(data, "tools")
+        resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+        return resp_data
+
+
+class ImageCaption(Tool):
+    r"""ImageCaption is a tool that can caption an image based on its contents
+    or tags.
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> caption = va.tools.ImageCaption()
+        >>> caption("image1.jpg")
+        {'text': ['a box of orange and white socks']}
+    """
+
+    name = "image_caption_"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you describe this image ? Image name: cat.jpg",
+                "parameters": {"image": "cat.jpg"},
+            },
+            {
+                "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
+                "parameters": {"image": "cat_dog.jpg"},
+            },
+            {
+                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
+                "parameters": {
+                    "image": "shirts.jpg",
+                },
+            },
+        ],
+    }
 
-        return resp_json["data"]  # type: ignore
+    # TODO: Add support for input multiple images, which aligns with the output type.
+    def __call__(self, image: Union[str, ImageType]) -> Dict:
+        """Invoke the Image captioning model.
+
+        Parameters:
+            image: the input image to caption.
+
+        Returns:
+            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+        """
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "tool": "image_captioning",
+        }
+        return _send_inference_request(data, "tools")
 
 
 class GroundingDINO(Tool):
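
Aside: the hunk above both routes CLIP through the shared _send_inference_request helper (added at the end of this file's diff) and introduces the new ImageCaption tool. A minimal usage sketch, lifted from the tool's own docstring, assuming a configured LandingAI API key and a hypothetical local file image1.jpg:

    import vision_agent as va

    caption = va.tools.ImageCaption()
    result = caption("image1.jpg")
    # Per the docstring, the result looks like: {'text': ['a box of orange and white socks']}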

@@ -158,8 +199,6 @@ class GroundingDINO(Tool):
     'scores': [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and associated probability scores."
     usage = {
@@ -222,24 +261,13 @@ class GroundingDINO(Tool):
             "tool": "visual_grounding",
             "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
-        res = requests.post(
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=request_data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-        data: Dict[str, Any] = resp_json["data"]
+        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
         if "bboxes" in data:
             data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
         if "scores" in data:
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
-            data["labels"] = [label for label in data["labels"]]
+            data["labels"] = list(data["labels"])
         data["size"] = (image_size[1], image_size[0])
         return data
 
@@ -267,8 +295,6 @@ class GroundingSAM(Tool):
     [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
    name = "grounding_sam_"
     description = "'grounding_sam_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
@@ -331,18 +357,7 @@ class GroundingSAM(Tool):
             "tool": "visual_grounding_segment",
             "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
-        res = requests.post(
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=request_data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-        data: Dict[str, Any] = resp_json["data"]
+        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
         ret_pred: Dict[str, List] = {"labels": [], "bboxes": [], "masks": []}
         if "bboxes" in data:
             ret_pred["bboxes"] = [
@@ -631,6 +646,7 @@ TOOLS = {
         [
             NoOp,
             CLIP,
+            ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
             ExtractFrames,
@@ -645,3 +661,20 @@ TOOLS = {
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
 }
+
+
+def _send_inference_request(
+    payload: Dict[str, Any], endpoint_name: str
+) -> Dict[str, Any]:
+    res = requests.post(
+        f"{_LND_API_URL}/model/{endpoint_name}",
+        headers={
+            "Content-Type": "application/json",
+            "apikey": _LND_API_KEY,
+        },
+        json=payload,
+    )
+    if res.status_code != 200:
+        _LOGGER.error(f"Request failed: {res.text}")
+        raise ValueError(f"Request failed: {res.text}")
+    return res.json()["data"]  # type: ignore
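
This new helper centralizes the HTTP transport that each tool previously duplicated: it posts the JSON payload to the hosted endpoint with the apikey header and raises ValueError on any non-200 response. A sketch of how a future tool's __call__ could build on it, assuming convert_to_b64 from vision_agent.image_utils; the "some_tool" payload name is hypothetical:

    def __call__(self, image: Union[str, ImageType]) -> Dict:
        # Encode the image once; transport and error handling live in _send_inference_request.
        image_b64 = convert_to_b64(image)
        data = {"image": image_b64, "tool": "some_tool"}  # hypothetical tool name
        return _send_inference_request(data, "tools")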

vision_agent/type_defs.py (new file)

@@ -0,0 +1,48 @@
+from pydantic import Field, field_validator
+from pydantic_settings import BaseSettings
+
+
+class LandingaiAPIKey(BaseSettings):
+    """The API key of a user in a particular organization in LandingLens.
+    It supports loading from environment variables or .env files.
+    The supported name of the environment variables are (case-insensitive):
+    - LANDINGAI_API_KEY
+
+    Environment variables will always take priority over values loaded from a dotenv file.
+    """
+
+    api_key: str = Field(
+        default="land_sk_hw34v3tyEc35OAhP8F7hnGnrDv2C8hD2ycMyq0aMkVS1H40D22",
+        alias="LANDINGAI_API_KEY",
+        description="The API key of LandingAI.",
+    )
+
+    @field_validator("api_key")
+    @classmethod
+    def is_api_key_valid(cls, key: str) -> str:
+        """Check if the API key is a v2 key."""
+        if not key:
+            raise InvalidApiKeyError(f"LandingAI API key is required, but it's {key}")
+        if not key.startswith("land_sk_"):
+            raise InvalidApiKeyError(
+                f"LandingAI API key (v2) must start with 'land_sk_' prefix, but it's {key}. See https://support.landing.ai/docs/api-key for more information."
+            )
+        return key
+
+    class Config:
+        env_file = ".env"
+        env_prefix = "landingai_"
+        case_sensitive = False
+        extra = "ignore"
+
+
+class InvalidApiKeyError(Exception):
+    """Exception raised when the an invalid API key is provided. This error could be raised from any SDK code, not limited to a HTTP client."""
+
+    def __init__(self, message: str):
+        self.message = f"""{message}
+For more information, see https://landing-ai.github.io/landingai-python/landingai.html#manage-api-credentials"""
+        super().__init__(self.message)
+
+    def __str__(self) -> str:
+        return self.message
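
Because LandingaiAPIKey is a pydantic-settings model, the bundled default key can be overridden through the environment or a .env file without code changes. A minimal sketch with a hypothetical key value; per the validator the key must carry the land_sk_ prefix, and since tools.py resolves _LND_API_KEY at module import time, the variable must be set before vision_agent.tools is first imported:

    import os

    os.environ["LANDINGAI_API_KEY"] = "land_sk_example123"  # hypothetical v2 key

    from vision_agent.type_defs import LandingaiAPIKey

    assert LandingaiAPIKey().api_key == "land_sk_example123"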

vision_agent-0.1.1.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.52
+Version: 0.1.1
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -16,6 +16,7 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: sentence-transformers (>=2.0.0,<3.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)

vision_agent-0.1.1.dist-info/RECORD

@@ -5,7 +5,7 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
 vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=TKseWK3C7kr9GmjQmYgNSBZJHPqd7wTP6BSkwYqJkdY,19765
+vision_agent/agent/vision_agent.py,sha256=nHmfr-OuMfdH0N8gECXLzTAgRmTx9cYe5_pnQj-HnBE,19764
 vision_agent/agent/vision_agent_prompts.py,sha256=dPg0mLVK_fGJpYK2xXGhm-zuXX1KVZW_zFXyYsspUz8,6567
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
@@ -18,11 +18,12 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
 vision_agent/llm/llm.py,sha256=tgL6ZtuwZKuxSNiCxJCuP2ETjNMrosdgxXkZJb0_00E,5024
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=LxwxCArp7DfnPbjf_Gl55xBxPwo2Qx8eDp1gCnGYSO0,9535
-vision_agent/tools/__init__.py,sha256=AKN-T659HpwVearRnkCd6wWNoJ6K5kW9gAZwb8IQSLE,235
+vision_agent/tools/__init__.py,sha256=OEqEysxm5wnnOD73NKNCUggALB72GEmVg9FNsEkSBtA,253
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=h11niI1JiOCOaOFkdHee_AnXegaIK5Al8QMoFdZaJuo,24583
+vision_agent/tools/tools.py,sha256=Qsqe8X6VjB0EMWhyKJ5EMPyLIc_d5Vtlw4ugV2FB_Ks,25589
 vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
-vision_agent-0.0.52.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.0.52.dist-info/METADATA,sha256=5OBmHCpSDZbvGb_pNU_cOKWI9AdUOhEufDHigk_cm3c,6184
-vision_agent-0.0.52.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.0.52.dist-info/RECORD,,
+vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
+vision_agent-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.1.1.dist-info/METADATA,sha256=rWMocnnZwuRhd3xIGyQUzDbsndVASBSu2jvAqt-3Odc,6233
+vision_agent-0.1.1.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.1.1.dist-info/RECORD,,