vision-agent 0.0.53__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +110 -51
- vision_agent/agent/vision_agent_prompts.py +14 -1
- vision_agent/llm/llm.py +1 -1
- vision_agent/lmm/lmm.py +8 -4
- vision_agent/tools/tools.py +27 -63
- vision_agent/type_defs.py +48 -0
- {vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/METADATA +2 -1
- {vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/RECORD +10 -9
- {vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/WHEEL +0 -0

vision_agent/agent/vision_agent.py
CHANGED
@@ -37,10 +37,10 @@ _LOGGER = logging.getLogger(__name__)
 
 def parse_json(s: str) -> Any:
     s = (
-        s.replace(":
-        .replace(":
-        .replace(":
-        .replace(":
+        s.replace(": True", ": true")
+        .replace(": False", ": false")
+        .replace(":True", ": true")
+        .replace(":False", ": false")
         .replace("```", "")
         .strip()
     )
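The boolean replacements above exist so that reflection output containing Python-style True/False still loads as JSON. A minimal, self-contained sketch of the same idea (assuming the function finishes with json.loads, which is outside this hunk):

import json
from typing import Any


def parse_json(s: str) -> Any:
    # Normalize Python-style booleans and stray code fences before parsing.
    s = (
        s.replace(": True", ": true")
        .replace(": False", ": false")
        .replace(":True", ": true")
        .replace(":False", ": false")
        .replace("```", "")
        .strip()
    )
    return json.loads(s)


print(parse_json('{"Finish": True, "Reflection": "The answer looks correct."}'))
# -> {'Finish': True, 'Reflection': 'The answer looks correct.'}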
@@ -62,6 +62,19 @@ def format_tools(tools: Dict[int, Any]) -> str:
     return tool_str
 
 
+def format_tool_usage(tools: Dict[int, Any], tool_result: List[Dict]) -> str:
+    usage = []
+    name_to_usage = {v["name"]: v["usage"] for v in tools.values()}
+    for tool_res in tool_result:
+        if "tool_name" in tool_res:
+            usage.append((tool_res["tool_name"], name_to_usage[tool_res["tool_name"]]))
+
+    usage_str = ""
+    for tool_name, tool_usage in usage:
+        usage_str += f"{tool_name} - {tool_usage}\n"
+    return usage_str
+
+
 def topological_sort(tasks: List[Dict]) -> List[Dict]:
     in_degree = {task["id"]: 0 for task in tasks}
     for task in tasks:
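The new format_tool_usage helper simply joins each used tool's name with its usage entry from the tools registry. A hypothetical call (the tool names and usage values below are illustrative, not taken from the package):

from vision_agent.agent.vision_agent import format_tool_usage

tools = {
    0: {"name": "grounding_dino_", "usage": {"required_parameters": ["prompt", "image"]}},
    1: {"name": "clip_", "usage": {"required_parameters": ["prompt", "image"]}},
}
tool_result = [{"tool_name": "grounding_dino_", "parameters": {"prompt": "dog", "image": "dog.jpg"}}]

print(format_tool_usage(tools, tool_result))
# grounding_dino_ - {'required_parameters': ['prompt', 'image']}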
@@ -255,7 +268,8 @@ def self_reflect(
 ) -> str:
     prompt = VISION_AGENT_REFLECTION.format(
         question=question,
-        tools=format_tools(tools),
+        tools=format_tools({k: v["description"] for k, v in tools.items()}),
+        tool_usage=format_tool_usage(tools, tool_result),
         tool_results=str(tool_result),
         final_answer=final_answer,
     )
@@ -268,41 +282,28 @@ def self_reflect(
     return reflect_model(prompt)
 
 
-def parse_reflect(reflect: str) ->
-
-
+def parse_reflect(reflect: str) -> Any:
+    reflect = reflect.strip()
+    try:
+        return parse_json(reflect)
+    except Exception:
+        _LOGGER.error(f"Failed parse json reflection: {reflect}")
+    # LMMs have a hard time following directions, so make the criteria less strict
+    finish = (
         "finish" in reflect.lower() and len(reflect) < 100
     ) or "finish" in reflect.lower()[-10:]
-
-
-
-
-
-
-
-
-
-
-
-
-            if "image" not in parameters:
-                continue
-            parameters = [parameters]
-        elif isinstance(tool_result["parameters"], list):
-            if len(tool_result["parameters"]) < 1 or (
-                "image" not in tool_result["parameters"][0]
-            ):
-                continue
-
-        for param, call_result in zip(parameters, tool_result["call_results"]):
-            # calls can fail, so we need to check if the call was successful
-            if not isinstance(call_result, dict):
-                continue
-            if "bboxes" not in call_result:
-                continue
-
-            # if the call was successful, then we can add the image data
-            image = param["image"]
+    return {"Finish": finish, "Reflection": reflect}
+
+
+def _handle_extract_frames(
+    image_to_data: Dict[str, Dict], tool_result: Dict
+) -> Dict[str, Dict]:
+    image_to_data = image_to_data.copy()
+    # handle extract_frames_ case, useful if it extracts frames but doesn't do
+    # any following processing
+    for video_file_output in tool_result["call_results"]:
+        for frame, _ in video_file_output:
+            image = frame
             if image not in image_to_data:
                 image_to_data[image] = {
                     "bboxes": [],
@@ -310,17 +311,72 @@ def visualize_result(all_tool_results: List[Dict]) -> List[str]:
                     "labels": [],
                     "scores": [],
                 }
+    return image_to_data
+
+
+def _handle_viz_tools(
+    image_to_data: Dict[str, Dict], tool_result: Dict
+) -> Dict[str, Dict]:
+    image_to_data = image_to_data.copy()
+
+    # handle grounding_sam_ and grounding_dino_
+    parameters = tool_result["parameters"]
+    # parameters can either be a dictionary or list, parameters can also be malformed
+    # becaus the LLM builds them
+    if isinstance(parameters, dict):
+        if "image" not in parameters:
+            return image_to_data
+        parameters = [parameters]
+    elif isinstance(tool_result["parameters"], list):
+        if len(tool_result["parameters"]) < 1 or (
+            "image" not in tool_result["parameters"][0]
+        ):
+            return image_to_data
+
+    for param, call_result in zip(parameters, tool_result["call_results"]):
+        # calls can fail, so we need to check if the call was successful
+        if not isinstance(call_result, dict) or "bboxes" not in call_result:
+            return image_to_data
+
+        # if the call was successful, then we can add the image data
+        image = param["image"]
+        if image not in image_to_data:
+            image_to_data[image] = {
+                "bboxes": [],
+                "masks": [],
+                "labels": [],
+                "scores": [],
+            }
+
+        image_to_data[image]["bboxes"].extend(call_result["bboxes"])
+        image_to_data[image]["labels"].extend(call_result["labels"])
+        image_to_data[image]["scores"].extend(call_result["scores"])
+        if "masks" in call_result:
+            image_to_data[image]["masks"].extend(call_result["masks"])
+
+    return image_to_data
+
 
-
-
-
-
-
+def visualize_result(all_tool_results: List[Dict]) -> List[str]:
+    image_to_data: Dict[str, Dict] = {}
+    for tool_result in all_tool_results:
+        # only handle bbox/mask tools or frame extraction
+        if tool_result["tool_name"] not in [
+            "grounding_sam_",
+            "grounding_dino_",
+            "extract_frames_",
+        ]:
+            continue
+
+        if tool_result["tool_name"] == "extract_frames_":
+            image_to_data = _handle_extract_frames(image_to_data, tool_result)
+        else:
+            image_to_data = _handle_viz_tools(image_to_data, tool_result)
 
     visualized_images = []
-    for
-        image_path = Path(
-        image_data = image_to_data[
+    for image_str in image_to_data:
+        image_path = Path(image_str)
+        image_data = image_to_data[image_str]
         image = overlay_masks(image_path, image_data)
         image = overlay_bboxes(image, image_data)
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
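For orientation, this is roughly the record shape the refactored visualize_result and _handle_viz_tools expect for a detection tool; the values are made up for illustration:

all_tool_results = [
    {
        "tool_name": "grounding_dino_",
        "parameters": {"prompt": "dog", "image": "dog.jpg"},
        "call_results": [
            {
                "bboxes": [[0.1, 0.2, 0.5, 0.8]],
                "labels": ["dog"],
                "scores": [0.98],
            }
        ],
    }
]
# visualize_result(all_tool_results) would overlay these boxes on "dog.jpg"
# and return the paths of the temporary .png files it writes.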
@@ -374,7 +430,9 @@ class VisionAgent(Agent):
             OpenAILLM(temperature=0.1) if answer_model is None else answer_model
         )
         self.reflect_model = (
-            OpenAILMM(temperature=0.1)
+            OpenAILMM(json_mode=True, temperature=0.1)
+            if reflect_model is None
+            else reflect_model
         )
         self.max_retries = max_retries
         self.tools = TOOLS
@@ -470,13 +528,14 @@ class VisionAgent(Agent):
                 visualized_output[0] if len(visualized_output) > 0 else image,
             )
             self.log_progress(f"Reflection: {reflection}")
-
+            parsed_reflection = parse_reflect(reflection)
+            if parsed_reflection["Finish"]:
                 break
             else:
-                reflections += "\n" +
-        # '<
+                reflections += "\n" + parsed_reflection["Reflection"]
+        # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
-            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}
+            f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
         )
 
         if visualize_output:

vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -1,4 +1,14 @@
-VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used.
+VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used.
+
+Please note that:
+1. You must ONLY output parsible JSON format. If the agents output was correct set "Finish" to true, else set "Finish" to false. An example output looks like:
+{{"Finish": true, "Reflection": "The agent's answer was correct."}}
+2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or, using your own judgement, utilized incorrectly.
+3. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, concrete plan that aims to mitigate the same failure with the tools available. An example output looks like:
+{{"Finish": false, "Reflection": "I can see from teh visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters:
+Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives.
+Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}}
+4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true.
 
 User's question: {question}
 
@@ -8,6 +18,9 @@ Tools available:
 Tasks and tools used:
 {tool_results}
 
+Tool's used API documentation:
+{tool_usage}
+
 Final answer:
 {final_answer}
 
vision_agent/llm/llm.py
CHANGED
vision_agent/lmm/lmm.py
CHANGED
@@ -99,9 +99,10 @@ class OpenAILMM(LMM):
 
     def __init__(
         self,
-        model_name: str = "gpt-4-
+        model_name: str = "gpt-4-turbo",
         api_key: Optional[str] = None,
         max_tokens: int = 1024,
+        json_mode: bool = False,
         **kwargs: Any,
     ):
         if not api_key:
@@ -111,7 +112,10 @@ class OpenAILMM(LMM):
 
         self.client = OpenAI(api_key=api_key)
         self.model_name = model_name
-
+        if "max_tokens" not in kwargs:
+            kwargs["max_tokens"] = max_tokens
+        if json_mode:
+            kwargs["response_format"] = {"type": "json_object"}
         self.kwargs = kwargs
 
     def __call__(
@@ -153,7 +157,7 @@ class OpenAILMM(LMM):
         )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=fixed_chat,
+            model=self.model_name, messages=fixed_chat, **self.kwargs  # type: ignore
         )
 
         return cast(str, response.choices[0].message.content)
@@ -181,7 +185,7 @@ class OpenAILMM(LMM):
         )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=message,
+            model=self.model_name, messages=message, **self.kwargs  # type: ignore
         )
         return cast(str, response.choices[0].message.content)
 
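With the changes above, the reflection model can request a JSON object from OpenAI directly. A small usage sketch, assuming OpenAILMM is re-exported from vision_agent.lmm:

from vision_agent.lmm import OpenAILMM

# json_mode=True is forwarded as response_format={"type": "json_object"}, and
# max_tokens is folded into the kwargs passed to chat.completions.create.
reflect_model = OpenAILMM(json_mode=True, temperature=0.1)
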
vision_agent/tools/tools.py
CHANGED
@@ -12,8 +12,11 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import convert_to_b64, get_image_size
 from vision_agent.tools.video import extract_frames_from_video
+from vision_agent.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
+_LND_API_KEY = LandingaiAPIKey().api_key
+_LND_API_URL = "https://api.dev.landing.ai/v1/agent"
 
 
 def normalize_bbox(
@@ -80,8 +83,6 @@ class CLIP(Tool):
     [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "clip_"
     description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores."
     usage = {
@@ -125,23 +126,9 @@ class CLIP(Tool):
             "image": image_b64,
             "tool": "closed_set_image_classification",
         }
-
-
-
-            json=data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-
-        resp_json["data"]["scores"] = [
-            round(prob, 4) for prob in resp_json["data"]["scores"]
-        ]
-
-        return resp_json["data"]  # type: ignore
+        resp_data = _send_inference_request(data, "tools")
+        resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+        return resp_data
 
 
 class ImageCaption(Tool):
@@ -156,8 +143,6 @@ class ImageCaption(Tool):
     {'text': ['a box of orange and white socks']}
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "image_caption_"
     description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
     usage = {
@@ -197,19 +182,7 @@ class ImageCaption(Tool):
             "image": image_b64,
             "tool": "image_captioning",
         }
-
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-
-        return resp_json["data"]  # type: ignore
+        return _send_inference_request(data, "tools")
 
 
 class GroundingDINO(Tool):
@@ -226,8 +199,6 @@ class GroundingDINO(Tool):
     'scores': [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and associated probability scores."
     usage = {
@@ -290,24 +261,13 @@ class GroundingDINO(Tool):
             "tool": "visual_grounding",
             "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
-
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=request_data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-        data: Dict[str, Any] = resp_json["data"]
+        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
         if "bboxes" in data:
             data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]]
         if "scores" in data:
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
-            data["labels"] =
+            data["labels"] = list(data["labels"])
         data["size"] = (image_size[1], image_size[0])
         return data
 
@@ -335,8 +295,6 @@ class GroundingSAM(Tool):
     [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """
 
-    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
-
     name = "grounding_sam_"
     description = "'grounding_sam_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
@@ -399,18 +357,7 @@ class GroundingSAM(Tool):
             "tool": "visual_grounding_segment",
             "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
         }
-
-            self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
-            json=request_data,
-        )
-        resp_json: Dict[str, Any] = res.json()
-        if (
-            "statusCode" in resp_json and resp_json["statusCode"] != 200
-        ) or "statusCode" not in resp_json:
-            _LOGGER.error(f"Request failed: {resp_json}")
-            raise ValueError(f"Request failed: {resp_json}")
-        data: Dict[str, Any] = resp_json["data"]
+        data: Dict[str, Any] = _send_inference_request(request_data, "tools")
         ret_pred: Dict[str, List] = {"labels": [], "bboxes": [], "masks": []}
         if "bboxes" in data:
             ret_pred["bboxes"] = [
@@ -714,3 +661,20 @@ TOOLS = {
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
 }
+
+
+def _send_inference_request(
+    payload: Dict[str, Any], endpoint_name: str
+) -> Dict[str, Any]:
+    res = requests.post(
+        f"{_LND_API_URL}/model/{endpoint_name}",
+        headers={
+            "Content-Type": "application/json",
+            "apikey": _LND_API_KEY,
+        },
+        json=payload,
+    )
+    if res.status_code != 200:
+        _LOGGER.error(f"Request failed: {res.text}")
+        raise ValueError(f"Request failed: {res.text}")
+    return res.json()["data"]  # type: ignore
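The net effect of the tools.py changes is that every tool posts through the shared helper above, authenticated with the LandingAI key, instead of a hard-coded Lambda URL. A rough sketch of the request flow; _send_inference_request is module-private and the payload below is illustrative, not a supported API:

from vision_agent.image_utils import convert_to_b64
from vision_agent.tools.tools import _send_inference_request

data = {
    "prompt": "dog",
    "image": convert_to_b64("dog.jpg"),  # assumes dog.jpg exists locally
    "tool": "visual_grounding",
}
detections = _send_inference_request(data, "tools")  # raises ValueError on any non-200 response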

vision_agent/type_defs.py
ADDED
@@ -0,0 +1,48 @@
+from pydantic import Field, field_validator
+from pydantic_settings import BaseSettings
+
+
+class LandingaiAPIKey(BaseSettings):
+    """The API key of a user in a particular organization in LandingLens.
+    It supports loading from environment variables or .env files.
+    The supported name of the environment variables are (case-insensitive):
+    - LANDINGAI_API_KEY
+
+    Environment variables will always take priority over values loaded from a dotenv file.
+    """
+
+    api_key: str = Field(
+        default="land_sk_hw34v3tyEc35OAhP8F7hnGnrDv2C8hD2ycMyq0aMkVS1H40D22",
+        alias="LANDINGAI_API_KEY",
+        description="The API key of LandingAI.",
+    )
+
+    @field_validator("api_key")
+    @classmethod
+    def is_api_key_valid(cls, key: str) -> str:
+        """Check if the API key is a v2 key."""
+        if not key:
+            raise InvalidApiKeyError(f"LandingAI API key is required, but it's {key}")
+        if not key.startswith("land_sk_"):
+            raise InvalidApiKeyError(
+                f"LandingAI API key (v2) must start with 'land_sk_' prefix, but it's {key}. See https://support.landing.ai/docs/api-key for more information."
+            )
+        return key
+
+    class Config:
+        env_file = ".env"
+        env_prefix = "landingai_"
+        case_sensitive = False
+        extra = "ignore"
+
+
+class InvalidApiKeyError(Exception):
+    """Exception raised when the an invalid API key is provided. This error could be raised from any SDK code, not limited to a HTTP client."""
+
+    def __init__(self, message: str):
+        self.message = f"""{message}
+For more information, see https://landing-ai.github.io/landingai-python/landingai.html#manage-api-credentials"""
+        super().__init__(self.message)
+
+    def __str__(self) -> str:
+        return self.message
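The new settings class lets the LandingAI key come from the environment or a .env file rather than being passed around in code. A quick sketch of how it resolves; the key value below is a placeholder, not a real key:

import os

os.environ["LANDINGAI_API_KEY"] = "land_sk_placeholder_value"

from vision_agent.type_defs import LandingaiAPIKey

key = LandingaiAPIKey()  # reads LANDINGAI_API_KEY, falling back to the bundled default
print(key.api_key.startswith("land_sk_"))  # the validator enforces the v2 "land_sk_" prefix -> True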

{vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.53
+Version: 0.1.2
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -16,6 +16,7 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: sentence-transformers (>=2.0.0,<3.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)

{vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/RECORD
CHANGED
@@ -5,8 +5,8 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=dYzWa_RaiaFSQ-CowoQOcFmjZtBTTljRyA809bLgrvU,4519
 vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=_xh3v7DaeH3r5JLeXtCvDbQgogGRvpmqH3dAW7ChA1E,21759
+vision_agent/agent/vision_agent_prompts.py,sha256=JC43AB0ZnL8jQW9LYe-5mTeEJmH0w-SuH9YmGQxf1eM,7311
 vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
 vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
 vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
@@ -15,14 +15,15 @@ vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=hFdPoRmeVU5jErFr5xaagMQ6Wy7Xbw8H8HXuLGdJIAM,4786
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=Jty_RHdqVmIM0Mm31JNk50c882Tx7hHtkmh0WyXeJd8,5016
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=
+vision_agent/lmm/lmm.py,sha256=qDdy_9Q9wRjJ9ZUfqB8zfjhVIgITgjF7K4hYaTAoPCI,9637
 vision_agent/tools/__init__.py,sha256=OEqEysxm5wnnOD73NKNCUggALB72GEmVg9FNsEkSBtA,253
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=Qsqe8X6VjB0EMWhyKJ5EMPyLIc_d5Vtlw4ugV2FB_Ks,25589
 vision_agent/tools/video.py,sha256=40rscP8YvKN3lhZ4PDcOK4XbdFX2duCRpHY_krmBYKU,7476
-vision_agent
-vision_agent-0.
-vision_agent-0.
-vision_agent-0.
+vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
+vision_agent-0.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.1.2.dist-info/METADATA,sha256=6AP0Z9G4l15uCcfBGhUfHV1AnP4lwXQuey7uH-QuvlU,6233
+vision_agent-0.1.2.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.1.2.dist-info/RECORD,,

{vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/LICENSE
File without changes

{vision_agent-0.0.53.dist-info → vision_agent-0.1.2.dist-info}/WHEEL
File without changes