vision-agent 0.2.23__py3-none-any.whl → 0.2.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/agent_coder.py +19 -5
- vision_agent/agent/vision_agent_v2.py +9 -2
- vision_agent/agent/vision_agent_v2_prompt.py +4 -3
- vision_agent/llm/llm.py +7 -4
- vision_agent/tools/tool_utils.py +1 -1
- vision_agent/tools/tools.py +1 -1
- vision_agent/tools/tools_v2.py +199 -10
- vision_agent/utils/image_utils.py +7 -2
- vision_agent/utils/sim.py +6 -1
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/METADATA +4 -2
- {vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/RECORD +14 -14
- {vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/WHEEL +0 -0
vision_agent/agent/agent_coder.py CHANGED
@@ -5,6 +5,9 @@ import sys
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 
+from rich.console import Console
+from rich.syntax import Syntax
+
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_coder_prompts import (
     DEBUG,
@@ -40,6 +43,7 @@ from vision_agent.tools.tools_v2 import *
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
 _EXECUTE = Execute()
+_CONSOLE = Console()
 
 
 def write_tests(question: str, code: str, model: LLM) -> str:
@@ -103,7 +107,7 @@ def run_visual_tests(
 
 
 def fix_bugs(code: str, tests: str, result: str, feedback: str, model: LLM) -> str:
-    prompt = FIX_BUG.format(
+    prompt = FIX_BUG.format(code=code, tests=tests, result=result, feedback=feedback)
     completion = model(prompt)
     return preprocess_data(completion)
 
@@ -139,7 +143,8 @@ class AgentCoder(Agent):
             else visual_tester_agent
         )
         self.max_turns = 3
-
+        self.verbose = verbose
+        if self.verbose:
             _LOGGER.setLevel(logging.INFO)
 
     def __call__(
@@ -164,9 +169,15 @@ class AgentCoder(Agent):
         feedback = ""
         for _ in range(self.max_turns):
             code = write_program(question, feedback, self.coder_agent)
-
+            if self.verbose:
+                _CONSOLE.print(
+                    Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+                )
             debug = write_debug(question, code, feedback, self.tester_agent)
-
+            if self.verbose:
+                _CONSOLE.print(
+                    Syntax(debug, "python", theme="gruvbox-dark", line_numbers=True)
+                )
             results = execute_tests(code, debug)
             _LOGGER.info(
                 f"execution results: passed: {results['passed']}\n{results['result']}"
@@ -176,7 +187,10 @@ class AgentCoder(Agent):
                 code = fix_bugs(
                     code, debug, results["result"].strip(), feedback, self.coder_agent  # type: ignore
                 )
-
+                if self.verbose:
+                    _CONSOLE.print(
+                        Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+                    )
             else:
                 # TODO: Sometimes it prints nothing, so we need to handle that case
                 # TODO: The visual agent reflection does not work very well, needs more testing
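The new verbose mode pretty-prints each generated program with rich's syntax highlighting. A minimal standalone sketch of the same call pattern, where the code string is a made-up stand-in for the agent's generated program:

from rich.console import Console
from rich.syntax import Syntax

console = Console()
generated_code = "def add(a: int, b: int) -> int:\n    return a + b"  # hypothetical output
# The same call the agent now makes when verbose=True
console.print(Syntax(generated_code, "python", theme="gruvbox-dark", line_numbers=True))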
vision_agent/agent/vision_agent_v2.py CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import pandas as pd
+from langsmith import traceable
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -66,6 +67,7 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     return json_dict  # type: ignore
 
 
+@traceable(name="planning")
 def write_plan(
     chat: List[Dict[str, str]],
     plan: Optional[List[Dict[str, Any]]],
@@ -214,6 +216,7 @@ def write_and_exec_code(
     return success, code, result, working_memory
 
 
+@traceable(name="plan execution")
 def run_plan(
     user_req: str,
     plan: List[Dict[str, Any]],
@@ -235,7 +238,7 @@ def run_plan(
             f"""
{tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
-        tools = tool_recommender.top_k(task["instruction"])
+        tools = tool_recommender.top_k(task["instruction"], thresh=0.3)
         tool_info = "\n".join([e["doc"] for e in tools])
 
         if verbosity == 2:
@@ -285,6 +288,7 @@ class VisionAgentV2(Agent):
     solve vision tasks. It is inspired by MetaGPT's Data Interpreter
     https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
     generate code:
+
     - A planner to generate a plan of tasks to solve a user requirement. The planner
     can output code tasks or test tasks, where test tasks are used to verify the code.
     - Automatic debugging, if a task fails, the agent will attempt to debug the code
@@ -333,6 +337,7 @@ class VisionAgentV2(Agent):
         results = self.chat_with_workflow(input, image, plan)
         return results["code"]  # type: ignore
 
+    @traceable
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
@@ -377,7 +382,9 @@ class VisionAgentV2(Agent):
                 self.long_term_memory,
                 self.verbosity,
             )
-            success = all(
+            success = all(
+                task["success"] if "success" in task else False for task in plan
+            )
             working_memory.update(working_memory_i)
 
         if not success:
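The planner and plan executor are now instrumented with LangSmith's `@traceable` decorator, so each run is recorded as a named span. A minimal sketch of the pattern, assuming LangSmith credentials are configured in the environment; the function body here is a hypothetical stand-in, not the real implementation:

from langsmith import traceable

@traceable(name="planning")
def write_plan(user_request: str) -> list:
    # hypothetical body; the real write_plan prompts an LLM for a task plan
    return [{"instruction": user_request, "type": "code"}]

plan = write_plan("count the cans in the image")  # traced when LangSmith is configured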
vision_agent/agent/vision_agent_v2_prompt.py CHANGED
@@ -34,7 +34,7 @@ PLAN = """
 
 # Task:
 Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements:
-- For each subtask, you should provide
+- For each subtask, you should provide instructions on what to do. Write detailed subtasks, ensure they are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
 - If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
@@ -73,9 +73,10 @@ CODE = """
 {code}
 
 # Constraints
-- Write a function that accomplishes the '
-- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info
+- Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
 - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
+- Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file.
 - Write clean, readable, and well-documented code.
 
 # Output
vision_agent/llm/llm.py CHANGED
@@ -3,6 +3,7 @@ import os
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast
 
+from langsmith.wrappers import wrap_openai
 from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
@@ -41,9 +42,9 @@ class OpenAILLM(LLM):
         **kwargs: Any
     ):
         if not api_key:
-            self.client = OpenAI()
+            self.client = wrap_openai(OpenAI())
         else:
-            self.client = OpenAI(api_key=api_key)
+            self.client = wrap_openai(OpenAI(api_key=api_key))
 
         self.model_name = model_name
         self.system_prompt = system_prompt
@@ -165,8 +166,10 @@ class AzureOpenAILLM(OpenAILLM):
         if not azure_endpoint:
             raise ValueError("Azure OpenAI endpoint is required.")
 
-        self.client =
-
+        self.client = wrap_openai(
+            AzureOpenAI(
+                api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+            )
         )
         self.model_name = model_name
         self.kwargs = kwargs
vision_agent/tools/tool_utils.py CHANGED
@@ -8,7 +8,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.
+_LND_API_URL = "https://api.staging.landing.ai/v1/agent"
 
 
 def _send_inference_request(
vision_agent/tools/tools.py CHANGED
vision_agent/tools/tools_v2.py CHANGED
@@ -1,5 +1,6 @@
 import inspect
 import io
+import json
 import logging
 import tempfile
 from importlib import resources
@@ -14,7 +15,14 @@ from scipy.spatial import distance  # type: ignore
 
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
-from vision_agent.utils.image_utils import
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    normalize_bbox,
+    rle_decode,
+    b64_to_pil,
+    get_image_size,
+    denormalize_bbox,
+)
 
 COLORS = [
     (158, 218, 229),
@@ -48,7 +56,7 @@ def grounding_dino(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_dino' is a tool that can detect and count objects given a text prompt
     such as category names or referring expressions. It returns a list and count of
@@ -60,12 +68,13 @@ def grounding_dino(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
             bounding box of the detected objects with normalized coordinates
-            (
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
 
     Example
     -------
@@ -76,7 +85,7 @@ def grounding_dino(
         ]
     """
     image_size = image.shape[:2]
-    image_b64 = convert_to_b64(
+    image_b64 = convert_to_b64(image)
     request_data = {
         "prompt": prompt,
         "image": image_b64,
@@ -100,7 +109,7 @@ def grounding_sam(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.20,
-    iou_threshold: float = 0.
+    iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
     """'grounding_sam' is a tool that can detect and segment objects given a text
     prompt such as category names or referring expressions. It returns a list of
@@ -112,12 +121,15 @@ def grounding_sam(
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.
+            (IoU). Defaults to 0.20.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
             bounding box, and mask of the detected objects with normalized coordinates
-            (
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+            xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
 
     Example
     -------
@@ -136,7 +148,7 @@ def grounding_sam(
         ]
     """
     image_size = image.shape[:2]
-    image_b64 = convert_to_b64(
+    image_b64 = convert_to_b64(image)
    request_data = {
         "prompt": prompt,
         "image": image_b64,
@@ -234,6 +246,152 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     return output
 
 
+def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+    """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> zero_shot_counting(image)
+    {'count': 45},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "zero_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def visual_prompt_counting(
+    image: np.ndarray, visual_prompt: Dict[str, List[float]]
+) -> Dict[str, Any]:
+    """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object.
+    It returns only the count of the objects in the image.
+
+    Parameters:
+        image (np.ndarray): The image that contains lot of instances of a single object
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+    Example
+    -------
+    >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+    {'count': 45},
+
+    """
+
+    image_size = get_image_size(image)
+    bbox = visual_prompt["bbox"]
+    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
+    image_b64 = convert_to_b64(image)
+
+    data = {
+        "image": image_b64,
+        "prompt": bbox_str,
+        "tool": "few_shot_counting",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+    return resp_data
+
+
+def image_question_answering(image: np.ndarray, prompt: str) -> str:
+    """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image.
+    It returns an answer to the question
+
+    Parameters:
+        image (np.ndarray): The reference image used for the question
+        prompt (str): The question about the image
+
+    Returns:
+        str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}.
+
+    Example
+    -------
+    >>> image_question_answering(image, 'What is the cat doing ?')
+    'drinking milk'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
+    """'clip' is a tool that can classify an image given a list of input classes or tags.
+    It returns the same list of the input classes along with their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+        classes (List[str]): The list of classes or tags that is associated with the image
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores.
+
+    Example
+    -------
+    >>> clip(image, ['dog', 'cat', 'bird'])
+    {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "prompt": ",".join(classes),
+        "image": image_b64,
+        "tool": "closed_set_image_classification",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+    return resp_data
+
+
+def image_caption(image: np.ndarray) -> str:
+    """'image_caption' is a tool that can caption an image based on its contents.
+    It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> image_caption(image)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "image_captioning",
+    }
+
+    answer = _send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
 
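All five new tools follow the same shape as the existing ones: a numpy image in, a JSON-friendly dict or string out, with the heavy lifting delegated through `_send_inference_request` to the hosted tools endpoint. A hedged sketch of how they compose; the image path is made up, and the calls require network access plus a LandingAI API key:

import numpy as np
from vision_agent.tools.tools_v2 import (
    clip,
    image_caption,
    load_image,
    zero_shot_counting,
)

image: np.ndarray = load_image("examples/shelf.jpg")   # hypothetical path
print(image_caption(image))                   # e.g. 'shelves stocked with cans'
print(clip(image, ["can", "bottle", "box"]))  # {'labels': [...], 'scores': [...]}
result = zero_shot_counting(image)            # {'count': ..., 'heat_map': ndarray}
print(result["count"])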
@@ -285,6 +443,31 @@ def closest_box_distance(box1: List[float], box2: List[float]) -> float:
 # Utility and visualization functions
 
 
+def save_json(data: Any, file_path: str) -> None:
+    """'save_json' is a utility function that saves data as a JSON file. It is helpful
+    for saving data that contains NumPy arrays which are not JSON serializable.
+
+    Parameters:
+        data (Any): The data to save.
+        file_path (str): The path to save the JSON file.
+
+    Example
+    -------
+    >>> save_json(data, "path/to/file.json")
+    """
+
+    class NumpyEncoder(json.JSONEncoder):
+        def default(self, obj: Any):  # type: ignore
+            if isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, np.bool_):
+                return bool(obj)
+            return json.JSONEncoder.default(self, obj)
+
+    with open(file_path, "w") as f:
+        json.dump(data, f, cls=NumpyEncoder)
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.
 
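`save_json` exists because tool outputs often contain NumPy arrays and NumPy booleans, which the stock `json` module refuses to serialize; the nested `NumpyEncoder` converts them on the way out. A small usage sketch, with made-up data and output path:

import numpy as np
from vision_agent.tools.tools_v2 import save_json

detections = {
    "bbox": np.array([0.1, 0.2, 0.4, 0.5]),  # ndarray is written as a list
    "above_threshold": np.bool_(True),       # np.bool_ is written as a plain bool
}
save_json(detections, "detections.json")  # hypothetical output path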
@@ -478,8 +661,14 @@ TOOLS = [
     grounding_sam,
     extract_frames,
     ocr,
+    clip,
+    zero_shot_counting,
+    visual_prompt_counting,
+    image_question_answering,
+    image_caption,
     closest_mask_distance,
     closest_box_distance,
+    save_json,
     load_image,
     save_image,
     overlay_bounding_boxes,
@@ -489,5 +678,5 @@ TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image, overlay_bounding_boxes]
+    [save_json, load_image, save_image, overlay_bounding_boxes]
 )
vision_agent/utils/image_utils.py CHANGED
@@ -104,15 +104,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     """
     if data is None:
         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
+
     if isinstance(data, (str, Path)):
         data = Image.open(data)
+    elif isinstance(data, np.ndarray):
+        data = Image.fromarray(data)
+
     if isinstance(data, Image.Image):
         buffer = BytesIO()
         data.convert("RGB").save(buffer, format="PNG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
-
-
+        raise ValueError(
+            f"Invalid input image: {data}. Input image must be a PIL Image or a numpy array."
+        )
 
 
 def denormalize_bbox(
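`convert_to_b64` now accepts a raw numpy array directly, routing it through `PIL.Image.fromarray` before the PNG/base64 encoding, and fails fast with a `ValueError` for unsupported types instead of silently falling through. A quick sketch of both paths, using a made-up placeholder image:

import numpy as np
from vision_agent.utils.image_utils import convert_to_b64

frame = np.zeros((64, 64, 3), dtype=np.uint8)  # hypothetical blank image
b64 = convert_to_b64(frame)  # works without converting to a PIL Image first

# convert_to_b64(42)  # would raise ValueError: not a path, PIL Image, or ndarray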
vision_agent/utils/sim.py CHANGED
@@ -56,12 +56,15 @@ class Sim:
         df = df.drop("embs", axis=1)
         df.to_csv(sim_file / "df.csv", index=False)
 
-    def top_k(
+    def top_k(
+        self, query: str, k: int = 5, thresh: Optional[float] = None
+    ) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
 
         Parameters:
             query: str: The query to compare to.
             k: int: The number of items to return.
+            thresh: Optional[float]: The minimum similarity threshold.
 
         Returns:
             Sequence[Dict]: The top k most similar items.
@@ -70,6 +73,8 @@ class Sim:
         embedding = get_embedding(self.client, query, model=self.model)
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
         res = self.df.sort_values("sim", ascending=False).head(k)
+        if thresh is not None:
+            res = res[res.sim > thresh]
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
 
 
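`top_k` can now drop weak matches: after ranking by cosine similarity it filters out rows at or below `thresh`, which is how `run_plan` above trims irrelevant tool suggestions with `thresh=0.3`. A sketch, assuming a `Sim` index was already built over tool documentation; the constructor argument and query are made up:

from vision_agent.utils.sim import Sim

sim = Sim(tools_df)  # hypothetical: a DataFrame with a precomputed 'embs' column
# at most 5 results, keeping only those with similarity strictly above 0.3
tools = sim.top_k("detect and count objects in an image", k=5, thresh=0.3)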
vision_agent/utils/type_defs.py CHANGED
@@ -12,7 +12,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="
+        default="land_sk_IJrojHarPXRjqDj1Fng76mX7yCbzVm1s5rZYxaNXu5v0cNLn0w",
         alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
{vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/METADATA CHANGED
@@ -1,14 +1,16 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.
+Version: 0.2.25
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
-Requires-Python: >=3.9
+Requires-Python: >=3.9,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
 Requires-Dist: nbclient (>=0.10.0,<0.11.0)
 Requires-Dist: nbformat (>=5.10.4,<6.0.0)
{vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/RECORD CHANGED
@@ -1,7 +1,7 @@
 vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
 vision_agent/agent/__init__.py,sha256=Zv8lc91mPy0iDySId38_vc4mo56JQ9mCMvUWdAKQjh0,206
 vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
-vision_agent/agent/agent_coder.py,sha256=
+vision_agent/agent/agent_coder.py,sha256=4iB732bX4wDnPAuyYBk6HWlf4aFq2l9EcL695qfDIXw,7004
 vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
 vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
@@ -9,26 +9,26 @@ vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6w
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
 vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
 vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/vision_agent_v2.py,sha256=
-vision_agent/agent/vision_agent_v2_prompt.py,sha256=
+vision_agent/agent/vision_agent_v2.py,sha256=3qjvaj-yyrXmoY_cecUsiuY4Rn6MmJanFZeoXFJRK2c,13229
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=b_0BMq6GrbGfl09MHrv4mj-mqyE1FxMl3Xq44qD4S1E,6161
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=_Klwngc35JdRuzezWe1P5BMBRkfRQSGJqNOtS44rM9s,5891
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
 vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_v2.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
+vision_agent/tools/tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
+vision_agent/tools/tools_v2.py,sha256=Tdam-cWBI4ipXWwGyxim-SK07zP97_hcdUtYd1a4CnI,21404
 vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
-vision_agent/utils/image_utils.py,sha256=
-vision_agent/utils/sim.py,sha256=
-vision_agent/utils/type_defs.py,sha256=
+vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
+vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
+vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.25.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.25.dist-info/METADATA,sha256=5bycdwOp0pnRpUBQo_JM1c1Abq2fmWJcVYE_7YgtoUY,9212
+vision_agent-0.2.25.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.25.dist-info/RECORD,,

{vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/LICENSE: file without changes
{vision_agent-0.2.23.dist-info → vision_agent-0.2.25.dist-info}/WHEEL: file without changes