vision-agent 0.2.24__py3-none-any.whl → 0.2.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/agent/__init__.py:
@@ -4,3 +4,4 @@ from .easytool import EasyTool
  from .reflexion import Reflexion
  from .vision_agent import VisionAgent
  from .vision_agent_v2 import VisionAgentV2
+ from .vision_agent_v3 import VisionAgentV3
vision_agent/agent/vision_agent_v2.py:
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
  from tabulate import tabulate

  from vision_agent.agent import Agent
- from vision_agent.agent.vision_agent_v2_prompt import (
+ from vision_agent.agent.vision_agent_v2_prompts import (
      CODE,
      CODE_SYS_MSG,
      DEBUG,
vision_agent/agent/vision_agent_v3.py (new file):
@@ -0,0 +1,305 @@
+ import copy
+ import json
+ import logging
+ import sys
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union, cast
+
+ from rich.console import Console
+ from rich.syntax import Syntax
+ from tabulate import tabulate
+
+ from vision_agent.agent import Agent
+ from vision_agent.agent.vision_agent_v3_prompts import (
+     CODE,
+     FEEDBACK,
+     FIX_BUG,
+     PLAN,
+     REFLECT,
+     SIMPLE_TEST,
+     USER_REQ,
+ )
+ from vision_agent.llm import LLM, OpenAILLM
+ from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING
+ from vision_agent.utils import Execute
+ from vision_agent.utils.sim import Sim
+
+ logging.basicConfig(stream=sys.stdout)
+ _LOGGER = logging.getLogger(__name__)
+ _MAX_TABULATE_COL_WIDTH = 80
+ _EXECUTE = Execute(600)
+ _CONSOLE = Console()
+
+
+ def format_memory(memory: List[Dict[str, str]]) -> str:
+     return FEEDBACK.format(
+         feedback="\n".join(
+             [
+                 f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
+                 for i, m in enumerate(memory)
+             ]
+         )
+     )
+
+
+ def extract_code(code: str) -> str:
+     if "\n```python" in code:
+         start = "\n```python"
+     elif "```python" in code:
+         start = "```python"
+     else:
+         return code
+
+     code = code[code.find(start) + len(start) :]
+     code = code[: code.find("```")]
+     if code.startswith("python\n"):
+         code = code[len("python\n") :]
+     return code
+
+
+ def extract_json(json_str: str) -> Dict[str, Any]:
+     try:
+         json_dict = json.loads(json_str)
+     except json.JSONDecodeError:
+         if "```json" in json_str:
+             json_str = json_str[json_str.find("```json") + len("```json") :]
+             json_str = json_str[: json_str.find("```")]
+         elif "```" in json_str:
+             json_str = json_str[json_str.find("```") + len("```") :]
+             # get the last ``` not one from an intermediate string
+             json_str = json_str[: json_str.find("}```")]
+         json_dict = json.loads(json_str)
+     return json_dict  # type: ignore
+
+
+ def write_plan(
+     chat: List[Dict[str, str]],
+     tool_desc: str,
+     working_memory: str,
+     model: LLM,
+ ) -> List[Dict[str, str]]:
+     chat = copy.deepcopy(chat)
+     if chat[-1]["role"] != "user":
+         raise ValueError("Last chat message must be from the user.")
+
+     user_request = chat[-1]["content"]
+     context = USER_REQ.format(user_request=user_request)
+     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
+     chat[-1]["content"] = prompt
+     return extract_json(model.chat(chat))["plan"]  # type: ignore
+
+
+ def reflect(
+     chat: List[Dict[str, str]],
+     plan: str,
+     code: str,
+     model: LLM,
+ ) -> Dict[str, Union[str, bool]]:
+     chat = copy.deepcopy(chat)
+     if chat[-1]["role"] != "user":
+         raise ValueError("Last chat message must be from the user.")
+
+     user_request = chat[-1]["content"]
+     context = USER_REQ.format(user_request=user_request)
+     prompt = REFLECT.format(context=context, plan=plan, code=code)
+     chat[-1]["content"] = prompt
+     return extract_json(model.chat(chat))
+
+
+ def write_and_test_code(
+     task: str,
+     tool_info: str,
+     tool_utils: str,
+     working_memory: str,
+     coder: LLM,
+     tester: LLM,
+     debugger: LLM,
+     verbosity: int = 0,
+     max_retries: int = 3,
+ ) -> Dict[str, Any]:
+     code = extract_code(
+         coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
+     )
+     test = extract_code(
+         tester(
+             SIMPLE_TEST.format(
+                 docstring=tool_utils, question=task, code=code, feedback=working_memory
+             )
+         )
+     )
+
+     success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
+     if verbosity == 2:
+         _LOGGER.info("First code and tests:")
+         _CONSOLE.print(
+             Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
+         )
+         _LOGGER.info(f"First result: {result}")
+
+     count = 0
+     new_working_memory = []
+     while not success and count < max_retries:
+         fixed_code_and_test = extract_json(
+             debugger(
+                 FIX_BUG.format(
+                     code=code, tests=test, result=result, feedback=working_memory
+                 )
+             )
+         )
+         if fixed_code_and_test["code"].strip() != "":
+             code = extract_code(fixed_code_and_test["code"])
+         if fixed_code_and_test["test"].strip() != "":
+             test = extract_code(fixed_code_and_test["test"])
+         new_working_memory.append(
+             {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
+         )
+
+         success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
+         if verbosity == 2:
+             _LOGGER.info(
+                 f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}"
+             )
+             _CONSOLE.print(
+                 Syntax(
+                     f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True
+                 )
+             )
+             _LOGGER.info(f"Debug result: {result}")
+         count += 1
+
+     if verbosity == 1:
+         _CONSOLE.print(
+             Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
+         )
+         _LOGGER.info(f"Result: {result}")
+
+     return {
+         "code": code,
+         "test": test,
+         "success": success,
+         "working_memory": new_working_memory,
+     }
+
+
+ def retrieve_tools(
+     plan: List[Dict[str, str]], tool_recommender: Sim, verbosity: int = 0
+ ) -> str:
+     tool_info = []
+     tool_desc = []
+     for task in plan:
+         tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+         tool_info.extend([e["doc"] for e in tools])
+         tool_desc.extend([e["desc"] for e in tools])
+     if verbosity == 2:
+         _LOGGER.info(f"Tools: {tool_desc}")
+     tool_info_set = set(tool_info)
+     return "\n\n".join(tool_info_set)
+
+
+ class VisionAgentV3(Agent):
+     def __init__(
+         self,
+         timeout: int = 600,
+         planner: Optional[LLM] = None,
+         coder: Optional[LLM] = None,
+         tester: Optional[LLM] = None,
+         debugger: Optional[LLM] = None,
+         tool_recommender: Optional[Sim] = None,
+         verbosity: int = 0,
+     ) -> None:
+         self.planner = (
+             OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
+         )
+         self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
+         self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
+         self.debugger = (
+             OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
+         )
+
+         self.tool_recommender = (
+             Sim(TOOLS_DF, sim_key="desc")
+             if tool_recommender is None
+             else tool_recommender
+         )
+         self.verbosity = verbosity
+         self.max_retries = 3
+
+     def __call__(
+         self,
+         input: Union[List[Dict[str, str]], str],
+         image: Optional[Union[str, Path]] = None,
+     ) -> str:
+         if isinstance(input, str):
+             input = [{"role": "user", "content": input}]
+         results = self.chat_with_workflow(input, image)
+         return results["code"]  # type: ignore
+
+     def chat_with_workflow(
+         self,
+         chat: List[Dict[str, str]],
+         image: Optional[Union[str, Path]] = None,
+     ) -> Dict[str, Any]:
+         if len(chat) == 0:
+             raise ValueError("Chat cannot be empty.")
+
+         if image is not None:
+             for chat_i in chat:
+                 if chat_i["role"] == "user":
+                     chat_i["content"] += f" Image name {image}"
+
+         code = ""
+         test = ""
+         working_memory: List[Dict[str, str]] = []
+         results = {"code": "", "test": "", "plan": []}
+         plan = []
+         success = False
+         retries = 0
+
+         while not success and retries < self.max_retries:
+             plan_i = write_plan(
+                 chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
+             )
+             plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
+             if self.verbosity == 1 or self.verbosity == 2:
+                 _LOGGER.info(
+                     f"""
+ {tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+                 )
+
+             tool_info = retrieve_tools(
+                 plan_i,
+                 self.tool_recommender,
+                 self.verbosity,
+             )
+             results = write_and_test_code(
+                 plan_i_str,
+                 tool_info,
+                 UTILITIES_DOCSTRING,
+                 format_memory(working_memory),
+                 self.coder,
+                 self.tester,
+                 self.debugger,
+                 verbosity=self.verbosity,
+             )
+             success = cast(bool, results["success"])
+             code = cast(str, results["code"])
+             test = cast(str, results["test"])
+             working_memory.extend(results["working_memory"])  # type: ignore
+             plan.append({"code": code, "test": test, "plan": plan_i})
+
+             reflection = reflect(chat, plan_i_str, code, self.planner)
+             if self.verbosity > 0:
+                 _LOGGER.info(f"Reflection: {reflection}")
+             feedback = cast(str, reflection["feedback"])
+             success = cast(bool, reflection["success"])
+             working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
+
+         return {
+             "code": code,
+             "test": test,
+             "plan": plan,
+             "working_memory": working_memory,
+         }
+
+     def log_progress(self, description: str) -> None:
+         pass
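
For orientation, here is a minimal, hypothetical usage sketch of the new VisionAgentV3 (not part of the diff). It assumes an OpenAI API key is configured for the default OpenAILLM models, and the image path is made up.

```python
# Hypothetical usage sketch for the new VisionAgentV3 (not part of the package diff).
# Assumes OPENAI_API_KEY is set so the default OpenAILLM planner/coder/tester/debugger
# can be constructed; "workers.png" is a made-up image path.
from vision_agent.agent import VisionAgentV3

agent = VisionAgentV3(verbosity=1)

# __call__ accepts a plain string or a chat-style message list and returns the
# final generated code as a string.
code = agent("Count the number of people wearing hard hats", image="workers.png")

# chat_with_workflow returns the full result: code, test, the per-iteration plan
# history, and the working memory accumulated by the plan/code/test/reflect loop.
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the number of people wearing hard hats"}],
    image="workers.png",
)
print(results["plan"])
```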

vision_agent/agent/vision_agent_v3_prompts.py (new file):
@@ -0,0 +1,221 @@
+ USER_REQ = """
+ ## User Request
+ {user_request}
+ """
+
+ FEEDBACK = """
+ ## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again.
+
+ {feedback}
+ """
+
+
+ PLAN = """
+ **Context**
+ {context}
+
+ **Tools Available**:
+ {tool_desc}
+
+ **Previous Feedback**:
+ {feedback}
+
+ **Instructions**:
+ Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
+
+ ```json
+ {{
+ "plan":
+ [
+ {{
+ "instructions": str # what you should do in this task, one short phrase or sentence
+ }}
+ ]
+ }}
+ ```
+ """
+
+ CODE = """
+ **Role**: You are a software programmer.
+
+ **Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted.
+
+ **Documentation**:
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
+
+ {docstring}
+
+ **Input Code Snippet**:
+ ```python
+ # Your code here
+ ```
+
+ **User Instructions**:
+ {question}
+
+ **Previous Feedback**:
+ {feedback}
+
+ **Instructions**:
+ 1. **Understand and Clarify**: Make sure you understand the task.
+ 2. **Algorithm/Method Selection**: Decide on the most efficient way.
+ 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
+ 4. **Code Generation**: Translate your pseudocode into executable Python code.
+ """
+
+ TEST = """
+ **Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
+
+ **Documentation**:
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions. Test only the code provided by the user.
+
+ {docstring}
+
+ **User Instructions**:
+ {question}
+
+ **Input Code Snippet**:
+ ```python
+ ### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
+ {code}
+ ```
+
+ **Instructions**:
+ 1. Verify the fundamental functionality under normal conditions.
+ 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+ 3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
+ 4. DO NOT mock any functions, you must test their functionality as is.
+
+ You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
+ ```python
+ # You can run assertions to ensure the function is working as expected
+ assert function(input) == expected_output, "Test case description"
+
+ # You can simply call the function to ensure it runs
+ function(input)
+
+ # Or you can visualize the output
+ output = function(input)
+ visualize(output)
+ ```
+
+ **Examples**:
+ ## Prompt 1:
+ ```python
+ def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
+ \""" Detects cats and dogs in an image. Returns a dictionary with
+ {{
+ "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
+ }}
+ \"""
+ ```
+
+ ## Completion 1:
+ ```python
+ # We can test to ensure the output has the correct structure but we cannot test the
+ # content of the output without knowing the image. We can test on "image.jpg" because
+ # it is provided by the user so we know it exists.
+ output = detect_cats_and_dogs("image.jpg")
+ assert "cats" in output, "The output should contain 'cats'
+ assert "dogs" in output, "The output should contain 'dogs'
+ ```
+
+ ## Prompt 2:
+ ```python
+ def find_text(image_path: str, text: str) -> str:
+ \""" Finds the text in the image and returns the text. \"""
+
+ ## Completion 2:
+ ```python
+ # Because we do not know ahead of time what text is in the image, we can only run the
+ # code and print the results. We can test on "image.jpg" because it is provided by the
+ # user so we know it exists.
+ found_text = find_text("image.jpg", "Hello World")
+ print(found_text)
+ ```
+ """
+
+
+ SIMPLE_TEST = """
+ **Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
+
+ **Documentation**:
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions, only the code provided by the user.
+
+ {docstring}
+
+ **User Instructions**:
+ {question}
+
+ **Input Code Snippet**:
+ ```python
+ ### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
+ {code}
+ ```
+
+ **Previous Feedback**:
+ {feedback}
+
+ **Instructions**:
+ 1. Verify the fundamental functionality under normal conditions.
+ 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+ 3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
+ 4. DO NOT mock any functions, you must test their functionality as is.
+ """
+
+
+ FIX_BUG = """
+ **Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so feel free to run !pip install to install missing packages.
+
+ **Instructions**:
+ Please re-complete the code to fix the error message. Here is the previous version:
+ ```python
+ {code}
+ ```
+
+ When we run this test code:
+ ```python
+ {tests}
+ ```
+
+ It raises this error:
+ ```python
+ {result}
+ ```
+
+ This is previous feedback provided on the code:
+ {feedback}
+
+ Please fix the bug by follow the error information and return a JSON object with the following format:
+ {{
+ "reflections": str # any thoughts you have about the bug and how you fixed it
+ "code": str # the fixed code if any, else an empty string
+ "test": str # the fixed test code if any, else an empty string
+ }}
+ """
+
+
+ REFLECT = """
+ **Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad.
+
+ **Context**:
+ {context}
+
+ **Plan**:
+ {plan}
+
+ **Code**:
+ {code}
+
+ **Instructions**:
+ 1. **Understand the User Request**: Read the user request and understand what the user is asking for.
+ 2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
+ 3. **Review the Code**: Check the code to see if it solves the user request.
+ 4. DO NOT add any reflections for test cases, these are taken care of.
+
+ Respond in JSON format with the following structure:
+ {{
+ "feedback": str # the feedback you would give to the coder and tester
+ "success": bool # whether the code and tests meet the user request
+ }}
+ """

vision_agent/tools/tool_utils.py:
@@ -8,7 +8,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey

  _LOGGER = logging.getLogger(__name__)
  _LND_API_KEY = LandingaiAPIKey().api_key
- _LND_API_URL = "https://api.dev.landing.ai/v1/agent"
+ _LND_API_URL = "https://api.staging.landing.ai/v1/agent"


  def _send_inference_request(

vision_agent/tools/tools.py:
@@ -53,7 +53,7 @@ class NoOp(Tool):


  class CLIP(Tool):
-     r"""CLIP is a tool that can classify or tag any image given a set if input classes
+     r"""CLIP is a tool that can classify or tag any image given a set of input classes
      or tags.

      Example

vision_agent/tools/tools_v2.py:
@@ -15,7 +15,14 @@ from scipy.spatial import distance  # type: ignore

  from vision_agent.tools.tool_utils import _send_inference_request
  from vision_agent.utils import extract_frames_from_video
- from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
+ from vision_agent.utils.image_utils import (
+     b64_to_pil,
+     convert_to_b64,
+     denormalize_bbox,
+     get_image_size,
+     normalize_bbox,
+     rle_decode,
+ )

  COLORS = [
      (158, 218, 229),
@@ -49,7 +56,7 @@ def grounding_dino(
      prompt: str,
      image: np.ndarray,
      box_threshold: float = 0.20,
-     iou_threshold: float = 0.75,
+     iou_threshold: float = 0.20,
  ) -> List[Dict[str, Any]]:
      """'grounding_dino' is a tool that can detect and count objects given a text prompt
      such as category names or referring expressions. It returns a list and count of
@@ -61,12 +68,13 @@ def grounding_dino(
          box_threshold (float, optional): The threshold for the box detection. Defaults
              to 0.20.
          iou_threshold (float, optional): The threshold for the Intersection over Union
-             (IoU). Defaults to 0.75.
+             (IoU). Defaults to 0.20.

      Returns:
          List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
              bounding box of the detected objects with normalized coordinates
-             (x1, y1, x2, y2).
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+             xmax and ymax are the coordinates of the bottom-right of the bounding box.

      Example
      -------
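
To make the updated return contract concrete, here is a small hypothetical sketch (the image path and the "bbox"/"label"/"score" keys are assumptions for illustration, following the docstring above):

```python
# Hypothetical sketch of consuming grounding_dino output under the updated docstring:
# boxes are normalized (xmin, ymin, xmax, ymax), so they must be scaled by the image
# size before drawing or measuring. "street.jpg" is a made-up input.
import numpy as np
from PIL import Image

from vision_agent.tools.tools_v2 import grounding_dino

image = np.array(Image.open("street.jpg"))
height, width = image.shape[:2]

for det in grounding_dino("car", image):  # iou_threshold now defaults to 0.20
    xmin, ymin, xmax, ymax = det["bbox"]
    print(det["label"], det["score"], (xmin * width, ymin * height, xmax * width, ymax * height))
```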

@@ -77,7 +85,7 @@ def grounding_dino(
      ]
      """
      image_size = image.shape[:2]
-     image_b64 = convert_to_b64(Image.fromarray(image))
+     image_b64 = convert_to_b64(image)
      request_data = {
          "prompt": prompt,
          "image": image_b64,
@@ -101,7 +109,7 @@ def grounding_sam(
      prompt: str,
      image: np.ndarray,
      box_threshold: float = 0.20,
-     iou_threshold: float = 0.75,
+     iou_threshold: float = 0.20,
  ) -> List[Dict[str, Any]]:
      """'grounding_sam' is a tool that can detect and segment objects given a text
      prompt such as category names or referring expressions. It returns a list of
@@ -113,12 +121,15 @@ def grounding_sam(
          box_threshold (float, optional): The threshold for the box detection. Defaults
              to 0.20.
          iou_threshold (float, optional): The threshold for the Intersection over Union
-             (IoU). Defaults to 0.75.
+             (IoU). Defaults to 0.20.

      Returns:
          List[Dict[str, Any]]: A list of dictionaries containing the score, label,
              bounding box, and mask of the detected objects with normalized coordinates
-             (x1, y1, x2, y2).
+             (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
+             xmax and ymax are the coordinates of the bottom-right of the bounding box.
+             The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+             the background.

      Example
      -------
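
The mask format called out above lends itself to simple pixel arithmetic; a hypothetical sketch (assuming the mask is returned under a "mask" key alongside "label" and "score"):

```python
# Hypothetical sketch using the documented grounding_sam mask format: each mask is a
# binary 2D array (1 = object, 0 = background), so object areas can be summed directly.
# The image path, prompt, and the "mask"/"label"/"score" keys are assumptions.
import numpy as np
from PIL import Image

from vision_agent.tools.tools_v2 import grounding_sam

image = np.array(Image.open("street.jpg"))
for det in grounding_sam("person", image):
    area_px = int(det["mask"].sum())  # number of object pixels
    print(det["label"], det["score"], area_px)
```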

@@ -137,7 +148,7 @@ def grounding_sam(
      ]
      """
      image_size = image.shape[:2]
-     image_b64 = convert_to_b64(Image.fromarray(image))
+     image_b64 = convert_to_b64(image)
      request_data = {
          "prompt": prompt,
          "image": image_b64,
@@ -235,6 +246,152 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
      return output


+ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+     """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content.
+     It returns only the count of the objects in the image.
+
+     Parameters:
+         image (np.ndarray): The image that contains lot of instances of a single object
+
+     Returns:
+         Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+     Example
+     -------
+     >>> zero_shot_counting(image)
+     {'count': 45},
+
+     """
+
+     image_b64 = convert_to_b64(image)
+     data = {
+         "image": image_b64,
+         "tool": "zero_shot_counting",
+     }
+     resp_data = _send_inference_request(data, "tools")
+     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+     return resp_data
+
+
+ def visual_prompt_counting(
+     image: np.ndarray, visual_prompt: Dict[str, List[float]]
+ ) -> Dict[str, Any]:
+     """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object.
+     It returns only the count of the objects in the image.
+
+     Parameters:
+         image (np.ndarray): The image that contains lot of instances of a single object
+
+     Returns:
+         Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}.
+
+     Example
+     -------
+     >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+     {'count': 45},
+
+     """
+
+     image_size = get_image_size(image)
+     bbox = visual_prompt["bbox"]
+     bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
+     image_b64 = convert_to_b64(image)
+
+     data = {
+         "image": image_b64,
+         "prompt": bbox_str,
+         "tool": "few_shot_counting",
+     }
+     resp_data = _send_inference_request(data, "tools")
+     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+     return resp_data
+
+
+ def image_question_answering(image: np.ndarray, prompt: str) -> str:
+     """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image.
+     It returns an answer to the question
+
+     Parameters:
+         image (np.ndarray): The reference image used for the question
+         prompt (str): The question about the image
+
+     Returns:
+         str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}.
+
+     Example
+     -------
+     >>> image_question_answering(image, 'What is the cat doing ?')
+     'drinking milk'
+
+     """
+
+     image_b64 = convert_to_b64(image)
+     data = {
+         "image": image_b64,
+         "prompt": prompt,
+         "tool": "image_question_answering",
+     }
+
+     answer = _send_inference_request(data, "tools")
+     return answer["text"][0]  # type: ignore
+
+
+ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
+     """'clip' is a tool that can classify an image given a list of input classes or tags.
+     It returns the same list of the input classes along with their probability scores based on image content.
+
+     Parameters:
+         image (np.ndarray): The image to classify or tag
+         classes (List[str]): The list of classes or tags that is associated with the image
+
+     Returns:
+         Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores.
+
+     Example
+     -------
+     >>> clip(image, ['dog', 'cat', 'bird'])
+     {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
+
+     """
+
+     image_b64 = convert_to_b64(image)
+     data = {
+         "prompt": ",".join(classes),
+         "image": image_b64,
+         "tool": "closed_set_image_classification",
+     }
+     resp_data = _send_inference_request(data, "tools")
+     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+     return resp_data
+
+
+ def image_caption(image: np.ndarray) -> str:
+     """'image_caption' is a tool that can caption an image based on its contents.
+     It returns a text describing the image.
+
+     Parameters:
+         image (np.ndarray): The image to caption
+
+     Returns:
+         str: A string which is the caption for the given image.
+
+     Example
+     -------
+     >>> image_caption(image)
+     'This image contains a cat sitting on a table with a bowl of milk.'
+
+     """
+
+     image_b64 = convert_to_b64(image)
+     data = {
+         "image": image_b64,
+         "tool": "image_captioning",
+     }
+
+     answer = _send_inference_request(data, "tools")
+     return answer["text"][0]  # type: ignore
+
+
  def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
      """'closest_mask_distance' calculates the closest distance between two masks.


@@ -504,6 +661,11 @@ TOOLS = [
      grounding_sam,
      extract_frames,
      ocr,
+     clip,
+     zero_shot_counting,
+     visual_prompt_counting,
+     image_question_answering,
+     image_caption,
      closest_mask_distance,
      closest_box_distance,
      save_json,

vision_agent/utils/execute.py:
@@ -4,6 +4,7 @@
  import base64 as b64
  import io
  import re
+ from time import sleep
  from typing import Dict, List, Tuple

  import nbformat
@@ -75,6 +76,7 @@ class Execute:
          self.terminate()
          self.nb = nbformat.v4.new_notebook()
          self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+         sleep(1)
          self.build()

      def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
@@ -83,6 +85,7 @@
              return parse_outputs(self.nb.cells[-1].outputs)
          except CellTimeoutError:
              run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+             sleep(1)
              return False, "Cell execution timed out."
          except DeadKernelError:
              self.reset()

vision_agent/utils/image_utils.py:
@@ -104,15 +104,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
      """
      if data is None:
          raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
+
      if isinstance(data, (str, Path)):
          data = Image.open(data)
+     elif isinstance(data, np.ndarray):
+         data = Image.fromarray(data)
+
      if isinstance(data, Image.Image):
          buffer = BytesIO()
          data.convert("RGB").save(buffer, format="PNG")
          return base64.b64encode(buffer.getvalue()).decode("utf-8")
      else:
-         arr_bytes = data.tobytes()
-         return base64.b64encode(arr_bytes).decode("utf-8")
+         raise ValueError(
+             f"Invalid input image: {data}. Input image must be a PIL Image or a numpy array."
+         )


  def denormalize_bbox(
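
The behavioral effect of this change, sketched below: numpy arrays are now PNG-encoded via PIL rather than base64-encoding raw array bytes, and unsupported inputs raise ValueError.

```python
# Sketch of the new convert_to_b64 behavior after this release: numpy arrays are
# routed through PIL and saved as PNG before base64, and unsupported input types
# raise ValueError instead of being encoded from raw bytes.
import numpy as np

from vision_agent.utils.image_utils import convert_to_b64

frame = np.zeros((4, 4, 3), dtype=np.uint8)
print(convert_to_b64(frame)[:16])  # base64 of a PNG, not of frame.tobytes()

try:
    convert_to_b64(12345)  # not a path, PIL image, or numpy array
except ValueError as exc:
    print(exc)
```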
vision_agent/utils/type_defs.py:
@@ -12,7 +12,7 @@ class LandingaiAPIKey(BaseSettings):
      """

      api_key: str = Field(
-         default="land_sk_PCRPYKqB3cq0JWGY83hjEk33SWSDOwdNoyUjTgCDMZO4NxeCXW",
+         default="land_sk_IJrojHarPXRjqDj1Fng76mX7yCbzVm1s5rZYxaNXu5v0cNLn0w",
          alias="LANDINGAI_API_KEY",
          description="The API key of LandingAI.",
      )

vision_agent-0.2.26.dist-info/METADATA:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.24
+ Version: 0.2.26
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai

vision_agent-0.2.26.dist-info/RECORD:
@@ -1,5 +1,5 @@
  vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
- vision_agent/agent/__init__.py,sha256=Zv8lc91mPy0iDySId38_vc4mo56JQ9mCMvUWdAKQjh0,206
+ vision_agent/agent/__init__.py,sha256=jpmL6z5e4PFfQM21JbSsRwcERRXn58XFmURAMwWeoRM,249
  vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
  vision_agent/agent/agent_coder.py,sha256=4iB732bX4wDnPAuyYBk6HWlf4aFq2l9EcL695qfDIXw,7004
  vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
@@ -9,8 +9,10 @@ vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6w
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
  vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
  vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
- vision_agent/agent/vision_agent_v2.py,sha256=3qjvaj-yyrXmoY_cecUsiuY4Rn6MmJanFZeoXFJRK2c,13229
- vision_agent/agent/vision_agent_v2_prompt.py,sha256=b_0BMq6GrbGfl09MHrv4mj-mqyE1FxMl3Xq44qD4S1E,6161
+ vision_agent/agent/vision_agent_v2.py,sha256=XQy9Bh-nMiBOmld4ufz-SGcGQ0ab9S060mYITsrnQEg,13230
+ vision_agent/agent/vision_agent_v2_prompts.py,sha256=b_0BMq6GrbGfl09MHrv4mj-mqyE1FxMl3Xq44qD4S1E,6161
+ vision_agent/agent/vision_agent_v3.py,sha256=EGA3zQKVIVdDlZOWwZNgueMnlqKqNwGvSc9v_XM-b34,9696
+ vision_agent/agent/vision_agent_v3_prompts.py,sha256=LRZBKObeb0Bs48vo7vtB2M8loPO1lQzruH-3IiMS5ts,7484
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
@@ -19,16 +21,16 @@ vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,
  vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
  vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
- vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
- vision_agent/tools/tools_v2.py,sha256=iO-ochdLq73xdCRUY1MKixHyVAk6UIUrY648MtjlHno,16201
+ vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
+ vision_agent/tools/tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
+ vision_agent/tools/tools_v2.py,sha256=3Bv1xuZFoPjaCb-VixF5Vl3uoyac03571FXUzBI8FBQ,21404
  vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
- vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
- vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+ vision_agent/utils/execute.py,sha256=8_SfK-IkHH4lXF0JVyV7sDFszZn9HKsh1bFITKGCJ1g,3881
+ vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
  vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
- vision_agent/utils/type_defs.py,sha256=ijFAd7D0y8JOg0Ib063rqsDcrFtZfQbdqpaRPTmp2hY,1792
+ vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
  vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
- vision_agent-0.2.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.24.dist-info/METADATA,sha256=G4bq69V2-eRKNSWwx0skCfU60iiCUQf5l37B9O49Bkk,9212
- vision_agent-0.2.24.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.24.dist-info/RECORD,,
+ vision_agent-0.2.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.26.dist-info/METADATA,sha256=4iVEn5ndUUHrEduVwmUy8IJ09YKp6k9BRSrCdpSaUtA,9212
+ vision_agent-0.2.26.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.26.dist-info/RECORD,,