vision-agent 0.2.15__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
vision_agent/agent/vision_agent.py CHANGED
@@ -308,7 +308,7 @@ def _handle_extract_frames(
     # any following processing
     for video_file_output in tool_result["call_results"]:
         # When the video tool is run with wrong parameters, exit the loop
-        if len(video_file_output) < 2:
+        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
             break
         for frame, _ in video_file_output:
             image = frame
@@ -464,15 +464,17 @@ class VisionAgent(Agent):
             report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
         """
         self.task_model = (
-            OpenAILLM(json_mode=True, temperature=0.1)
+            OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if task_model is None
             else task_model
         )
         self.answer_model = (
-            OpenAILLM(temperature=0.1) if answer_model is None else answer_model
+            OpenAILLM(model_name="gpt-4-turbo", temperature=0.0)
+            if answer_model is None
+            else answer_model
         )
         self.reflect_model = (
-            OpenAILMM(json_mode=True, temperature=0.1)
+            OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if reflect_model is None
             else reflect_model
         )
vision_agent/agent/vision_agent_v2.py CHANGED
@@ -1,8 +1,9 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

+import pandas as pd
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -20,6 +21,7 @@ from vision_agent.agent.vision_agent_v2_prompt import (
     TEST,
     USER_REQ_CONTEXT,
     USER_REQ_SUBTASK_CONTEXT,
+    USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
@@ -31,28 +33,68 @@ _MAX_TABULATE_COL_WIDTH = 80
 _CONSOLE = Console()


+def build_working_memory(working_memory: Mapping[str, List[str]]) -> Sim:
+    data: Mapping[str, List[str]] = {"desc": [], "doc": []}
+    for key, value in working_memory.items():
+        data["desc"].append(key)
+        data["doc"].append("\n".join(value))
+    df = pd.DataFrame(data)  # type: ignore
+    return Sim(df, sim_key="desc")
+
+
 def extract_code(code: str) -> str:
     if "```python" in code:
         code = code[code.find("```python") + len("```python") :]
         code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
     return code


+def extract_json(json_str: str) -> Dict[str, Any]:
+    try:
+        json_dict = json.loads(json_str)
+    except json.JSONDecodeError:
+        if "```json" in json_str:
+            json_str = json_str[json_str.find("```json") + len("```json") :]
+            json_str = json_str[: json_str.find("```")]
+        elif "```" in json_str:
+            json_str = json_str[json_str.find("```") + len("```") :]
+            # get the last ``` not one from an intermediate string
+            json_str = json_str[: json_str.find("}```")]
+        json_dict = json.loads(json_str)
+    return json_dict  # type: ignore
+
+
 def write_plan(
-    user_requirements: str, tool_desc: str, model: LLM
-) -> List[Dict[str, Any]]:
+    chat: List[Dict[str, str]],
+    plan: Optional[List[Dict[str, Any]]],
+    tool_desc: str,
+    model: LLM,
+) -> Tuple[str, List[Dict[str, Any]]]:
+    # Get last user request
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+    user_requirements = chat[-1]["content"]
+
     context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
-    prompt = PLAN.format(context=context, plan="", tool_desc=tool_desc)
-    plan = json.loads(model(prompt).replace("```", "").strip())
-    return plan["plan"]  # type: ignore
+    prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
+    chat[-1]["content"] = prompt
+    new_plan = extract_json(model.chat(chat))
+    return new_plan["user_req"], new_plan["plan"]


 def write_code(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str,
+    subtask: str,
+    working_memory: str,
+    tool_info: str,
+    code: str,
+    model: LLM,
 ) -> str:
     prompt = CODE.format(
-        context=USER_REQ_SUBTASK_CONTEXT.format(
-            user_requirement=user_req, subtask=subtask
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req, working_memory=working_memory, subtask=subtask
         ),
         tool_info=tool_info,
         code=code,
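
Note: the new extract_json helper makes plan parsing tolerant of models that wrap their JSON in markdown fences, replacing the old json.loads(model(prompt).replace("```", "").strip()) pattern. A minimal usage sketch (assuming the helper is imported from the module this diff modifies):

from vision_agent.agent.vision_agent_v2 import extract_json

# plain JSON parses directly
assert extract_json('{"plan": []}') == {"plan": []}

# JSON wrapped in a ```json fence is unwrapped before parsing
fenced = '```json\n{"user_req": "count the cars", "plan": []}\n```'
assert extract_json(fenced) == {"user_req": "count the cars", "plan": []}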
@@ -66,7 +108,7 @@ def write_code(


 def write_test(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str, subtask: str, tool_info: str, _: str, code: str, model: LLM
 ) -> str:
     prompt = TEST.format(
         context=USER_REQ_SUBTASK_CONTEXT.format(
@@ -83,20 +125,30 @@ def write_test(
     return extract_code(code)


-def debug_code(sub_task: str, working_memory: List[str], model: LLM) -> Tuple[str, str]:
+def debug_code(
+    user_req: str,
+    subtask: str,
+    retrieved_ltm: str,
+    working_memory: str,
+    model: LLM,
+) -> Tuple[str, str]:
     # Make debug model output JSON
     if hasattr(model, "kwargs"):
         model.kwargs["response_format"] = {"type": "json_object"}
     prompt = DEBUG.format(
         debug_example=DEBUG_EXAMPLE,
-        context=USER_REQ_CONTEXT.format(user_requirement=sub_task),
-        previous_impl="\n".join(working_memory),
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req,
+            subtask=subtask,
+            working_memory=retrieved_ltm,
+        ),
+        previous_impl=working_memory,
     )
     messages = [
         {"role": "system", "content": DEBUG_SYS_MSG},
         {"role": "user", "content": prompt},
     ]
-    code_and_ref = json.loads(model.chat(messages).replace("```", "").strip())
+    code_and_ref = extract_json(model.chat(messages))
     if hasattr(model, "kwargs"):
         del model.kwargs["response_format"]
     return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
@@ -106,20 +158,25 @@ def write_and_exec_code(
     user_req: str,
     subtask: str,
     orig_code: str,
-    code_writer_call: Callable,
+    code_writer_call: Callable[..., str],
     model: LLM,
     tool_info: str,
     exec: Execute,
+    retrieved_ltm: str,
     max_retry: int = 3,
-    verbose: bool = False,
+    verbosity: int = 0,
 ) -> Tuple[bool, str, str, Dict[str, List[str]]]:
     success = False
     counter = 0
     reflection = ""

-    # TODO: add working memory to code_writer_call and debug_code
-    code = code_writer_call(user_req, subtask, tool_info, orig_code, model)
+    code = code_writer_call(
+        user_req, subtask, retrieved_ltm, tool_info, orig_code, model
+    )
     success, result = exec.run_isolation(code)
+    if verbosity == 2:
+        _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
+        _LOGGER.info(f"\tCode success: {success}, result: {str(result)}")
     working_memory: Dict[str, List[str]] = {}
     while not success and counter < max_retry:
         if subtask not in working_memory:
@@ -136,19 +193,21 @@ def write_and_exec_code(
                 PREV_CODE_CONTEXT.format(code=code, result=result)
             )

-        code, reflection = debug_code(subtask, working_memory[subtask], model)
+        code, reflection = debug_code(
+            user_req, subtask, retrieved_ltm, "\n".join(working_memory[subtask]), model
+        )
         success, result = exec.run_isolation(code)
         counter += 1
-        if verbose:
+        if verbosity == 2:
             _CONSOLE.print(
                 Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
             )
-        _LOGGER.info(f"\tDebugging reflection, result: {reflection}, {result}")
+        _LOGGER.info(f"\tDebugging reflection: {reflection}, result: {result}")

     if success:
         working_memory[subtask].append(
             PREV_CODE_CONTEXT_WITH_REFLECTION.format(
-                code=code, result=result, reflection=reflection
+                reflection=reflection, code=code, result=result
             )
         )

@@ -162,12 +221,15 @@ def run_plan(
     exec: Execute,
     code: str,
     tool_recommender: Sim,
-    verbose: bool = False,
+    long_term_memory: Optional[Sim] = None,
+    verbosity: int = 0,
 ) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
     active_plan = [e for e in plan if "success" not in e or not e["success"]]
-    working_memory: Dict[str, List[str]] = {}
     current_code = code
     current_test = ""
+    retrieved_ltm = ""
+    working_memory: Dict[str, List[str]] = {}
+
     for task in active_plan:
         _LOGGER.info(
             f"""
@@ -176,7 +238,13 @@ def run_plan(
         tool_info = "\n".join(
             [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
         )
-        success, code, result, task_memory = write_and_exec_code(
+
+        if long_term_memory is not None:
+            retrieved_ltm = "\n".join(
+                [e["doc"] for e in long_term_memory.top_k(task["instruction"], 1)]
+            )
+
+        success, code, result, working_memory_i = write_and_exec_code(
             user_req,
             task["instruction"],
             current_code,
@@ -184,20 +252,21 @@ def run_plan(
             coder,
             tool_info,
             exec,
-            verbose,
+            retrieved_ltm,
+            verbosity=verbosity,
         )
         if task["type"] == "code":
             current_code = code
         else:
             current_test = code

-        working_memory.update(task_memory)
+        working_memory.update(working_memory_i)

-        if verbose:
+        if verbosity == 1:
             _CONSOLE.print(
                 Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
             )
-        _LOGGER.info(f"\tCode success, result: {success}, {str(result)}")
+        _LOGGER.info(f"\tCode success: {success} result: {str(result)}")

         task["success"] = success
         task["result"] = result
@@ -231,53 +300,71 @@ class VisionAgentV2(Agent):
         self,
         timeout: int = 600,
         tool_recommender: Optional[Sim] = None,
-        verbose: bool = False,
+        long_term_memory: Optional[Sim] = None,
+        verbosity: int = 0,
     ) -> None:
-        self.planner = OpenAILLM(temperature=0.1, json_mode=True)
-        self.coder = OpenAILLM(temperature=0.1)
+        self.planner = OpenAILLM(temperature=0.0, json_mode=True)
+        self.coder = OpenAILLM(temperature=0.0)
         self.exec = Execute(timeout=timeout)
         if tool_recommender is None:
             self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
         else:
             self.tool_recommender = tool_recommender
-        self.verbose = verbose
-        if self.verbose:
+        self.verbosity = verbosity
+        self._working_memory: Dict[str, List[str]] = {}
+        if long_term_memory is not None:
+            if "doc" not in long_term_memory.df.columns:
+                raise ValueError("Long term memory must have a 'doc' column.")
+        self.long_term_memory = long_term_memory
+        self.max_retries = 3
+        if self.verbosity:
             _LOGGER.setLevel(logging.INFO)

     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        code, _ = self.chat_with_tests(input, image)
-        return code
+        results = self.chat_with_workflow(input, image, plan)
+        return results["code"]  # type: ignore

-    def chat_with_tests(
+    def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
-    ) -> Tuple[str, str]:
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")

-        user_req = chat[0]["content"]
         if image is not None:
-            user_req += f" Image name {image}"
+            # append file names to all user messages
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+
+        working_code = ""
+        if plan is not None:
+            # grab the latest working code from a previous plan
+            for task in plan:
+                if "success" in task and "code" in task and task["success"]:
+                    working_code = task["code"]

-        plan = write_plan(user_req, TOOL_DESCRIPTIONS, self.planner)
+        user_req, plan = write_plan(chat, plan, TOOL_DESCRIPTIONS, self.planner)
         _LOGGER.info(
             f"""Plan:
{tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
-        working_memory: Dict[str, List[str]] = {}

-        working_code = ""
         working_test = ""
+        working_memory: Dict[str, List[str]] = {}
         success = False
+        retries = 0

-        while not success:
+        while not success and retries < self.max_retries:
             working_code, working_test, plan, working_memory_i = run_plan(
                 user_req,
                 plan,
@@ -285,16 +372,25 @@ class VisionAgentV2(Agent):
                 self.exec,
                 working_code,
                 self.tool_recommender,
-                self.verbose,
+                self.long_term_memory,
+                self.verbosity,
             )
             success = all(task["success"] for task in plan)
             working_memory.update(working_memory_i)

             if not success:
-                # TODO: ask for feedback and replan
+                # return to user and request feedback
                 break

-        return working_code, working_test
+            retries += 1
+
+        return {
+            "code": working_code,
+            "test": working_test,
+            "success": success,
+            "working_memory": build_working_memory(working_memory),
+            "plan": plan,
+        }

     def log_progress(self, description: str) -> None:
         pass
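
Taken together, these changes replace chat_with_tests's (code, test) tuple with a result dict that also carries the executed plan and a Sim index of the working memory, both of which can be fed back in for another round. A usage sketch, not from the package docs: the import path and image name are illustrative, and an OpenAI API key must be configured for the underlying models.

from vision_agent.agent.vision_agent_v2 import VisionAgentV2

agent = VisionAgentV2(verbosity=1)
chat = [{"role": "user", "content": "Count the cars in the image"}]
results = agent.chat_with_workflow(chat, image="cars.jpg")

# on failure, the returned plan can seed a revision, and the returned
# working memory can serve as long-term memory for a fresh agent
if not results["success"]:
    agent = VisionAgentV2(long_term_memory=results["working_memory"])
    results = agent.chat_with_workflow(chat, image="cars.jpg", plan=results["plan"])

print(results["code"])
print(results["test"])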
vision_agent/agent/vision_agent_v2_prompt.py CHANGED
@@ -1,3 +1,8 @@
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
 USER_REQ_SUBTASK_CONTEXT = """
 ## User Requirement
 {user_requirement}
@@ -6,11 +11,16 @@ USER_REQ_SUBTASK_CONTEXT = """
 {subtask}
 """

-USER_REQ_CONTEXT = """
+USER_REQ_SUBTASK_WM_CONTEXT = """
 ## User Requirement
 {user_requirement}
-"""

+## Current Subtask
+{subtask}
+
+## Previous Task
+{working_memory}
+"""

 PLAN = """
 # Context
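
The new working-memory context slots retrieved long-term memory in after the subtask. A quick sketch of how write_code and debug_code fill it (the requirement and subtask strings are illustrative):

from vision_agent.agent.vision_agent_v2_prompt import USER_REQ_SUBTASK_WM_CONTEXT

context = USER_REQ_SUBTASK_WM_CONTEXT.format(
    user_requirement="Count the cars in the image",  # hypothetical request
    subtask="Detect cars with grounding_dino_",      # hypothetical subtask
    working_memory="",  # retrieved long-term memory; empty on a first attempt
)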
@@ -27,11 +37,13 @@ Based on the context and the tools you have available, write a plan of subtasks
 - For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+- If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.

 Output a list of jsons in the following format:

 ```json
 {{
+    "user_req": str, # "a summarized version of the user requirement"
     "plan":
         [
             {{
@@ -61,8 +73,9 @@ CODE = """
 {code}

 # Constraints
-- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
-- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
 - Write clean, readable, and well-documented code.

 # Output
@@ -102,6 +115,7 @@ def add(a: int, b: int) -> int:


 PREV_CODE_CONTEXT = """
+[previous impl]
 ```python
 {code}
 ```
@@ -112,18 +126,20 @@ PREV_CODE_CONTEXT = """


 PREV_CODE_CONTEXT_WITH_REFLECTION = """
+[reflection on previous impl]
+{reflection}
+
+[new impl]
 ```python
 {code}
 ```

-[previous output]
+[new output]
 {result}

-[reflection on previous impl]
-{reflection}
 """

-
+# don't need [previous impl] because it will come from PREV_CODE_CONTEXT or PREV_CODE_CONTEXT_WITH_REFLECTION
 DEBUG = """
 [example]
 Here is an example of debugging with reflection.
@@ -133,7 +149,6 @@ Here is an example of debugging with reflection.
 [context]
 {context}

-[previous impl]
 {previous_impl}

 [instruction]
@@ -158,7 +173,7 @@ TEST = """
 {code}

 # Constraints
-- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
+- Write code to test the functionality of the provided code according to the 'Current Subtask'. If you cannot test the code, then write code to visualize the result by calling the code.
 - Always prioritize using pre-defined tools for the same functionality.
 - Write clean, readable, and well-documented code.

vision_agent/llm/llm.py CHANGED
@@ -34,7 +34,7 @@ class OpenAILLM(LLM):

     def __init__(
         self,
-        model_name: str = "gpt-4-turbo",
+        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        json_mode: bool = False,
        system_prompt: Optional[str] = None,
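
Note the interplay with the VisionAgent changes above: since the bare OpenAILLM() default moves from gpt-4-turbo to gpt-4o, the v1 agent now pins model_name="gpt-4-turbo" explicitly. A sketch of the two spellings:

from vision_agent.llm import OpenAILLM

llm = OpenAILLM()                             # 0.2.22 default: "gpt-4o"
legacy = OpenAILLM(model_name="gpt-4-turbo")  # the 0.2.15 default, still selectable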
vision_agent/tools/__init__.py CHANGED
@@ -3,7 +3,7 @@ from .tools import (  # Counter,
     CLIP,
     OCR,
     TOOLS,
-    BboxArea,
+    BboxStats,
     BboxIoU,
     BoxDistance,
     Crop,
vision_agent/tools/tool_utils.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Any, Dict

 import requests
@@ -13,6 +14,8 @@ _LND_API_URL = "https://api.dev.landing.ai/v1/agent"
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
+    if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
+        payload["runtime_tag"] = runtime_tag
     res = requests.post(
         f"{_LND_API_URL}/model/{endpoint_name}",
         headers={
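
With this change, every hosted-tool request is tagged whenever a RUNTIME_TAG environment variable is set. A sketch of the effect on the payload (the tag value is illustrative, and the snippet mirrors rather than calls the private helper):

import os

os.environ["RUNTIME_TAG"] = "ci-nightly"  # hypothetical tag

payload = {"image": "..."}
# same logic as the lines added to _send_inference_request
if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
    payload["runtime_tag"] = runtime_tag
assert payload["runtime_tag"] == "ci-nightly"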
vision_agent/tools/tools.py CHANGED
@@ -174,15 +174,15 @@ class GroundingDINO(Tool):
     """

     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -209,7 +209,7 @@ class GroundingDINO(Tool):
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.75,
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -221,7 +221,7 @@ class GroundingDINO(Tool):
         prompt: str,
         image: Union[str, Path, ImageType],
         box_threshold: float = 0.20,
-        iou_threshold: float = 0.75,
+        iou_threshold: float = 0.20,
     ) -> Dict:
         """Invoke the Grounding DINO model.

@@ -249,7 +249,7 @@ class GroundingDINO(Tool):
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
             data["labels"] = list(data["labels"])
-        data["size"] = (image_size[1], image_size[0])
+        data["image_size"] = image_size
         return data


@@ -277,15 +277,15 @@ class GroundingSAM(Tool):
     """

     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -312,7 +312,7 @@ class GroundingSAM(Tool):
                     "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.75,
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -324,7 +324,7 @@ class GroundingSAM(Tool):
         prompt: str,
         image: Union[str, ImageType],
         box_threshold: float = 0.2,
-        iou_threshold: float = 0.75,
+        iou_threshold: float = 0.2,
     ) -> Dict:
         """Invoke the Grounding SAM model.

@@ -353,6 +353,7 @@ class GroundingSAM(Tool):
             rle_decode(mask_rle=mask, shape=data["mask_shape"])
             for mask in data["masks"]
         ]
+        data["image_size"] = image_size
         data.pop("mask_shape", None)
         return data

@@ -434,6 +435,8 @@ class DINOv(Tool):
             for mask in data["masks"]
         ]
         data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        mask_shape = data.pop("mask_shape", None)
+        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
         return data


@@ -789,33 +792,49 @@ class Crop(Tool):
         return {"image": tmp.name}


-class BboxArea(Tool):
-    r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+class BboxStats(Tool):
+    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""

-    name = "bbox_area_"
-    description = "'bbox_area_' returns the area of the given bounding box in pixels normalized to 2 decimal places."
+    name = "bbox_stats_"
+    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
     usage = {
-        "required_parameters": [{"name": "bboxes", "type": "List[int]"}],
+        "required_parameters": [
+            {"name": "bboxes", "type": "List[int]"},
+            {"name": "image_size", "type": "Tuple[int]"},
+        ],
         "examples": [
             {
-                "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
-                "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]},
-            }
+                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (500, 1200),
+                },
+            },
+            {
+                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (640, 480),
+                },
+            },
         ],
     }

-    def __call__(self, bboxes: List[Dict]) -> List[Dict]:
+    def __call__(
+        self, bboxes: List[List[int]], image_size: Tuple[int, int]
+    ) -> List[Dict]:
         areas = []
-        for elt in bboxes:
-            height, width = elt["size"]
-            for label, bbox in zip(elt["labels"], elt["bboxes"]):
-                x1, y1, x2, y2 = bbox
-                areas.append(
-                    {
-                        "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
-                        "label": label,
-                    }
-                )
+        height, width = image_size
+        for bbox in bboxes:
+            x1, y1, x2, y2 = bbox
+            areas.append(
+                {
+                    "width": round((x2 - x1) * width, 2),
+                    "height": round((y2 - y1) * height, 2),
+                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                }
+            )
+
         return areas

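BboxStats denormalizes each box against an explicit (height, width) image size, where BboxArea expected the size to be embedded in each detection dict. A worked sketch using the bbox from the usage example (note the boxes are normalized floats despite the List[int] annotation):

from vision_agent.tools import BboxStats

stats = BboxStats()(bboxes=[[0.2, 0.21, 0.34, 0.42]], image_size=(500, 1200))
# width  = (0.34 - 0.20) * 1200 = 168.0 px
# height = (0.42 - 0.21) * 500  = 105.0 px
# area   = 168.0 * 105.0        = 17640.0 px^2
assert stats == [{"width": 168.0, "height": 105.0, "area": 17640.0}]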
@@ -1054,22 +1073,25 @@ class ExtractFrames(Tool):
     r"""Extract frames from a video."""

     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "optional_parameters": [{"name": "frames_every", "type": "float"}],
         "examples": [
             {
                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
             },
             {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
+                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
             },
         ],
     }

-    def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
+    def __call__(
+        self, video_uri: str, frames_every: float = 2
+    ) -> List[Tuple[str, float]]:
         """Extract frames from a video.

@@ -1079,7 +1101,7 @@ class ExtractFrames(Tool):
         Returns:
             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
         """
-        frames = extract_frames_from_video(video_uri)
+        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
         result = []
         _LOGGER.info(
             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
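
The new frames_every parameter is the sampling period in seconds, converted to the fps argument of extract_frames_from_video as fps = round(1 / frames_every, 2); the default of 2 seconds therefore maps to 0.5 fps. A sketch of the mapping (video path illustrative):

from vision_agent.tools.tools import ExtractFrames

# frames_every=2   -> fps=0.5 (one frame every two seconds, the default)
# frames_every=0.5 -> fps=2.0 (two frames per second)
for frame_path, timestamp in ExtractFrames()("tests/data/test.mp4", frames_every=0.5):
    print(frame_path, timestamp)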
@@ -1182,7 +1204,7 @@ TOOLS = {
     AgentDINOv,
     ExtractFrames,
     Crop,
-    BboxArea,
+    BboxStats,
     SegArea,
     ObjectDistance,
     BboxContains,
vision_agent/utils/__init__.py CHANGED
@@ -1,3 +1,3 @@
 from .execute import Execute
-from .sim import Sim
+from .sim import Sim, load_sim, merge_sim
 from .video import extract_frames_from_video
vision_agent/utils/sim.py CHANGED
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union

+import numpy as np
 import pandas as pd
 from openai import Client
 from scipy.spatial.distance import cosine  # type: ignore
@@ -46,7 +47,14 @@ class Sim:
         )

     def save(self, sim_file: Union[str, Path]) -> None:
-        self.df.to_csv(sim_file, index=False)
+        sim_file = Path(sim_file)
+        sim_file.mkdir(parents=True, exist_ok=True)
+
+        df = self.df.copy()
+        embs = np.array(df.embs.tolist())
+        np.save(sim_file / "embs.npy", embs)
+        df = df.drop("embs", axis=1)
+        df.to_csv(sim_file / "df.csv", index=False)

     def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
@@ -65,6 +73,13 @@ class Sim:
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")


+def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
+    return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
+
+
 def load_sim(sim_file: Union[str, Path]) -> Sim:
-    df = pd.read_csv(sim_file)
+    sim_file = Path(sim_file)
+    df = pd.read_csv(sim_file / "df.csv")
+    embs = np.load(sim_file / "embs.npy")
+    df["embs"] = list(embs)
     return Sim(df)
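
Sim.save now writes a directory holding df.csv plus the raw embeddings in embs.npy, so load_sim can rehydrate an index without re-embedding, and merge_sim concatenates two indexes. A round-trip sketch (paths illustrative; constructing a Sim from fresh data embeds via the OpenAI API, so a key must be configured):

import pandas as pd
from vision_agent.utils import Sim, load_sim, merge_sim

sim = Sim(pd.DataFrame({"desc": ["count cars"], "doc": ["..."]}), sim_key="desc")
sim.save("memory")                   # writes memory/df.csv and memory/embs.npy

restored = load_sim("memory")        # reloads without re-embedding
combined = merge_sim(restored, sim)  # concatenates the underlying dataframes
print(combined.top_k("cars", k=1))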
{vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.15
+Version: 0.2.22
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/RECORD RENAMED
@@ -7,28 +7,28 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=4-GjEX8ZmLhvLebqNRRTSSu1kSaFYVR_wFsrjXgKdYI,26984
+vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
 vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
-vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
+vision_agent/agent/vision_agent_v2.py,sha256=pAOYfNxBVZwnNxyYfv_Bk5dklFr4ougA52ib4q8O4Uo,12942
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=dd9m9Vqp91r4dpsKMDwXr54jG_GTBdJNDzpgR115S8Q,5997
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
+vision_agent/llm/llm.py,sha256=A-gN0vMb79fSxhSK1qBs6PTu1fba9Gvy6pitOyjW2gM,5779
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
-vision_agent/tools/__init__.py,sha256=WiEjXzXyaBq7IQMKOMbFAK3FKvLNfzZ3dd7CPN-d7B8,451
+vision_agent/tools/__init__.py,sha256=p5SM0YhThSVO_jRF9O-OjH2fYDPv-iMjexDX9xPPb7M,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
-vision_agent/tools/tools.py,sha256=WrNu_L5n2cEpe7e1oy8S1o3dy4JJ4AUxTHcjAdX64_g,46398
+vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
+vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
 vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
-vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
+vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
 vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
-vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
+vision_agent/utils/sim.py,sha256=SO4-pj2Fjs3yr-KT8S0nuUd66lf7m7XvMAp7_ecvKuQ,2813
 vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.15.dist-info/METADATA,sha256=qK9rIVOI_IL0dcUcIqtgoRCxuk5GZuQ5HHSrdsuVLKs,9121
-vision_agent-0.2.15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.15.dist-info/RECORD,,
+vision_agent-0.2.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.22.dist-info/METADATA,sha256=hOVbcYSPue2CEdagEkIiX3dGtjip9p1GgKilmPYj-gU,9121
+vision_agent-0.2.22.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.22.dist-info/RECORD,,