vision-agent 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vision_agent/agent/__init__.py CHANGED
@@ -3,3 +3,4 @@ from .agent_coder import AgentCoder
 from .easytool import EasyTool
 from .reflexion import Reflexion
 from .vision_agent import VisionAgent
+from .vision_agent_v2 import VisionAgentV2
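The only change here is the new top-level export. A minimal sketch of what this enables downstream:

```python
# VisionAgentV2 is now importable next to the existing agents.
from vision_agent.agent import VisionAgent, VisionAgentV2
```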
vision_agent/agent/agent_coder.py CHANGED
@@ -6,15 +6,40 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union
 
 from vision_agent.agent import Agent
+from vision_agent.agent.agent_coder_prompts import (
+    DEBUG,
+    FIX_BUG,
+    PROGRAM,
+    TEST,
+    VISUAL_TEST,
+)
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools.tools_v2 import TOOLS_DOCSTRING, UTILITIES_DOCSTRING
-
-from .agent_coder_prompts import DEBUG, FIX_BUG, PROGRAM, TEST, VISUAL_TEST
-from .execution import IMPORT_HELPER, check_correctness
+from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
+from vision_agent.utils import Execute
 
+IMPORT_HELPER = """
+import math
+import re
+import sys
+import copy
+import datetime
+import itertools
+import collections
+import heapq
+import statistics
+import functools
+import hashlib
+import numpy
+import numpy as np
+import string
+from typing import *
+from collections import *
+from vision_agent.tools.tools_v2 import *
+"""
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
+_EXECUTE = Execute()
 
 
 def write_tests(question: str, code: str, model: LLM) -> str:
@@ -40,7 +65,7 @@ def parse_file_name(s: str) -> str:
 
 def write_program(question: str, feedback: str, model: LLM) -> str:
     prompt = PROGRAM.format(
-        docstring=TOOLS_DOCSTRING, question=question, feedback=feedback
+        docstring=TOOL_DOCSTRING, question=question, feedback=feedback
     )
     completion = model(prompt)
     return preprocess_data(completion)
@@ -59,14 +84,15 @@ def write_debug(question: str, code: str, feedback: str, model: LLM) -> str:
 
 def execute_tests(code: str, tests: str) -> Dict[str, Union[str, bool]]:
     full_code = f"{IMPORT_HELPER}\n{code}\n{tests}"
-    return check_correctness(full_code, 20.0)
+    success, result = _EXECUTE.run_isolation(full_code)
+    return {"code": code, "result": result, "passed": success}
 
 
 def run_visual_tests(
     question: str, code: str, viz_file: str, feedback: str, model: LMM
 ) -> Dict[str, Union[str, bool]]:
     prompt = VISUAL_TEST.format(
-        docstring=TOOLS_DOCSTRING,
+        docstring=TOOL_DOCSTRING,
        code=code,
        question=question,
        feedback=feedback,
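These hunks replace the module-local `check_correctness` helper with the shared `Execute` runner from `vision_agent.utils`. Judging from its use in `execute_tests`, `run_isolation` returns a `(success, output)` pair; a minimal sketch of the new contract (the sample code string is illustrative):

```python
from vision_agent.utils import Execute

executor = Execute()
success, result = executor.run_isolation("print(1 + 1)")
# execute_tests repackages the pair into the dict shape callers expect:
report = {"code": "print(1 + 1)", "result": result, "passed": success}
```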
vision_agent/agent/vision_agent.py CHANGED
@@ -8,18 +8,8 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from PIL import Image
 from tabulate import tabulate
 
-from vision_agent.image_utils import (
-    convert_to_b64,
-    overlay_bboxes,
-    overlay_heat_map,
-    overlay_masks,
-)
-from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.lmm import LMM, OpenAILMM
-from vision_agent.tools import TOOLS
-
-from .agent import Agent
-from .easytool_prompts import (
+from vision_agent.agent.agent import Agent
+from vision_agent.agent.easytool_prompts import (
     ANSWER_GENERATE,
     ANSWER_SUMMARIZE,
     CHOOSE_PARAMETER,
@@ -27,7 +17,7 @@ from .easytool_prompts import (
     TASK_DECOMPOSE,
     TASK_TOPOLOGY,
 )
-from .vision_agent_prompts import (
+from vision_agent.agent.vision_agent_prompts import (
     ANSWER_GENERATE_DEPENDS,
     ANSWER_SUMMARIZE_DEPENDS,
     CHOOSE_PARAMETER_DEPENDS,
@@ -35,6 +25,15 @@ from .vision_agent_prompts import (
     TASK_DECOMPOSE_DEPENDS,
     VISION_AGENT_REFLECTION,
 )
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.tools import TOOLS
+from vision_agent.utils.image_utils import (
+    convert_to_b64,
+    overlay_bboxes,
+    overlay_heat_map,
+    overlay_masks,
+)
 
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
@@ -309,7 +308,7 @@ def _handle_extract_frames(
     # any following processing
     for video_file_output in tool_result["call_results"]:
         # When the video tool is run with wrong parameters, exit the loop
-        if len(video_file_output) < 2:
+        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
             break
         for frame, _ in video_file_output:
             image = frame
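The sharper guard matters because a failed tool call can put something other than frame tuples into `call_results`, and a length check alone lets, say, an error string slip through to the unpacking loop below. A small illustration (the error string is hypothetical):

```python
bad_output = "invalid video parameters"  # hypothetical failed tool result
len(bad_output) < 2                      # False: the old guard would not break
not isinstance(bad_output, tuple) or len(bad_output) < 2  # True: the new guard exits
```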
@@ -561,6 +560,9 @@ class VisionAgent(Agent):
             list of all the tool results. The last item in the tool results also
             contains the visualized output.
         """
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
vision_agent/agent/vision_agent_v2.py ADDED
@@ -0,0 +1,300 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from rich.console import Console
+from rich.syntax import Syntax
+from tabulate import tabulate
+
+from vision_agent.agent import Agent
+from vision_agent.agent.vision_agent_v2_prompt import (
+    CODE,
+    CODE_SYS_MSG,
+    DEBUG,
+    DEBUG_EXAMPLE,
+    DEBUG_SYS_MSG,
+    PLAN,
+    PREV_CODE_CONTEXT,
+    PREV_CODE_CONTEXT_WITH_REFLECTION,
+    TEST,
+    USER_REQ_CONTEXT,
+    USER_REQ_SUBTASK_CONTEXT,
+)
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
+from vision_agent.utils import Execute, Sim
+
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
+_MAX_TABULATE_COL_WIDTH = 80
+_CONSOLE = Console()
+
+
+def extract_code(code: str) -> str:
+    if "```python" in code:
+        code = code[code.find("```python") + len("```python") :]
+        code = code[: code.find("```")]
+    return code
+
+
+def write_plan(
+    user_requirements: str, tool_desc: str, model: LLM
+) -> List[Dict[str, Any]]:
+    context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
+    prompt = PLAN.format(context=context, plan="", tool_desc=tool_desc)
+    plan = json.loads(model(prompt).replace("```", "").strip())
+    return plan["plan"]  # type: ignore
+
+
+def write_code(
+    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+) -> str:
+    prompt = CODE.format(
+        context=USER_REQ_SUBTASK_CONTEXT.format(
+            user_requirement=user_req, subtask=subtask
+        ),
+        tool_info=tool_info,
+        code=code,
+    )
+    messages = [
+        {"role": "system", "content": CODE_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code = model.chat(messages)
+    return extract_code(code)
+
+
+def write_test(
+    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+) -> str:
+    prompt = TEST.format(
+        context=USER_REQ_SUBTASK_CONTEXT.format(
+            user_requirement=user_req, subtask=subtask
+        ),
+        tool_info=tool_info,
+        code=code,
+    )
+    messages = [
+        {"role": "system", "content": CODE_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code = model.chat(messages)
+    return extract_code(code)
+
+
+def debug_code(sub_task: str, working_memory: List[str], model: LLM) -> Tuple[str, str]:
+    # Make debug model output JSON
+    if hasattr(model, "kwargs"):
+        model.kwargs["response_format"] = {"type": "json_object"}
+    prompt = DEBUG.format(
+        debug_example=DEBUG_EXAMPLE,
+        context=USER_REQ_CONTEXT.format(user_requirement=sub_task),
+        previous_impl="\n".join(working_memory),
+    )
+    messages = [
+        {"role": "system", "content": DEBUG_SYS_MSG},
+        {"role": "user", "content": prompt},
+    ]
+    code_and_ref = json.loads(model.chat(messages).replace("```", "").strip())
+    if hasattr(model, "kwargs"):
+        del model.kwargs["response_format"]
+    return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
+
+
+def write_and_exec_code(
+    user_req: str,
+    subtask: str,
+    orig_code: str,
+    code_writer_call: Callable,
+    model: LLM,
+    tool_info: str,
+    exec: Execute,
+    max_retry: int = 3,
+    verbose: bool = False,
+) -> Tuple[bool, str, str, Dict[str, List[str]]]:
+    success = False
+    counter = 0
+    reflection = ""
+
+    # TODO: add working memory to code_writer_call and debug_code
+    code = code_writer_call(user_req, subtask, tool_info, orig_code, model)
+    success, result = exec.run_isolation(code)
+    working_memory: Dict[str, List[str]] = {}
+    while not success and counter < max_retry:
+        if subtask not in working_memory:
+            working_memory[subtask] = []
+
+        if reflection:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                    code=code, result=result, reflection=reflection
+                )
+            )
+        else:
+            working_memory[subtask].append(
+                PREV_CODE_CONTEXT.format(code=code, result=result)
+            )
+
+        code, reflection = debug_code(subtask, working_memory[subtask], model)
+        success, result = exec.run_isolation(code)
+        counter += 1
+        if verbose:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+        _LOGGER.info(f"\tDebugging reflection, result: {reflection}, {result}")
+
+    if success:
+        working_memory.setdefault(subtask, []).append(  # avoid KeyError on first-try success
+            PREV_CODE_CONTEXT_WITH_REFLECTION.format(
+                code=code, result=result, reflection=reflection
+            )
+        )
+
+    return success, code, result, working_memory
+
+
+def run_plan(
+    user_req: str,
+    plan: List[Dict[str, Any]],
+    coder: LLM,
+    exec: Execute,
+    code: str,
+    tool_recommender: Sim,
+    verbose: bool = False,
+) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
+    active_plan = [e for e in plan if "success" not in e or not e["success"]]
+    working_memory: Dict[str, List[str]] = {}
+    current_code = code
+    current_test = ""
+    for task in active_plan:
+        _LOGGER.info(
+            f"""
+{tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        tool_info = "\n".join(
+            [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
+        )
+        success, code, result, task_memory = write_and_exec_code(
+            user_req,
+            task["instruction"],
+            current_code,
+            write_code if task["type"] == "code" else write_test,
+            coder,
+            tool_info,
+            exec,
+            verbose=verbose,  # keyword arg so verbose is not swallowed by max_retry
+        )
+        if task["type"] == "code":
+            current_code = code
+        else:
+            current_test = code
+
+        working_memory.update(task_memory)
+
+        if verbose:
+            _CONSOLE.print(
+                Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+            )
+        _LOGGER.info(f"\tCode success, result: {success}, {str(result)}")
+
+        task["success"] = success
+        task["result"] = result
+        task["code"] = code
+
+        if not success:
+            break
+
+    return current_code, current_test, plan, working_memory
+
+
+class VisionAgentV2(Agent):
+ """Vision Agent is an AI agentic framework geared towards outputting Python code to
214
+ solve vision tasks. It is inspired by MetaGPT's Data Interpreter
215
+ https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
216
+ generate code:
217
+ - A planner to generate a plan of tasks to solve a user requirement. The planner
218
+ can output code tasks or test tasks, where test tasks are used to verify the code.
219
+ - Automatic debugging, if a task fails, the agent will attempt to debug the code
220
+ using the failed output to fix it.
221
+ - A tool recommender to recommend tools to use for a given task. LLM performance
222
+ on tool retrieval starts to decrease as you add more tools, tool retrieval helps
223
+ keep the number of tools to choose from low.
224
+ - Memory retrieval, the agent can remember previous iterations on tasks to help it
225
+ with new tasks.
226
+ - Dynamic replanning, the agent can ask for feedback and replan remaining tasks
227
+ based off of that feedback.
228
+ """
+
+    def __init__(
+        self,
+        timeout: int = 600,
+        tool_recommender: Optional[Sim] = None,
+        verbose: bool = False,
+    ) -> None:
+        self.planner = OpenAILLM(temperature=0.1, json_mode=True)
+        self.coder = OpenAILLM(temperature=0.1)
+        self.exec = Execute(timeout=timeout)
+        if tool_recommender is None:
+            self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
+        else:
+            self.tool_recommender = tool_recommender
+        self.verbose = verbose
+        if self.verbose:
+            _LOGGER.setLevel(logging.INFO)
+
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        code, _ = self.chat_with_tests(input, image)
+        return code
+
+    def chat_with_tests(
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+    ) -> Tuple[str, str]:
+        if len(chat) == 0:
+            raise ValueError("Input cannot be empty.")
+
+        user_req = chat[0]["content"]
+        if image is not None:
+            user_req += f" Image name {image}"
+
+        plan = write_plan(user_req, TOOL_DESCRIPTIONS, self.planner)
+        _LOGGER.info(
+            f"""Plan:
+{tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+        )
+        working_memory: Dict[str, List[str]] = {}
+
+        working_code = ""
+        working_test = ""
+        success = False
+
+        while not success:
+            working_code, working_test, plan, working_memory_i = run_plan(
+                user_req,
+                plan,
+                self.coder,
+                self.exec,
+                working_code,
+                self.tool_recommender,
+                self.verbose,
+            )
+            success = all(task["success"] for task in plan)
+            working_memory.update(working_memory_i)
+
+            if not success:
+                # TODO: ask for feedback and replan
+                break
+
+        return working_code, working_test
+
+    def log_progress(self, description: str) -> None:
+        pass
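Taken together, the new module wires plan → code → test → debug into a single loop. A minimal usage sketch, assuming an OpenAI key is configured and that `cars.jpg` and the request text are illustrative:

```python
from vision_agent.agent import VisionAgentV2

agent = VisionAgentV2(verbose=True)

# __call__ accepts a plain string or a chat list and returns only the code.
code = agent("Count the cars in the image", image="cars.jpg")

# chat_with_tests additionally returns the generated test code.
code, test = agent.chat_with_tests(
    [{"role": "user", "content": "Count the cars in the image"}],
    image="cars.jpg",
)
```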
vision_agent/agent/vision_agent_v2_prompt.py ADDED
@@ -0,0 +1,170 @@
+USER_REQ_SUBTASK_CONTEXT = """
+## User Requirement
+{user_requirement}
+
+## Current Subtask
+{subtask}
+"""
+
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
+
+PLAN = """
+# Context
+{context}
+
+# Current Plan
+{plan}
+
+# Tools Available
+{tool_desc}
+
+# Task:
+Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements:
+- For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
+- You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
+- You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+
+Output a list of jsons in the following format:
+
+```json
+{{
+    "plan":
+        [
+            {{
+                "task_id": int, # "unique identifier for a task in plan, can be an ordinal"
+                "dependent_task_ids": list[int], # "ids of tasks prerequisite to this task"
+                "instruction": str, # "what you should do in this task, one short phrase or sentence"
+                "type": str, # "the type of the task, tasks can either be 'code' for coding tasks or 'test' for testing tasks"
+            }},
+            ...
+        ]
+}}
+```
+"""
+
+
+CODE_SYS_MSG = """You are an AI Python assistant. You need to help user to achieve their goal by implementing a function. Your code will be run in a jupyter notebook environment so don't use asyncio.run. Instead, use await if you need to call an async function. Do not use 'display' for showing images, instead use matplotlib or PIL."""
+
+
+CODE = """
+# Context
+{context}
+
+# Tool Info for Current Subtask
+{tool_info}
+
+# Previous Code
+{code}
+
+# Constraints
+- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write clean, readable, and well-documented code.
+
+# Output
+While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
+```python
+from vision_agent.tools.tools_v2 import *
+
+# your code goes here
+```
+"""
+
+
+DEBUG_SYS_MSG = """You are an AI Python assistant. You will be given your previous implementation code of a task, runtime error results, and a hint to change the implementation appropriately. Your code will be run in a jupyter notebook environment. Write your full implementation."""
+
+
+DEBUG_EXAMPLE = '''
+[previous impl]:
+```python
+def add(a: int, b: int) -> int:
+    """Given integers a and b, return the total value of a and b."""
+    return a - b
+```
+
+[previous output]
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 3) == 4 # output: -2
+
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
+
+[improved impl]:
+def add(a: int, b: int) -> int:
+    """Given integers a and b, return the total value of a and b."""
+    return a + b
+'''
+
+
+PREV_CODE_CONTEXT = """
+```python
+{code}
+```
+
+[previous output]
+{result}
+"""
+
+
+PREV_CODE_CONTEXT_WITH_REFLECTION = """
+```python
+{code}
+```
+
+[previous output]
+{result}
+
+[reflection on previous impl]
+{reflection}
+"""
+
+
+DEBUG = """
+[example]
+Here is an example of debugging with reflection.
+{debug_example}
+[/example]
+
+[context]
+{context}
+
+[previous impl]
+{previous_impl}
+
+[instruction]
+Analyze your previous code and error in [context] step by step, provide me with improved method and code. Remember to follow [context] requirement. Because you are writing code in a jupyter notebook, you can run `!pip install` to install missing packages. Output a json following the format:
+```json
+{{
+    "reflection": str = "Reflection on previous implementation",
+    "improved_impl": str = "Refined code after reflection.",
+}}
+```
+"""
+
+
+TEST = """
+# Context
+{context}
+
+# Tool Info for Current Subtask
+{tool_info}
+
+# Code to Test
+{code}
+
+# Constraints
+- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
+- Always prioritize using pre-defined tools for the same functionality.
+- Write clean, readable, and well-documented code.
+
+# Output
+While some concise thoughts are helpful, code is absolutely required. Always output one and only one code block in your response. Output code in the following format:
+```python
+your code
+```
+"""
vision_agent/llm/llm.py CHANGED
@@ -37,6 +37,7 @@ class OpenAILLM(LLM):
         model_name: str = "gpt-4-turbo",
         api_key: Optional[str] = None,
         json_mode: bool = False,
+        system_prompt: Optional[str] = None,
         **kwargs: Any
     ):
         if not api_key:
@@ -45,22 +46,29 @@ class OpenAILLM(LLM):
         self.client = OpenAI(api_key=api_key)
 
         self.model_name = model_name
+        self.system_prompt = system_prompt
         self.kwargs = kwargs
         if json_mode:
             self.kwargs["response_format"] = {"type": "json_object"}
 
     def generate(self, prompt: str) -> str:
+        messages = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": prompt})
+
         response = self.client.chat.completions.create(
             model=self.model_name,
-            messages=[
-                {"role": "user", "content": prompt},
-            ],
+            messages=messages,  # type: ignore
             **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
 
     def chat(self, chat: List[Dict[str, str]]) -> str:
+        if self.system_prompt and not any(msg["role"] == "system" for msg in chat):
+            chat.insert(0, {"role": "system", "content": self.system_prompt})
+
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=chat,  # type: ignore
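A sketch of the new `system_prompt` behavior; the calls require a valid `OPENAI_API_KEY`, and the prompt text is illustrative:

```python
from vision_agent.llm import OpenAILLM

llm = OpenAILLM(system_prompt="You are a terse Python assistant.")

# generate() now sends [system, user] instead of a lone user message.
print(llm.generate("One-line hello world."))

# chat() prepends the system prompt only when the history lacks one.
print(llm.chat([{"role": "user", "content": "And in Bash?"}]))
```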
vision_agent/tools/__init__.py CHANGED
@@ -3,11 +3,9 @@ from .tools import (  # Counter,
     CLIP,
     OCR,
     TOOLS,
-    BboxArea,
+    BboxStats,
     BboxIoU,
-    ObjectDistance,
     BoxDistance,
-    MaskDistance,
     Crop,
     DINOv,
     ExtractFrames,
@@ -15,6 +13,8 @@ from .tools import (  # Counter,
     GroundingSAM,
     ImageCaption,
     ImageQuestionAnswering,
+    MaskDistance,
+    ObjectDistance,
     SegArea,
     SegIoU,
     Tool,
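For callers, the visible change is the renamed export (`BboxArea` → `BboxStats`) plus a re-sorted list; `MaskDistance` and `ObjectDistance` merely move. Whether `BboxStats` is a drop-in replacement for `BboxArea` is not shown in this diff. Imports move like so:

```python
# 0.2.14
# from vision_agent.tools import BboxArea
# 0.2.16
from vision_agent.tools import BboxStats
```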
vision_agent/tools/tool_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict
 
 import requests
 
-from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
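Same pattern as the other hunks in this release: utility modules now live under `vision_agent.utils`. Downstream imports move accordingly:

```python
# 0.2.14
# from vision_agent.type_defs import LandingaiAPIKey
# 0.2.16
from vision_agent.utils.type_defs import LandingaiAPIKey

api_key = LandingaiAPIKey().api_key  # reads the key as before
```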