vision-agent 0.2.55__tar.gz → 0.2.57__tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (38)
  1. {vision_agent-0.2.55 → vision_agent-0.2.57}/PKG-INFO +48 -15
  2. {vision_agent-0.2.55 → vision_agent-0.2.57}/README.md +47 -14
  3. {vision_agent-0.2.55 → vision_agent-0.2.57}/pyproject.toml +1 -1
  4. vision_agent-0.2.57/vision_agent/__init__.py +2 -0
  5. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/agent.py +3 -1
  6. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent.py +110 -81
  7. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent_prompts.py +1 -1
  8. vision_agent-0.2.57/vision_agent/lmm/__init__.py +1 -0
  9. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/lmm/lmm.py +54 -116
  10. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/__init__.py +2 -1
  11. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/tools.py +3 -3
  12. vision_agent-0.2.55/vision_agent/__init__.py +0 -3
  13. vision_agent-0.2.55/vision_agent/agent/agent_coder.py +0 -216
  14. vision_agent-0.2.55/vision_agent/agent/agent_coder_prompts.py +0 -135
  15. vision_agent-0.2.55/vision_agent/agent/data_interpreter.py +0 -475
  16. vision_agent-0.2.55/vision_agent/agent/data_interpreter_prompts.py +0 -186
  17. vision_agent-0.2.55/vision_agent/agent/easytool.py +0 -346
  18. vision_agent-0.2.55/vision_agent/agent/easytool_prompts.py +0 -89
  19. vision_agent-0.2.55/vision_agent/agent/easytool_v2.py +0 -781
  20. vision_agent-0.2.55/vision_agent/agent/easytool_v2_prompts.py +0 -152
  21. vision_agent-0.2.55/vision_agent/agent/reflexion.py +0 -299
  22. vision_agent-0.2.55/vision_agent/agent/reflexion_prompts.py +0 -100
  23. vision_agent-0.2.55/vision_agent/llm/__init__.py +0 -1
  24. vision_agent-0.2.55/vision_agent/llm/llm.py +0 -176
  25. vision_agent-0.2.55/vision_agent/lmm/__init__.py +0 -1
  26. vision_agent-0.2.55/vision_agent/tools/easytool_tools.py +0 -1242
  27. {vision_agent-0.2.55 → vision_agent-0.2.57}/LICENSE +0 -0
  28. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/__init__.py +0 -0
  29. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/fonts/__init__.py +0 -0
  30. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  31. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/prompts.py +0 -0
  32. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/tools/tool_utils.py +0 -0
  33. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/__init__.py +0 -0
  34. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/execute.py +0 -0
  35. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/image_utils.py +0 -0
  36. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/sim.py +0 -0
  37. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/type_defs.py +0 -0
  38. {vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.55 → vision_agent-0.2.57}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.55
+Version: 0.2.57
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -38,7 +38,6 @@ Description-Content-Type: text/markdown
 <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
 
 # 🔍🤖 Vision Agent
-
 [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
 ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
 [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
@@ -52,9 +51,14 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
 allowing users to describe their problem in text and have the agent framework generate
 code to solve the task for them. Check out our discord for updates and roadmaps!
 
+
+## Web Application
+
+Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+
 ## Documentation
 
-- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
 
 
 ## Getting Started
@@ -73,6 +77,7 @@ export OPENAI_API_KEY="your-api-key"
 ```
 
 ### Vision Agent
+#### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
 
 ```python
@@ -88,28 +93,28 @@ from vision_agent.tools import load_image, grounding_sam
 def calculate_filled_percentage(image_path: str) -> float:
     # Step 1: Load the image
     image = load_image(image_path)
-
+
     # Step 2: Segment the jar
     jar_segments = grounding_sam(prompt="jar", image=image)
-
+
     # Step 3: Segment the coffee beans
     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
-
+
     # Step 4: Calculate the area of the segmented jar
     jar_area = 0
     for segment in jar_segments:
        jar_area += segment['mask'].sum()
-
+
     # Step 5: Calculate the area of the segmented coffee beans
     coffee_beans_area = 0
     for segment in coffee_beans_segments:
        coffee_beans_area += segment['mask'].sum()
-
+
     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
     if jar_area == 0:
        return 0.0  # To avoid division by zero
     filled_percentage = (coffee_beans_area / jar_area) * 100
-
+
     # Step 7: Return the computed percentage
     return filled_percentage
 ```
@@ -121,10 +126,12 @@ mode by passing in the verbose argument:
 >>> agent = VisionAgent(verbose=2)
 ```
 
-You can also have it return more information by calling `chat_with_workflow`:
+#### Detailed Usage
+You can also have it return more information by calling `chat_with_workflow`. The format
+of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 
 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -135,19 +142,45 @@ You can also have it return more information by calling `chat_with_workflow`:
 }
 ```
 
-With this you can examine more detailed information such as the etesting code, testing
+With this you can examine more detailed information such as the testing code, testing
 results, plan or working memory it used to complete the task.
 
+#### Multi-turn conversations
+You can have multi-turn conversations with vision-agent as well, giving it feedback on
+the code and having it update. You just need to add the code as a response from the
+assistant:
+
+```python
+agent = va.agent.VisionAgent(verbosity=2)
+conv = [
+    {
+        "role": "user",
+        "content": "Are these workers wearing safety gear? Output only a True or False value.",
+        "media": ["workers.png"],
+    }
+]
+result = agent.chat_with_workflow(conv)
+code = result["code"]
+conv.append({"role": "assistant", "content": code})
+conv.append(
+    {
+        "role": "user",
+        "content": "Can you also return the number of workers wearing safety gear?",
+    }
+)
+result = agent.chat_with_workflow(conv)
+```
+
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an LLM directly to build a tool for
+while others are hosted for you. You can also ask an LMM directly to build a tool for
 you. For example:
 
 ```python
 >>> import vision_agent as va
->>> llm = va.llm.OpenAILLM()
+>>> llm = va.llm.OpenAILMM()
 >>> detector = llm.generate_detector("Can you build a jar detector for me?")
->>> detector("jar.jpg")
+>>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
   "bboxes": [
{vision_agent-0.2.55 → vision_agent-0.2.57}/README.md

@@ -2,7 +2,6 @@
 <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
 
 # 🔍🤖 Vision Agent
-
 [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
 ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
 [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
@@ -16,9 +15,14 @@ accomplish the task you want. Vision Agent aims to provide an in-seconds experie
 allowing users to describe their problem in text and have the agent framework generate
 code to solve the task for them. Check out our discord for updates and roadmaps!
 
+
+## Web Application
+
+Try Vision Agent live on [va.landing.ai](https://va.landing.ai/)
+
 ## Documentation
 
-- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+[Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
 
 
 ## Getting Started
@@ -37,6 +41,7 @@ export OPENAI_API_KEY="your-api-key"
 ```
 
 ### Vision Agent
+#### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
 
 ```python
@@ -52,28 +57,28 @@ from vision_agent.tools import load_image, grounding_sam
 def calculate_filled_percentage(image_path: str) -> float:
     # Step 1: Load the image
     image = load_image(image_path)
-
+
     # Step 2: Segment the jar
     jar_segments = grounding_sam(prompt="jar", image=image)
-
+
     # Step 3: Segment the coffee beans
     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
-
+
     # Step 4: Calculate the area of the segmented jar
     jar_area = 0
     for segment in jar_segments:
        jar_area += segment['mask'].sum()
-
+
     # Step 5: Calculate the area of the segmented coffee beans
     coffee_beans_area = 0
     for segment in coffee_beans_segments:
        coffee_beans_area += segment['mask'].sum()
-
+
     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
     if jar_area == 0:
        return 0.0  # To avoid division by zero
     filled_percentage = (coffee_beans_area / jar_area) * 100
-
+
     # Step 7: Return the computed percentage
     return filled_percentage
 ```
@@ -85,10 +90,12 @@ mode by passing in the verbose argument:
 >>> agent = VisionAgent(verbose=2)
 ```
 
-You can also have it return more information by calling `chat_with_workflow`:
+#### Detailed Usage
+You can also have it return more information by calling `chat_with_workflow`. The format
+of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 
 ```python
->>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -99,19 +106,45 @@ You can also have it return more information by calling `chat_with_workflow`:
 }
 ```
 
-With this you can examine more detailed information such as the etesting code, testing
+With this you can examine more detailed information such as the testing code, testing
 results, plan or working memory it used to complete the task.
 
+#### Multi-turn conversations
+You can have multi-turn conversations with vision-agent as well, giving it feedback on
+the code and having it update. You just need to add the code as a response from the
+assistant:
+
+```python
+agent = va.agent.VisionAgent(verbosity=2)
+conv = [
+    {
+        "role": "user",
+        "content": "Are these workers wearing safety gear? Output only a True or False value.",
+        "media": ["workers.png"],
+    }
+]
+result = agent.chat_with_workflow(conv)
+code = result["code"]
+conv.append({"role": "assistant", "content": code})
+conv.append(
+    {
+        "role": "user",
+        "content": "Can you also return the number of workers wearing safety gear?",
+    }
+)
+result = agent.chat_with_workflow(conv)
+```
+
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an LLM directly to build a tool for
+while others are hosted for you. You can also ask an LMM directly to build a tool for
 you. For example:
 
 ```python
 >>> import vision_agent as va
->>> llm = va.llm.OpenAILLM()
+>>> llm = va.llm.OpenAILMM()
 >>> detector = llm.generate_detector("Can you build a jar detector for me?")
->>> detector("jar.jpg")
+>>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
   "bboxes": [
{vision_agent-0.2.55 → vision_agent-0.2.57}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.55"
+version = "0.2.57"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent-0.2.57/vision_agent/__init__.py

@@ -0,0 +1,2 @@
+from .agent import Agent
+from .lmm import LMM, OpenAILMM
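The replacement top-level `__init__.py` re-exports `Agent`, `LMM`, and `OpenAILMM` instead of pulling in the deleted `llm` subpackage. A small sketch of what that makes importable, assuming the subpackages expose the names used elsewhere in this diff:

```python
import vision_agent as va
from vision_agent import LMM, OpenAILMM  # re-exported by the new __init__.py

agent = va.agent.VisionAgent(verbosity=2)           # agent entry point shown in the README diff
planner: LMM = OpenAILMM(temperature=0.0, json_mode=True)
```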
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/agent.py

@@ -2,12 +2,14 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from vision_agent.lmm import Message
+
 
 class Agent(ABC):
     @abstractmethod
     def __call__(
         self,
-        input: Union[List[Dict[str, str]], str],
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
        pass
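The `Agent` interface now accepts either a plain string or a list of `Message` dictionaries. A hedged sketch of both call forms against `VisionAgent`, based on the docstring added later in this diff (the exact `Message` type lives in `vision_agent/lmm/lmm.py`, which is not shown here):

```python
import vision_agent as va

agent = va.agent.VisionAgent(verbosity=2)

# Form 1: a plain string plus an optional media path; the agent wraps this into
# a single user message internally (see the __call__ hunk further down).
answer = agent("How many coffee beans are in the jar?", media="jar.jpg")

# Form 2: an explicit Message-style chat, with media attached per message.
answer = agent(
    [
        {
            "role": "user",
            "content": "How many coffee beans are in the jar?",
            "media": ["jar.jpg"],
        }
    ]
)
```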
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent.py

@@ -13,7 +13,6 @@ from rich.style import Style
 from rich.syntax import Syntax
 from tabulate import tabulate
 
-from vision_agent.llm.llm import AzureOpenAILLM
 import vision_agent.tools as T
 from vision_agent.agent import Agent
 from vision_agent.agent.vision_agent_prompts import (
@@ -25,8 +24,7 @@ from vision_agent.agent.vision_agent_prompts import (
     SIMPLE_TEST,
     USER_REQ,
 )
-from vision_agent.llm import LLM, OpenAILLM
-from vision_agent.lmm import LMM, OpenAILMM
+from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -133,11 +131,10 @@ def extract_image(
 
 
 def write_plan(
-    chat: List[Dict[str, str]],
+    chat: List[Message],
     tool_desc: str,
     working_memory: str,
-    model: Union[LLM, LMM],
-    media: Optional[Sequence[Union[str, Path]]] = None,
+    model: LMM,
 ) -> List[Dict[str, str]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -147,18 +144,58 @@
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-    if isinstance(model, OpenAILMM):
-        media = extract_image(media)
-        return extract_json(model.chat(chat, images=media))["plan"]  # type: ignore
-    else:
-        return extract_json(model.chat(chat))["plan"]  # type: ignore
+    return extract_json(model.chat(chat))["plan"]  # type: ignore
+
+
+def write_code(
+    coder: LMM,
+    chat: List[Message],
+    tool_info: str,
+    feedback: str,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = CODE.format(
+        docstring=tool_info,
+        question=user_request,
+        feedback=feedback,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(coder(chat))
+
+
+def write_test(
+    tester: LMM,
+    chat: List[Message],
+    tool_utils: str,
+    code: str,
+    feedback: str,
+    media: Optional[Sequence[Union[str, Path]]] = None,
+) -> str:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    prompt = SIMPLE_TEST.format(
+        docstring=tool_utils,
+        question=user_request,
+        code=code,
+        feedback=feedback,
+        media=media,
+    )
+    chat[-1]["content"] = prompt
+    return extract_code(tester(chat))
 
 
 def reflect(
-    chat: List[Dict[str, str]],
+    chat: List[Message],
     plan: str,
     code: str,
-    model: Union[LLM, LMM],
+    model: LMM,
 ) -> Dict[str, Union[str, bool]]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
@@ -168,22 +205,22 @@ def reflect(
     context = USER_REQ.format(user_request=user_request)
     prompt = REFLECT.format(context=context, plan=plan, code=code)
     chat[-1]["content"] = prompt
-    return extract_json(model.chat(chat))
+    return extract_json(model(chat))
 
 
 def write_and_test_code(
-    task: str,
+    chat: List[Message],
     tool_info: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
-    coder: LLM,
-    tester: LLM,
-    debugger: LLM,
+    coder: LMM,
+    tester: LMM,
+    debugger: LMM,
     code_interpreter: CodeInterpreter,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-    input_media: Optional[Union[str, Path]] = None,
+    media: Optional[Sequence[Union[str, Path]]] = None,
 ) -> Dict[str, Any]:
     log_progress(
         {
@@ -191,25 +228,9 @@
             "status": "started",
         }
     )
-    code = extract_code(
-        coder(
-            CODE.format(
-                docstring=tool_info,
-                question=task,
-                feedback=format_memory(working_memory),
-            )
-        )
-    )
-    test = extract_code(
-        tester(
-            SIMPLE_TEST.format(
-                docstring=tool_utils,
-                question=task,
-                code=code,
-                feedback=working_memory,
-                media=input_media,
-            )
-        )
+    code = write_code(coder, chat, tool_info, format_memory(working_memory))
+    test = write_test(
+        tester, chat, tool_utils, code, format_memory(working_memory), media
     )
 
     log_progress(
@@ -392,10 +413,10 @@ class VisionAgent(Agent):
 
     def __init__(
         self,
-        planner: Optional[Union[LLM, LMM]] = None,
-        coder: Optional[LLM] = None,
-        tester: Optional[LLM] = None,
-        debugger: Optional[LLM] = None,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -403,10 +424,10 @@ class VisionAgent(Agent):
         """Initialize the Vision Agent.
 
         Parameters:
-            planner (Optional[LLM]): The planner model to use. Defaults to OpenAILLM.
-            coder (Optional[LLM]): The coder model to use. Defaults to OpenAILLM.
-            tester (Optional[LLM]): The tester model to use. Defaults to OpenAILLM.
-            debugger (Optional[LLM]): The debugger model to
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -418,12 +439,12 @@ class VisionAgent(Agent):
         """
 
         self.planner = (
-            OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
+            OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner
         )
-        self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
-        self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
+        self.coder = OpenAILMM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILMM(temperature=0.0) if tester is None else tester
         self.debugger = (
-            OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
+            OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger
         )
 
         self.tool_recommender = (
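All four roles now default to `OpenAILMM`, so any of them can be replaced by another `LMM` instance. A brief sketch, assuming any `LMM` subclass is accepted wherever the defaults above are created:

```python
import vision_agent as va
from vision_agent.lmm import AzureOpenAILMM, OpenAILMM

agent = va.agent.VisionAgent(
    planner=OpenAILMM(temperature=0.0, json_mode=True),  # mirrors the default above
    coder=AzureOpenAILMM(temperature=0.0),               # swap in an Azure-backed coder
    verbosity=2,
)
```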
@@ -437,7 +458,7 @@ class VisionAgent(Agent):
 
     def __call__(
         self,
-        input: Union[List[Dict[str, str]], str],
+        input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
     ) -> str:
         """Chat with Vision Agent and return intermediate information regarding the task.
@@ -454,23 +475,26 @@ class VisionAgent(Agent):
 
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        results = self.chat_with_workflow(input, media)
+            if media is not None:
+                input[0]["media"] = [media]
+        results = self.chat_with_workflow(input)
         results.pop("working_memory")
         return results  # type: ignore
 
     def chat_with_workflow(
         self,
-        chat: List[Dict[str, str]],
-        media: Optional[Union[str, Path]] = None,
+        chat: List[Message],
         self_reflection: bool = False,
         display_visualization: bool = False,
     ) -> Dict[str, Any]:
         """Chat with Vision Agent and return intermediate information regarding the task.
 
         Parameters:
-            chat (List[Dict[str, str]]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
-            media (Optional[Union[str, Path]]): The media file to be used in the task.
+            chat (List[MediaChatItem]): A conversation
+                in the format of:
+                [{"role": "user", "content": "describe your task here..."}]
+                or if it contains media files, it should be in the format of:
+                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             self_reflection (bool): Whether to reflect on the task and debug the code.
             display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).
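`chat_with_workflow` no longer takes a separate `media` argument; images travel inside the chat itself. A usage sketch based on the updated docstring (only the `code`, `success`, and `working_memory` result keys are visible in this diff, so other keys are not shown):

```python
import vision_agent as va

agent = va.agent.VisionAgent(verbosity=2)
results = agent.chat_with_workflow(
    [
        {
            "role": "user",
            "content": "What percentage of the area of the jar is filled with coffee beans?",
            "media": ["jar.jpg"],
        }
    ],
    self_reflection=False,
)
print(results["code"], results["success"])
```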
@@ -485,11 +509,19 @@ class VisionAgent(Agent):
 
         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
         with CodeInterpreterFactory.new_instance() as code_interpreter:
-            if media is not None:
-                media = code_interpreter.upload_file(media)
-                for chat_i in chat:
-                    if chat_i["role"] == "user":
-                        chat_i["content"] += f" Image name {media}"
+            chat = copy.deepcopy(chat)
+            media_list = []
+            for chat_i in chat:
+                if "media" in chat_i:
+                    for media in chat_i["media"]:
+                        media = code_interpreter.upload_file(media)
+                        chat_i["content"] += f" Media name {media}"  # type: ignore
+                        media_list.append(media)
+
+            int_chat = cast(
+                List[Message],
+                [{"role": c["role"], "content": c["content"]} for c in chat],
+            )
 
             code = ""
             test = ""
@@ -507,11 +539,10 @@
                     }
                 )
                 plan_i = write_plan(
-                    chat,
+                    int_chat,
                     T.TOOL_DESCRIPTIONS,
                     format_memory(working_memory),
                     self.planner,
-                    media=[media] if media else None,
                 )
                 plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
 
@@ -534,9 +565,7 @@
                     self.verbosity,
                 )
                 results = write_and_test_code(
-                    task=FULL_TASK.format(
-                        user_request=chat[0]["content"], subtasks=plan_i_str
-                    ),
+                    chat=int_chat,
                     tool_info=tool_info,
                     tool_utils=T.UTILITIES_DOCSTRING,
                     working_memory=working_memory,
@@ -546,7 +575,7 @@
                     code_interpreter=code_interpreter,
                     log_progress=self.log_progress,
                     verbosity=self.verbosity,
-                    input_media=media,
+                    media=media_list,
                 )
                 success = cast(bool, results["success"])
                 code = cast(str, results["code"])
@@ -564,7 +593,7 @@
                         }
                     )
                     reflection = reflect(
-                        chat,
+                        int_chat,
                         FULL_TASK.format(
                             user_request=chat[0]["content"], subtasks=plan_i_str
                         ),
@@ -634,10 +663,10 @@ class AzureVisionAgent(VisionAgent):
 
     def __init__(
         self,
-        planner: Optional[Union[LLM, LMM]] = None,
-        coder: Optional[LLM] = None,
-        tester: Optional[LLM] = None,
-        debugger: Optional[LLM] = None,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
         tool_recommender: Optional[Sim] = None,
         verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
@@ -645,10 +674,10 @@ class AzureVisionAgent(VisionAgent):
         """Initialize the Vision Agent.
 
         Parameters:
-            planner (Optional[LLM]): The planner model to use. Defaults to OpenAILLM.
-            coder (Optional[LLM]): The coder model to use. Defaults to OpenAILLM.
-            tester (Optional[LLM]): The tester model to use. Defaults to OpenAILLM.
-            debugger (Optional[LLM]): The debugger model to
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to
             tool_recommender (Optional[Sim]): The tool recommender model to use.
             verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
                 highest verbosity level which will output all intermediate debugging
@@ -660,14 +689,14 @@ class AzureVisionAgent(VisionAgent):
         """
         super().__init__(
             planner=(
-                AzureOpenAILLM(temperature=0.0, json_mode=True)
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if planner is None
                 else planner
             ),
-            coder=AzureOpenAILLM(temperature=0.0) if coder is None else coder,
-            tester=AzureOpenAILLM(temperature=0.0) if tester is None else tester,
+            coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder,
+            tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester,
             debugger=(
-                AzureOpenAILLM(temperature=0.0, json_mode=True)
+                AzureOpenAILMM(temperature=0.0, json_mode=True)
                 if debugger is None
                 else debugger
            ),
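`AzureVisionAgent` now wires every role to `AzureOpenAILMM`. A minimal construction sketch; the import path assumes the class is reachable from the module diffed above, and the credential comment is an assumption rather than something shown in this diff:

```python
# Sketch only; AzureOpenAILMM presumably reads the usual Azure OpenAI
# credentials from the environment, as its OpenAI counterpart does.
from vision_agent.agent.vision_agent import AzureVisionAgent

agent = AzureVisionAgent(verbosity=2)
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Are these workers wearing safety gear?", "media": ["workers.png"]}]
)
```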
{vision_agent-0.2.55 → vision_agent-0.2.57}/vision_agent/agent/vision_agent_prompts.py

@@ -171,7 +171,7 @@ This is the documentation for the functions you have access to. You may call any
 **Instructions**:
 1. Verify the fundamental functionality under normal conditions.
 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. Your test case MUST run only on the given image which is {media}
+3. Your test case MUST run only on the given images which are {media}
 4. Your test case MUST run only with the given values which is available in the question - {question}
 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
 6. DO NOT mock any functions, you must test their functionality as is.
vision_agent-0.2.57/vision_agent/lmm/__init__.py

@@ -0,0 +1 @@
+from .lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
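With the `llm` subpackage deleted in this release (see the file list above), the multimodal classes in `vision_agent.lmm` take over both roles. A hedged migration sketch, using only calls that appear elsewhere in this diff:

```python
from typing import List

from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM

# 0.2.55 imports that no longer exist in 0.2.57:
#   from vision_agent.llm import LLM, OpenAILLM
#   from vision_agent.llm.llm import AzureOpenAILLM
planner: LMM = OpenAILMM(temperature=0.0, json_mode=True)
chat: List[Message] = [
    {"role": "user", "content": "Describe this image.", "media": ["jar.jpg"]}
]
plan = planner.chat(chat)  # LMM instances are also callable, e.g. planner(chat)
```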