vision-agent 0.2.58__tar.gz → 0.2.78__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.58 → vision_agent-0.2.78}/PKG-INFO +38 -7
- {vision_agent-0.2.58 → vision_agent-0.2.78}/README.md +33 -4
- {vision_agent-0.2.58 → vision_agent-0.2.78}/pyproject.toml +5 -3
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/vision_agent.py +118 -66
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/vision_agent_prompts.py +2 -0
- vision_agent-0.2.78/vision_agent/lmm/__init__.py +1 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/lmm/lmm.py +97 -6
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/__init__.py +16 -4
- vision_agent-0.2.78/vision_agent/tools/tool_utils.py +67 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/tools.py +517 -35
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/execute.py +40 -27
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/sim.py +7 -1
- vision_agent-0.2.58/vision_agent/lmm/__init__.py +0 -1
- vision_agent-0.2.58/vision_agent/tools/tool_utils.py +0 -30
- {vision_agent-0.2.58 → vision_agent-0.2.78}/LICENSE +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/video.py +0 -0
--- vision_agent-0.2.58/PKG-INFO
+++ vision_agent-0.2.78/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.58
+Version: 0.2.78
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -9,8 +9,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
-Requires-Dist: e2b (>=0.17.
-Requires-Dist: e2b-code-interpreter (
+Requires-Dist: e2b (>=0.17.1,<0.18.0)
+Requires-Dist: e2b-code-interpreter (==0.0.11a1)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -21,7 +21,9 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
@@ -76,6 +78,9 @@ using Azure OpenAI please see the Azure setup section):
 export OPENAI_API_KEY="your-api-key"
 ```
 
+### Important Note on API Usage
+Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
+
 ### Vision Agent
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
@@ -178,8 +183,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>>
->>> detector =
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
  "scores": [0.99],
@@ -218,18 +223,44 @@ ensure the documentation is in the same format above with description, `Paramete
 `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).
 
 ### Azure Setup
-If you want to use Azure OpenAI models, you
+If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
+
+1. OpenAI GPT-4o model
+2. OpenAI text embedding model
+
+<img width="1201" alt="Screenshot 2024-06-12 at 5 54 48 PM" src="https://github.com/landing-ai/vision-agent/assets/2736300/da125592-b01d-45bc-bc99-d48c9dcdfa32">
+
+Then you can set the following environment variables:
 
 ```bash
 export AZURE_OPENAI_API_KEY="your-api-key"
 export AZURE_OPENAI_ENDPOINT="your-endpoint"
+# The deployment name of your Azure OpenAI chat model
+export AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME="your_gpt4o_model_deployment_name"
+# The deployment name of your Azure OpenAI text embedding model
+export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deployment_name"
 ```
 
+> NOTE: make sure your Azure model deployment have enough quota (token per minute) to support it. The default value 8000TPM is not enough.
+
 You can then run Vision Agent using the Azure OpenAI models:
 
 ```python
 import vision_agent as va
-import vision_agent.tools as T
 agent = va.agent.AzureVisionAgent()
 ```
 
+******************************************************************************************************************************
+
+### Q&A
+
+#### How to get started with OpenAI API credits
+
+1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+2. Follow the instructions to purchase and manage your API credits.
+3. Ensure your API key is correctly configured in your project settings.
+
+Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
+
+For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
--- vision_agent-0.2.58/README.md
+++ vision_agent-0.2.78/README.md
@@ -40,6 +40,9 @@ using Azure OpenAI please see the Azure setup section):
 export OPENAI_API_KEY="your-api-key"
 ```
 
+### Important Note on API Usage
+Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
+
 ### Vision Agent
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
@@ -142,8 +145,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>>
->>> detector =
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
  "scores": [0.99],
@@ -182,17 +185,43 @@ ensure the documentation is in the same format above with description, `Paramete
 `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).
 
 ### Azure Setup
-If you want to use Azure OpenAI models, you
+If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
+
+1. OpenAI GPT-4o model
+2. OpenAI text embedding model
+
+<img width="1201" alt="Screenshot 2024-06-12 at 5 54 48 PM" src="https://github.com/landing-ai/vision-agent/assets/2736300/da125592-b01d-45bc-bc99-d48c9dcdfa32">
+
+Then you can set the following environment variables:
 
 ```bash
 export AZURE_OPENAI_API_KEY="your-api-key"
 export AZURE_OPENAI_ENDPOINT="your-endpoint"
+# The deployment name of your Azure OpenAI chat model
+export AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME="your_gpt4o_model_deployment_name"
+# The deployment name of your Azure OpenAI text embedding model
+export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deployment_name"
 ```
 
+> NOTE: make sure your Azure model deployment have enough quota (token per minute) to support it. The default value 8000TPM is not enough.
+
 You can then run Vision Agent using the Azure OpenAI models:
 
 ```python
 import vision_agent as va
-import vision_agent.tools as T
 agent = va.agent.AzureVisionAgent()
 ```
+
+******************************************************************************************************************************
+
+### Q&A
+
+#### How to get started with OpenAI API credits
+
+1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+2. Follow the instructions to purchase and manage your API credits.
+3. Ensure your API key is correctly configured in your project settings.
+
+Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
+
+For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
--- vision_agent-0.2.58/pyproject.toml
+++ vision_agent-0.2.78/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.58"
+version = "0.2.78"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -34,9 +34,11 @@ nbformat = "^5.10.4"
 rich = "^13.7.1"
 langsmith = "^0.1.58"
 ipykernel = "^6.29.4"
-e2b = "^0.17.
-e2b-code-interpreter = "
+e2b = "^0.17.1"
+e2b-code-interpreter = "0.0.11a1"
 tenacity = "^8.3.0"
+pillow-heif = "^0.16.0"
+pytube = "15.0.0"
 
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
--- vision_agent-0.2.58/vision_agent/agent/vision_agent.py
+++ vision_agent-0.2.78/vision_agent/agent/vision_agent.py
@@ -7,6 +7,7 @@ import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 
+from langsmith import traceable
 from PIL import Image
 from rich.console import Console
 from rich.style import Style
@@ -42,6 +43,8 @@ class DefaultImports:
 
     common_imports = [
        "from typing import *",
+       "from pillow_heif import register_heif_opener",
+       "register_heif_opener()",
     ]
 
     @staticmethod
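Registering the HEIF opener makes Pillow treat HEIC/HEIF files like any other image format, which is why those two lines are now prepended to all generated code. A minimal sketch of what the injected lines do (the file name is hypothetical):

```python
from PIL import Image
from pillow_heif import register_heif_opener

# After registration, PIL.Image.open can decode .heic/.heif files directly.
register_heif_opener()

img = Image.open("photo.heic")  # hypothetical input file
print(img.size, img.mode)
```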
@@ -96,6 +99,7 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     try:
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
+        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
@@ -103,7 +107,12 @@ def extract_json(json_str: str) -> Dict[str, Any]:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-        json_dict = json.loads(json_str)
+        try:
+            json_dict = json.loads(json_str)
+        except json.JSONDecodeError as e:
+            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
+            _LOGGER.exception(error_msg)
+            raise ValueError(error_msg) from e
     return json_dict  # type: ignore
 
 
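The change means a second parse failure now surfaces both the cleaned string and the original input instead of an unhandled `JSONDecodeError`. A quick illustration of the fence-stripping fallback path (a standalone sketch, not the library function itself):

```python
import json

raw = 'Here is the plan:\n```json\n{"plan": ["load image", "detect jars"]}\n```'

# The first parse fails (the fences are not JSON), so the fenced block is cut
# out and parsed on its own -- mirroring extract_json's fallback above.
try:
    parsed = json.loads(raw)
except json.JSONDecodeError:
    inner = raw[raw.find("```json") + len("```json"):]
    inner = inner[: inner.find("```")]
    parsed = json.loads(inner)

print(parsed["plan"])  # ['load image', 'detect jars']
```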
@@ -130,6 +139,7 @@ def extract_image(
     return new_media
 
 
+@traceable
 def write_plan(
     chat: List[Message],
     tool_desc: str,
@@ -147,6 +157,7 @@ def write_plan(
     return extract_json(model.chat(chat))["plan"]  # type: ignore
 
 
+@traceable
 def write_code(
     coder: LMM,
     chat: List[Message],
@@ -167,6 +178,7 @@ def write_code(
     return extract_code(coder(chat))
 
 
+@traceable
 def write_test(
     tester: LMM,
     chat: List[Message],
@@ -191,6 +203,7 @@ def write_test(
     return extract_code(tester(chat))
 
 
+@traceable
 def reflect(
     chat: List[Message],
     plan: str,
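`langsmith.traceable` wraps a function so each call is recorded as a run in LangSmith when tracing is configured; without credentials it is close to a pass-through. A hedged sketch of the decorator in isolation (the environment variable names are LangSmith's own conventions, not part of this diff, and the function is a stand-in):

```python
import os
from langsmith import traceable

# Tracing activates when these are set; otherwise the decorator simply
# calls through to the wrapped function.
os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
# os.environ["LANGCHAIN_API_KEY"] = "..."  # required for real uploads

@traceable
def write_plan_stub(user_request: str) -> list[str]:
    # stand-in for the decorated agent steps in the diff above
    return [f"step for: {user_request}"]

print(write_plan_stub("count the jars"))
```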
@@ -266,70 +279,19 @@ def write_and_test_code(
     count = 0
     new_working_memory: List[Dict[str, str]] = []
     while not result.success and count < max_retries:
-        log_progress(
-            {
-                "type": "code",
-                "status": "started",
-            }
-        )
-        fixed_code_and_test = extract_json(
-            debugger(
-                FIX_BUG.format(
-                    code=code,
-                    tests=test,
-                    result="\n".join(result.text().splitlines()[-50:]),
-                    feedback=format_memory(working_memory + new_working_memory),
-                )
-            )
-        )
-        old_code = code
-        old_test = test
-
-        if fixed_code_and_test["code"].strip() != "":
-            code = extract_code(fixed_code_and_test["code"])
-        if fixed_code_and_test["test"].strip() != "":
-            test = extract_code(fixed_code_and_test["test"])
-
-        new_working_memory.append(
-            {
-                "code": f"{code}\n{test}",
-                "feedback": fixed_code_and_test["reflections"],
-                "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
-            }
-        )
-        log_progress(
-            {
-                "type": "code",
-                "status": "running",
-                "payload": {
-                    "code": DefaultImports.prepend_imports(code),
-                    "test": test,
-                },
-            }
-        )
-
-        result = code_interpreter.exec_isolation(
-            f"{DefaultImports.to_code_string()}\n{code}\n{test}"
-        )
-        log_progress(
-            {
-                "type": "code",
-                "status": "completed" if result.success else "failed",
-                "payload": {
-                    "code": DefaultImports.prepend_imports(code),
-                    "test": test,
-                    "result": result.to_json(),
-                },
-            }
-        )
         if verbosity == 2:
-            _LOGGER.info(
-
-
-
-
-
-
+            _LOGGER.info(f"Start debugging attempt {count + 1}")
+        code, test, result = debug_code(
+            working_memory,
+            debugger,
+            code_interpreter,
+            code,
+            test,
+            result,
+            new_working_memory,
+            log_progress,
+            verbosity,
+        )
         count += 1
 
     if verbosity >= 1:
@@ -344,6 +306,95 @@ def write_and_test_code(
     }
 
 
+@traceable
+def debug_code(
+    working_memory: List[Dict[str, str]],
+    debugger: LMM,
+    code_interpreter: CodeInterpreter,
+    code: str,
+    test: str,
+    result: Execution,
+    new_working_memory: List[Dict[str, str]],
+    log_progress: Callable[[Dict[str, Any]], None],
+    verbosity: int = 0,
+) -> tuple[str, str, Execution]:
+    log_progress(
+        {
+            "type": "code",
+            "status": "started",
+        }
+    )
+
+    fixed_code_and_test = {"code": "", "test": "", "reflections": ""}
+    success = False
+    count = 0
+    while not success and count < 3:
+        try:
+            fixed_code_and_test = extract_json(
+                debugger(
+                    FIX_BUG.format(
+                        code=code,
+                        tests=test,
+                        result="\n".join(result.text().splitlines()[-50:]),
+                        feedback=format_memory(working_memory + new_working_memory),
+                    )
+                )
+            )
+            success = True
+        except Exception as e:
+            _LOGGER.exception(f"Error while extracting JSON: {e}")
+
+        count += 1
+
+    old_code = code
+    old_test = test
+
+    if fixed_code_and_test["code"].strip() != "":
+        code = extract_code(fixed_code_and_test["code"])
+    if fixed_code_and_test["test"].strip() != "":
+        test = extract_code(fixed_code_and_test["test"])
+
+    new_working_memory.append(
+        {
+            "code": f"{code}\n{test}",
+            "feedback": fixed_code_and_test["reflections"],
+            "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
+        }
+    )
+    log_progress(
+        {
+            "type": "code",
+            "status": "running",
+            "payload": {
+                "code": DefaultImports.prepend_imports(code),
+                "test": test,
+            },
+        }
+    )
+
+    result = code_interpreter.exec_isolation(
+        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+    )
+    log_progress(
+        {
+            "type": "code",
+            "status": "completed" if result.success else "failed",
+            "payload": {
+                "code": DefaultImports.prepend_imports(code),
+                "test": test,
+                "result": result.to_json(),
+            },
+        }
+    )
+    if verbosity == 2:
+        _print_code("Code and test after attempted fix:", code, test)
+        _LOGGER.info(
+            f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
+        )
+
+    return code, test, result
+
+
 def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
     _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
     _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
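The notable behavioral change in the extracted `debug_code` helper is the bounded retry around `extract_json`: a malformed LMM reply no longer aborts the whole debugging pass. The pattern in isolation (a sketch; `ask_debugger` is a hypothetical stand-in for the `debugger(FIX_BUG.format(...))` call):

```python
import json
import logging

log = logging.getLogger(__name__)

def ask_debugger() -> str:
    # hypothetical stand-in for the LMM debugger call
    return '{"code": "", "test": "", "reflections": "retry worked"}'

# Same shape as debug_code: up to three attempts, defaults kept on total failure.
fixed = {"code": "", "test": "", "reflections": ""}
for attempt in range(3):
    try:
        fixed = json.loads(ask_debugger())
        break
    except Exception:
        log.exception("Could not parse debugger reply (attempt %d)", attempt + 1)

print(fixed["reflections"])
```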
@@ -386,12 +437,12 @@ def retrieve_tools(
         {
             "type": "tools",
             "status": "completed",
-            "payload": tool_list,
+            "payload": list({v["description"]: v for v in tool_list}.values()),
         }
     )
 
     if verbosity == 2:
-        tool_desc_str = "\n".join(tool_desc)
+        tool_desc_str = "\n".join(set(tool_desc))
         _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
     tool_info_set = set(tool_info)
     return "\n\n".join(tool_info_set)
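The dict-comprehension idiom deduplicates tools by description while preserving insertion order (later duplicates overwrite earlier ones), which a plain `set` could not do on unhashable dicts. Illustrated on toy data:

```python
tool_list = [
    {"description": "detect objects", "name": "owl_v2"},
    {"description": "caption image", "name": "blip_image_caption"},
    {"description": "detect objects", "name": "owl_v2"},  # duplicate entry
]

# Keyed by description: one entry per description, order preserved.
unique = list({t["description"]: t for t in tool_list}.values())
print([t["name"] for t in unique])  # ['owl_v2', 'blip_image_caption']
```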
@@ -481,6 +532,7 @@ class VisionAgent(Agent):
         results.pop("working_memory")
         return results  # type: ignore
 
+    @traceable
     def chat_with_workflow(
         self,
         chat: List[Message],
--- vision_agent-0.2.58/vision_agent/agent/vision_agent_prompts.py
+++ vision_agent-0.2.78/vision_agent/agent/vision_agent_prompts.py
@@ -179,6 +179,8 @@ This is the documentation for the functions you have access to. You may call any
 8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
 9. DO NOT import the testing function as it will available in the testing environment.
 10. Print the output of the function that is being tested.
+11. Use the output of the function that is being tested as the return value of the testing function.
+12. Run the testing function in the end and don't assign a variable to its output.
 """
 
 
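Rules 10-12 pin down the exact shape of the generated test: print the result, return it, and invoke the test bare so the final expression's value is the output. A toy test following all three rules (function names hypothetical, not from the package):

```python
def detect_jars(image_path: str) -> list[dict]:
    # hypothetical function under test
    return [{"label": "jar", "score": 0.99}]

def test_detect_jars():
    result = detect_jars("jar.jpg")
    print(result)      # rule 10: print the output
    return result      # rule 11: return the output

test_detect_jars()     # rule 12: call it last, no assignment
```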
--- /dev/null
+++ vision_agent-0.2.78/vision_agent/lmm/__init__.py
@@ -0,0 +1 @@
+from .lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
--- vision_agent-0.2.58/vision_agent/lmm/lmm.py
+++ vision_agent-0.2.78/vision_agent/lmm/lmm.py
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
+import requests
 from openai import AzureOpenAI, OpenAI
 
 import vision_agent.tools as T
@@ -163,6 +164,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
@@ -178,7 +180,7 @@ class OpenAILMM(LMM):
         return lambda x: T.clip(x, params["prompt"])
 
     def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.
+        api_doc = T.get_tool_documentation([T.owl_v2])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,
@@ -186,6 +188,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
@@ -198,7 +201,7 @@ class OpenAILMM(LMM):
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x: T.
+        return lambda x: T.owl_v2(params["prompt"], x)
 
     def generate_segmentor(self, question: str) -> Callable:
         api_doc = T.get_tool_documentation([T.grounding_sam])
@@ -209,6 +212,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
@@ -224,16 +228,16 @@ class OpenAILMM(LMM):
         return lambda x: T.grounding_sam(params["prompt"], x)
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
-        return T.
+        return T.loca_zero_shot_counting
 
     def generate_image_qa_tool(self, question: str) -> Callable:
-        return lambda x: T.
+        return lambda x: T.git_vqa_v2(question, x)
 
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
         self,
-        model_name: str =
+        model_name: Optional[str] = None,
         api_key: Optional[str] = None,
         api_version: str = "2024-02-01",
         azure_endpoint: Optional[str] = None,
@@ -245,14 +249,20 @@ class AzureOpenAILMM(OpenAILMM):
             api_key = os.getenv("AZURE_OPENAI_API_KEY")
         if not azure_endpoint:
             azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+        if not model_name:
+            model_name = os.getenv("AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME")
 
         if not api_key:
             raise ValueError("OpenAI API key is required.")
         if not azure_endpoint:
             raise ValueError("Azure OpenAI endpoint is required.")
+        if not model_name:
+            raise ValueError("Azure OpenAI chat model deployment name is required.")
 
         self.client = AzureOpenAI(
-            api_key=api_key,
+            api_key=api_key,
+            api_version=api_version,
+            azure_endpoint=azure_endpoint,
         )
         self.model_name = model_name
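`response_format={"type": "json_object"}` asks the OpenAI chat API to emit syntactically valid JSON, removing one class of decode failures in the `generate_*` helpers. A hedged sketch of the call shape (the model name is an assumption; the diff uses `self.model_name`):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-4o",  # assumption
    messages=[
        # JSON mode requires the word "json" to appear in a message
        {"role": "system", "content": "Reply in JSON."},
        {"role": "user", "content": "Pick a prompt for a jar detector."},
    ],
    response_format={"type": "json_object"},
)
print(response.choices[0].message.content)  # parseable JSON
```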
@@ -261,3 +271,84 @@ class AzureOpenAILMM(OpenAILMM):
         if json_mode:
             kwargs["response_format"] = {"type": "json_object"}
         self.kwargs = kwargs
+
+
+class OllamaLMM(LMM):
+    r"""An LMM class for the ollama."""
+
+    def __init__(
+        self,
+        model_name: str = "llava",
+        base_url: Optional[str] = "http://localhost:11434/api",
+        json_mode: bool = False,
+        **kwargs: Any,
+    ):
+        self.url = base_url
+        self.model_name = model_name
+        self.json_mode = json_mode
+        self.stream = False
+
+    def __call__(
+        self,
+        input: Union[str, List[Message]],
+    ) -> str:
+        if isinstance(input, str):
+            return self.generate(input)
+        return self.chat(input)
+
+    def chat(
+        self,
+        chat: List[Message],
+    ) -> str:
+        """Chat with the LMM model.
+
+        Parameters:
+            chat (List[Dict[str, str]]): A list of dictionaries containing the chat
+                messages. The messages can be in the format:
+                [{"role": "user", "content": "Hello!"}, ...]
+                or if it contains media, it should be in the format:
+                [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
+        """
+        fixed_chat = []
+        for message in chat:
+            if "media" in message:
+                message["images"] = [encode_image(m) for m in message["media"]]
+                del message["media"]
+            fixed_chat.append(message)
+        url = f"{self.url}/chat"
+        model = self.model_name
+        messages = fixed_chat
+        data = {"model": model, "messages": messages, "stream": self.stream}
+        json_data = json.dumps(data)
+        response = requests.post(url, data=json_data)
+        if response.status_code != 200:
+            raise ValueError(f"Request failed with status code {response.status_code}")
+        response = response.json()
+        return response["message"]["content"]  # type: ignore
+
+    def generate(
+        self,
+        prompt: str,
+        media: Optional[List[Union[str, Path]]] = None,
+    ) -> str:
+
+        url = f"{self.url}/generate"
+        data = {
+            "model": self.model_name,
+            "prompt": prompt,
+            "images": [],
+            "stream": self.stream,
+        }
+
+        json_data = json.dumps(data)
+        if media and len(media) > 0:
+            for m in media:
+                data["images"].append(encode_image(m))  # type: ignore
+
+        response = requests.post(url, data=json_data)
+
+        if response.status_code != 200:
+            raise ValueError(f"Request failed with status code {response.status_code}")
+
+        response = response.json()
+        return response["response"]  # type: ignore
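`OllamaLMM` talks to Ollama's plain HTTP API, so everything runs locally with no cloud key. A usage sketch, assuming an Ollama server on the default port with the `llava` model already pulled:

```python
from vision_agent.lmm import OllamaLMM

# assumes `ollama serve` is running and `ollama pull llava` has been done
lmm = OllamaLMM(model_name="llava", base_url="http://localhost:11434/api")

# plain text generation via __call__ -> generate
print(lmm("Describe what a jar detector should look for."))

# grounding on an image; media paths are base64-encoded into the request
print(lmm.generate("What objects are on the table?", media=["table.jpg"]))
```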
--- vision_agent-0.2.58/vision_agent/tools/__init__.py
+++ vision_agent-0.2.78/vision_agent/tools/__init__.py
@@ -7,25 +7,37 @@ from .tools import (
     TOOLS,
     TOOLS_DF,
     UTILITIES_DOCSTRING,
+    blip_image_caption,
     clip,
     closest_box_distance,
     closest_mask_distance,
     extract_frames,
+    florencev2_image_caption,
     get_tool_documentation,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
+    git_vqa_v2,
     grounding_dino,
     grounding_sam,
-
-    image_question_answering,
+    florencev2_roberta_vqa,
     load_image,
+    loca_visual_prompt_counting,
+    loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
+    owl_v2,
     save_image,
     save_json,
     save_video,
-
-
+    template_match,
+    vit_image_classification,
+    vit_nsfw_classification,
 )
 
 __new_tools__ = [