vision-agent 0.2.58__tar.gz → 0.2.78__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {vision_agent-0.2.58 → vision_agent-0.2.78}/PKG-INFO +38 -7
  2. {vision_agent-0.2.58 → vision_agent-0.2.78}/README.md +33 -4
  3. {vision_agent-0.2.58 → vision_agent-0.2.78}/pyproject.toml +5 -3
  4. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/vision_agent.py +118 -66
  5. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/vision_agent_prompts.py +2 -0
  6. vision_agent-0.2.78/vision_agent/lmm/__init__.py +1 -0
  7. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/lmm/lmm.py +97 -6
  8. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/__init__.py +16 -4
  9. vision_agent-0.2.78/vision_agent/tools/tool_utils.py +67 -0
  10. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/tools.py +517 -35
  11. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/execute.py +40 -27
  12. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/sim.py +7 -1
  13. vision_agent-0.2.58/vision_agent/lmm/__init__.py +0 -1
  14. vision_agent-0.2.58/vision_agent/tools/tool_utils.py +0 -30
  15. {vision_agent-0.2.58 → vision_agent-0.2.78}/LICENSE +0 -0
  16. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/__init__.py +0 -0
  17. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/__init__.py +0 -0
  18. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/agent/agent.py +0 -0
  19. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/tools/prompts.py +0 -0
  22. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/__init__.py +0 -0
  23. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/image_utils.py +0 -0
  24. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/type_defs.py +0 -0
  25. {vision_agent-0.2.58 → vision_agent-0.2.78}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.58
+ Version: 0.2.78
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -9,8 +9,8 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
- Requires-Dist: e2b (>=0.17.0,<0.18.0)
- Requires-Dist: e2b-code-interpreter (>=0.0.7,<0.0.8)
+ Requires-Dist: e2b (>=0.17.1,<0.18.0)
+ Requires-Dist: e2b-code-interpreter (==0.0.11a1)
  Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
  Requires-Dist: langsmith (>=0.1.58,<0.2.0)
  Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -21,7 +21,9 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
  Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+ Requires-Dist: pytube (==15.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
  Requires-Dist: rich (>=13.7.1,<14.0.0)
  Requires-Dist: scipy (>=1.13.0,<1.14.0)
@@ -76,6 +78,9 @@ using Azure OpenAI please see the Azure setup section):
  export OPENAI_API_KEY="your-api-key"
  ```

+ ### Important Note on API Usage
+ Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
+
  ### Vision Agent
  #### Basic Usage
  You can interact with the agent as you would with any LLM or LMM model:
@@ -178,8 +183,8 @@ you. For example:

  ```python
  >>> import vision_agent as va
- >>> llm = va.llm.OpenAILMM()
- >>> detector = llm.generate_detector("Can you build a jar detector for me?")
+ >>> lmm = va.lmm.OpenAILMM()
+ >>> detector = lmm.generate_detector("Can you build a jar detector for me?")
  >>> detector(va.tools.load_image("jar.jpg"))
  [{"labels": ["jar",],
  "scores": [0.99],
@@ -218,18 +223,44 @@ ensure the documentation is in the same format above with description, `Paramete
  `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).

  ### Azure Setup
- If you want to use Azure OpenAI models, you can set the environment variable:
+ If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
+
+ 1. OpenAI GPT-4o model
+ 2. OpenAI text embedding model
+
+ <img width="1201" alt="Screenshot 2024-06-12 at 5 54 48 PM" src="https://github.com/landing-ai/vision-agent/assets/2736300/da125592-b01d-45bc-bc99-d48c9dcdfa32">
+
+ Then you can set the following environment variables:

  ```bash
  export AZURE_OPENAI_API_KEY="your-api-key"
  export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ # The deployment name of your Azure OpenAI chat model
+ export AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME="your_gpt4o_model_deployment_name"
+ # The deployment name of your Azure OpenAI text embedding model
+ export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deployment_name"
  ```

+ > NOTE: make sure your Azure model deployment have enough quota (token per minute) to support it. The default value 8000TPM is not enough.
+
  You can then run Vision Agent using the Azure OpenAI models:

  ```python
  import vision_agent as va
- import vision_agent.tools as T
  agent = va.agent.AzureVisionAgent()
  ```

+ ******************************************************************************************************************************
+
+ ### Q&A
+
+ #### How to get started with OpenAI API credits
+
+ 1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+ 2. Follow the instructions to purchase and manage your API credits.
+ 3. Ensure your API key is correctly configured in your project settings.
+
+ Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
+
+ For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
+
@@ -40,6 +40,9 @@ using Azure OpenAI please see the Azure setup section):
  export OPENAI_API_KEY="your-api-key"
  ```

+ ### Important Note on API Usage
+ Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
+
  ### Vision Agent
  #### Basic Usage
  You can interact with the agent as you would with any LLM or LMM model:
@@ -142,8 +145,8 @@ you. For example:

  ```python
  >>> import vision_agent as va
- >>> llm = va.llm.OpenAILMM()
- >>> detector = llm.generate_detector("Can you build a jar detector for me?")
+ >>> lmm = va.lmm.OpenAILMM()
+ >>> detector = lmm.generate_detector("Can you build a jar detector for me?")
  >>> detector(va.tools.load_image("jar.jpg"))
  [{"labels": ["jar",],
  "scores": [0.99],
@@ -182,17 +185,43 @@ ensure the documentation is in the same format above with description, `Paramete
  `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/).

  ### Azure Setup
- If you want to use Azure OpenAI models, you can set the environment variable:
+ If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
+
+ 1. OpenAI GPT-4o model
+ 2. OpenAI text embedding model
+
+ <img width="1201" alt="Screenshot 2024-06-12 at 5 54 48 PM" src="https://github.com/landing-ai/vision-agent/assets/2736300/da125592-b01d-45bc-bc99-d48c9dcdfa32">
+
+ Then you can set the following environment variables:

  ```bash
  export AZURE_OPENAI_API_KEY="your-api-key"
  export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ # The deployment name of your Azure OpenAI chat model
+ export AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME="your_gpt4o_model_deployment_name"
+ # The deployment name of your Azure OpenAI text embedding model
+ export AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME="your_embedding_model_deployment_name"
  ```

+ > NOTE: make sure your Azure model deployment have enough quota (token per minute) to support it. The default value 8000TPM is not enough.
+
  You can then run Vision Agent using the Azure OpenAI models:

  ```python
  import vision_agent as va
- import vision_agent.tools as T
  agent = va.agent.AzureVisionAgent()
  ```
+
+ ******************************************************************************************************************************
+
+ ### Q&A
+
+ #### How to get started with OpenAI API credits
+
+ 1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
+ 2. Follow the instructions to purchase and manage your API credits.
+ 3. Ensure your API key is correctly configured in your project settings.
+
+ Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
+
+ For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
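
For orientation beyond the diff itself, here is a minimal sketch of how the two new Azure variables are consumed, based on the `AzureOpenAILMM` changes in the `lmm.py` hunks further down. The endpoint and deployment names are placeholders you must replace with your own.

```python
import os

# Placeholders — substitute your own resource endpoint and deployment names.
os.environ["AZURE_OPENAI_API_KEY"] = "your-api-key"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://your-resource.openai.azure.com/"
os.environ["AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME"] = "your_gpt4o_model_deployment_name"
os.environ["AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME"] = "your_embedding_model_deployment_name"

import vision_agent as va

# As of 0.2.78, AzureOpenAILMM falls back to the chat deployment name from the
# environment when model_name is not passed explicitly (see the lmm.py hunk below).
lmm = va.lmm.AzureOpenAILMM()
agent = va.agent.AzureVisionAgent()
```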
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.58"
+ version = "0.2.78"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -34,9 +34,11 @@ nbformat = "^5.10.4"
  rich = "^13.7.1"
  langsmith = "^0.1.58"
  ipykernel = "^6.29.4"
- e2b = "^0.17.0"
- e2b-code-interpreter = "^0.0.7"
+ e2b = "^0.17.1"
+ e2b-code-interpreter = "0.0.11a1"
  tenacity = "^8.3.0"
+ pillow-heif = "^0.16.0"
+ pytube = "15.0.0"

  [tool.poetry.group.dev.dependencies]
  autoflake = "1.*"
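
A brief, hedged illustration of why `pillow-heif` enters the dependency set: registering its opener lets Pillow load HEIC/HEIF images, which is exactly what the new `DefaultImports` entries in the `vision_agent.py` hunk below inject into generated code. The file name here is a placeholder. `pytube`, a YouTube download client, is pinned alongside it.

```python
# Illustrative only: pillow-heif extends Pillow with HEIF/HEIC support.
from pillow_heif import register_heif_opener
from PIL import Image

register_heif_opener()          # same call the generated-code imports now make
img = Image.open("photo.heic")  # placeholder path; readable once the opener is registered
print(img.size)
```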
@@ -7,6 +7,7 @@ import tempfile
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast

+ from langsmith import traceable
  from PIL import Image
  from rich.console import Console
  from rich.style import Style
@@ -42,6 +43,8 @@ class DefaultImports:

  common_imports = [
  "from typing import *",
+ "from pillow_heif import register_heif_opener",
+ "register_heif_opener()",
  ]

  @staticmethod
@@ -96,6 +99,7 @@ def extract_json(json_str: str) -> Dict[str, Any]:
  try:
  json_dict = json.loads(json_str)
  except json.JSONDecodeError:
+ input_json_str = json_str
  if "```json" in json_str:
  json_str = json_str[json_str.find("```json") + len("```json") :]
  json_str = json_str[: json_str.find("```")]
@@ -103,7 +107,12 @@ def extract_json(json_str: str) -> Dict[str, Any]:
  json_str = json_str[json_str.find("```") + len("```") :]
  # get the last ``` not one from an intermediate string
  json_str = json_str[: json_str.find("}```")]
- json_dict = json.loads(json_str)
+ try:
+ json_dict = json.loads(json_str)
+ except json.JSONDecodeError as e:
+ error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
+ _LOGGER.exception(error_msg)
+ raise ValueError(error_msg) from e
  return json_dict # type: ignore

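A hedged illustration (not part of the package) of the behaviour this hardened `extract_json` provides: a fenced json code block is stripped before parsing, and a parse failure now raises a `ValueError` that carries the original model output.

```python
from vision_agent.agent.vision_agent import extract_json

fence = "`" * 3  # avoid writing a literal triple backtick inside this example
reply = "Here is the plan:\n" + fence + "json\n" + '{"plan": [{"instructions": "load the image"}]}' + "\n" + fence
print(extract_json(reply)["plan"])  # the fenced JSON is stripped and parsed

try:
    extract_json("sorry, I could not produce JSON")
except ValueError as err:
    print(err)  # the new error message includes the raw model output
```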
@@ -130,6 +139,7 @@ def extract_image(
  return new_media


+ @traceable
  def write_plan(
  chat: List[Message],
  tool_desc: str,
@@ -147,6 +157,7 @@ def write_plan(
  return extract_json(model.chat(chat))["plan"] # type: ignore


+ @traceable
  def write_code(
  coder: LMM,
  chat: List[Message],
@@ -167,6 +178,7 @@ def write_code(
  return extract_code(coder(chat))


+ @traceable
  def write_test(
  tester: LMM,
  chat: List[Message],
@@ -191,6 +203,7 @@ def write_test(
  return extract_code(tester(chat))


+ @traceable
  def reflect(
  chat: List[Message],
  plan: str,
@@ -266,70 +279,19 @@ def write_and_test_code(
  count = 0
  new_working_memory: List[Dict[str, str]] = []
  while not result.success and count < max_retries:
- log_progress(
- {
- "type": "code",
- "status": "started",
- }
- )
- fixed_code_and_test = extract_json(
- debugger(
- FIX_BUG.format(
- code=code,
- tests=test,
- result="\n".join(result.text().splitlines()[-50:]),
- feedback=format_memory(working_memory + new_working_memory),
- )
- )
- )
- old_code = code
- old_test = test
-
- if fixed_code_and_test["code"].strip() != "":
- code = extract_code(fixed_code_and_test["code"])
- if fixed_code_and_test["test"].strip() != "":
- test = extract_code(fixed_code_and_test["test"])
-
- new_working_memory.append(
- {
- "code": f"{code}\n{test}",
- "feedback": fixed_code_and_test["reflections"],
- "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
- }
- )
- log_progress(
- {
- "type": "code",
- "status": "running",
- "payload": {
- "code": DefaultImports.prepend_imports(code),
- "test": test,
- },
- }
- )
-
- result = code_interpreter.exec_isolation(
- f"{DefaultImports.to_code_string()}\n{code}\n{test}"
- )
- log_progress(
- {
- "type": "code",
- "status": "completed" if result.success else "failed",
- "payload": {
- "code": DefaultImports.prepend_imports(code),
- "test": test,
- "result": result.to_json(),
- },
- }
- )
  if verbosity == 2:
- _LOGGER.info(
- f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}"
- )
- _print_code("Code and test after attempted fix:", code, test)
- _LOGGER.info(
- f"Code execution result after attempted fix: {result.text(include_logs=True)}"
- )
+ _LOGGER.info(f"Start debugging attempt {count + 1}")
+ code, test, result = debug_code(
+ working_memory,
+ debugger,
+ code_interpreter,
+ code,
+ test,
+ result,
+ new_working_memory,
+ log_progress,
+ verbosity,
+ )
  count += 1

  if verbosity >= 1:
@@ -344,6 +306,95 @@ def write_and_test_code(
  }


+ @traceable
+ def debug_code(
+ working_memory: List[Dict[str, str]],
+ debugger: LMM,
+ code_interpreter: CodeInterpreter,
+ code: str,
+ test: str,
+ result: Execution,
+ new_working_memory: List[Dict[str, str]],
+ log_progress: Callable[[Dict[str, Any]], None],
+ verbosity: int = 0,
+ ) -> tuple[str, str, Execution]:
+ log_progress(
+ {
+ "type": "code",
+ "status": "started",
+ }
+ )
+
+ fixed_code_and_test = {"code": "", "test": "", "reflections": ""}
+ success = False
+ count = 0
+ while not success and count < 3:
+ try:
+ fixed_code_and_test = extract_json(
+ debugger(
+ FIX_BUG.format(
+ code=code,
+ tests=test,
+ result="\n".join(result.text().splitlines()[-50:]),
+ feedback=format_memory(working_memory + new_working_memory),
+ )
+ )
+ )
+ success = True
+ except Exception as e:
+ _LOGGER.exception(f"Error while extracting JSON: {e}")
+
+ count += 1
+
+ old_code = code
+ old_test = test
+
+ if fixed_code_and_test["code"].strip() != "":
+ code = extract_code(fixed_code_and_test["code"])
+ if fixed_code_and_test["test"].strip() != "":
+ test = extract_code(fixed_code_and_test["test"])
+
+ new_working_memory.append(
+ {
+ "code": f"{code}\n{test}",
+ "feedback": fixed_code_and_test["reflections"],
+ "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
+ }
+ )
+ log_progress(
+ {
+ "type": "code",
+ "status": "running",
+ "payload": {
+ "code": DefaultImports.prepend_imports(code),
+ "test": test,
+ },
+ }
+ )
+
+ result = code_interpreter.exec_isolation(
+ f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+ )
+ log_progress(
+ {
+ "type": "code",
+ "status": "completed" if result.success else "failed",
+ "payload": {
+ "code": DefaultImports.prepend_imports(code),
+ "test": test,
+ "result": result.to_json(),
+ },
+ }
+ )
+ if verbosity == 2:
+ _print_code("Code and test after attempted fix:", code, test)
+ _LOGGER.info(
+ f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}"
+ )
+
+ return code, test, result
+
+
  def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
  _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
  _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
@@ -386,12 +437,12 @@ def retrieve_tools(
  {
  "type": "tools",
  "status": "completed",
- "payload": tool_list,
+ "payload": list({v["description"]: v for v in tool_list}.values()),
  }
  )

  if verbosity == 2:
- tool_desc_str = "\n".join(tool_desc)
+ tool_desc_str = "\n".join(set(tool_desc))
  _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
  tool_info_set = set(tool_info)
  return "\n\n".join(tool_info_set)
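
A small illustration, with made-up data, of the de-duplication idiom introduced here: keying a dict comprehension by description keeps one entry per unique tool before the list is sent in the progress payload.

```python
tool_list = [
    {"name": "owl_v2", "description": "Detect objects given a text prompt."},
    {"name": "owl_v2", "description": "Detect objects given a text prompt."},  # duplicate hit
    {"name": "ocr", "description": "Extract text from an image."},
]
# Later duplicates overwrite earlier ones under the same description key.
deduped = list({v["description"]: v for v in tool_list}.values())
print(len(deduped))  # 2
```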
@@ -481,6 +532,7 @@ class VisionAgent(Agent):
  results.pop("working_memory")
  return results # type: ignore

+ @traceable
  def chat_with_workflow(
  self,
  chat: List[Message],
@@ -179,6 +179,8 @@ This is the documentation for the functions you have access to. You may call any
  8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
  9. DO NOT import the testing function as it will available in the testing environment.
  10. Print the output of the function that is being tested.
+ 11. Use the output of the function that is being tested as the return value of the testing function.
+ 12. Run the testing function in the end and don't assign a variable to its output.
  """

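A hypothetical example of a test that satisfies the two rules added to this prompt; `count_jars` is a stand-in for whatever function the coder generated.

```python
def count_jars(image_path: str) -> int:
    # stand-in for the generated function under test
    return 3

def test_count_jars():
    result = count_jars("jar.jpg")  # placeholder image path
    print(result)                   # rule 10: print the output
    return result                   # rule 11: return the tested function's output

test_count_jars()                   # rule 12: invoked last, result not assigned
```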
@@ -0,0 +1 @@
+ from .lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
  from pathlib import Path
  from typing import Any, Callable, Dict, List, Optional, Union, cast

+ import requests
  from openai import AzureOpenAI, OpenAI

  import vision_agent.tools as T
@@ -163,6 +164,7 @@ class OpenAILMM(LMM):
  {"role": "system", "content": SYSTEM_PROMPT},
  {"role": "user", "content": prompt},
  ],
+ response_format={"type": "json_object"},
  )

  try:
@@ -178,7 +180,7 @@ class OpenAILMM(LMM):
  return lambda x: T.clip(x, params["prompt"])

  def generate_detector(self, question: str) -> Callable:
- api_doc = T.get_tool_documentation([T.grounding_dino])
+ api_doc = T.get_tool_documentation([T.owl_v2])
  prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
  response = self.client.chat.completions.create(
  model=self.model_name,
@@ -186,6 +188,7 @@ class OpenAILMM(LMM):
  {"role": "system", "content": SYSTEM_PROMPT},
  {"role": "user", "content": prompt},
  ],
+ response_format={"type": "json_object"},
  )

  try:
@@ -198,7 +201,7 @@ class OpenAILMM(LMM):
  )
  raise ValueError("Failed to decode response")

- return lambda x: T.grounding_dino(params["prompt"], x)
+ return lambda x: T.owl_v2(params["prompt"], x)

  def generate_segmentor(self, question: str) -> Callable:
  api_doc = T.get_tool_documentation([T.grounding_sam])
@@ -209,6 +212,7 @@ class OpenAILMM(LMM):
  {"role": "system", "content": SYSTEM_PROMPT},
  {"role": "user", "content": prompt},
  ],
+ response_format={"type": "json_object"},
  )

  try:
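
For context on the three `response_format` additions above, here is a hedged sketch of the underlying OpenAI call: constraining the completion to a valid JSON object makes the subsequent `json.loads` of the returned parameters far less likely to fail. It assumes `OPENAI_API_KEY` is set.

```python
from openai import OpenAI

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Return detector parameters as a JSON object with a 'prompt' key."}],
    response_format={"type": "json_object"},  # forces a parseable JSON object
)
print(response.choices[0].message.content)
```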
@@ -224,16 +228,16 @@ class OpenAILMM(LMM):
  return lambda x: T.grounding_sam(params["prompt"], x)

  def generate_zero_shot_counter(self, question: str) -> Callable:
- return T.zero_shot_counting
+ return T.loca_zero_shot_counting

  def generate_image_qa_tool(self, question: str) -> Callable:
- return lambda x: T.image_question_answering(question, x)
+ return lambda x: T.git_vqa_v2(question, x)


  class AzureOpenAILMM(OpenAILMM):
  def __init__(
  self,
- model_name: str = "gpt-4o",
+ model_name: Optional[str] = None,
  api_key: Optional[str] = None,
  api_version: str = "2024-02-01",
  azure_endpoint: Optional[str] = None,
@@ -245,14 +249,20 @@ class AzureOpenAILMM(OpenAILMM):
  api_key = os.getenv("AZURE_OPENAI_API_KEY")
  if not azure_endpoint:
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
+ if not model_name:
+ model_name = os.getenv("AZURE_OPENAI_CHAT_MODEL_DEPLOYMENT_NAME")

  if not api_key:
  raise ValueError("OpenAI API key is required.")
  if not azure_endpoint:
  raise ValueError("Azure OpenAI endpoint is required.")
+ if not model_name:
+ raise ValueError("Azure OpenAI chat model deployment name is required.")

  self.client = AzureOpenAI(
- api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+ api_key=api_key,
+ api_version=api_version,
+ azure_endpoint=azure_endpoint,
  )
  self.model_name = model_name

@@ -261,3 +271,84 @@ class AzureOpenAILMM(OpenAILMM):
  if json_mode:
  kwargs["response_format"] = {"type": "json_object"}
  self.kwargs = kwargs
+
+
+ class OllamaLMM(LMM):
+ r"""An LMM class for the ollama."""
+
+ def __init__(
+ self,
+ model_name: str = "llava",
+ base_url: Optional[str] = "http://localhost:11434/api",
+ json_mode: bool = False,
+ **kwargs: Any,
+ ):
+ self.url = base_url
+ self.model_name = model_name
+ self.json_mode = json_mode
+ self.stream = False
+
+ def __call__(
+ self,
+ input: Union[str, List[Message]],
+ ) -> str:
+ if isinstance(input, str):
+ return self.generate(input)
+ return self.chat(input)
+
+ def chat(
+ self,
+ chat: List[Message],
+ ) -> str:
+ """Chat with the LMM model.
+
+ Parameters:
+ chat (List[Dict[str, str]]): A list of dictionaries containing the chat
+ messages. The messages can be in the format:
+ [{"role": "user", "content": "Hello!"}, ...]
+ or if it contains media, it should be in the format:
+ [{"role": "user", "content": "Hello!", "media": ["image1.jpg", ...]}, ...]
+ """
+ fixed_chat = []
+ for message in chat:
+ if "media" in message:
+ message["images"] = [encode_image(m) for m in message["media"]]
+ del message["media"]
+ fixed_chat.append(message)
+ url = f"{self.url}/chat"
+ model = self.model_name
+ messages = fixed_chat
+ data = {"model": model, "messages": messages, "stream": self.stream}
+ json_data = json.dumps(data)
+ response = requests.post(url, data=json_data)
+ if response.status_code != 200:
+ raise ValueError(f"Request failed with status code {response.status_code}")
+ response = response.json()
+ return response["message"]["content"] # type: ignore
+
+ def generate(
+ self,
+ prompt: str,
+ media: Optional[List[Union[str, Path]]] = None,
+ ) -> str:
+
+ url = f"{self.url}/generate"
+ data = {
+ "model": self.model_name,
+ "prompt": prompt,
+ "images": [],
+ "stream": self.stream,
+ }
+
+ json_data = json.dumps(data)
+ if media and len(media) > 0:
+ for m in media:
+ data["images"].append(encode_image(m)) # type: ignore
+
+ response = requests.post(url, data=json_data)
+
+ if response.status_code != 200:
+ raise ValueError(f"Request failed with status code {response.status_code}")
+
+ response = response.json()
+ return response["response"] # type: ignore
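
A hedged usage sketch for the new `OllamaLMM` class: it assumes an Ollama server is running at the default local address with the `llava` model pulled, and the prompt and image path are placeholders.

```python
from vision_agent.lmm import OllamaLMM

lmm = OllamaLMM(model_name="llava", base_url="http://localhost:11434/api")

# A plain string is routed to generate()
print(lmm("Describe what a jar detector should look for."))

# Chat-style input is routed to chat(); media entries are encoded with encode_image
print(lmm.chat([{"role": "user", "content": "What is in this image?", "media": ["jar.jpg"]}]))
```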
@@ -7,25 +7,37 @@ from .tools import (
  TOOLS,
  TOOLS_DF,
  UTILITIES_DOCSTRING,
+ blip_image_caption,
  clip,
  closest_box_distance,
  closest_mask_distance,
  extract_frames,
+ florencev2_image_caption,
  get_tool_documentation,
+ florencev2_object_detection,
+ detr_segmentation,
+ depth_anything_v2,
+ generate_soft_edge_image,
+ dpt_hybrid_midas,
+ generate_pose_image,
+ git_vqa_v2,
  grounding_dino,
  grounding_sam,
- image_caption,
- image_question_answering,
+ florencev2_roberta_vqa,
  load_image,
+ loca_visual_prompt_counting,
+ loca_zero_shot_counting,
  ocr,
  overlay_bounding_boxes,
  overlay_heat_map,
  overlay_segmentation_masks,
+ owl_v2,
  save_image,
  save_json,
  save_video,
- visual_prompt_counting,
- zero_shot_counting,
+ template_match,
+ vit_image_classification,
+ vit_nsfw_classification,
  )

  __new_tools__ = [
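
To close out the renames in this hunk, a hedged migration sketch: the detector and VQA calls follow the usage shown in the `lmm.py` changes above (`owl_v2` and `git_vqa_v2` take the prompt first, then the image), the counting call assumes the same single-image signature as the old `zero_shot_counting`, the image path is a placeholder, and the commented mapping is inferred from the export list.

```python
import vision_agent.tools as T

# Inferred old -> new mapping from this export list:
#   image_caption            -> blip_image_caption / florencev2_image_caption
#   image_question_answering -> git_vqa_v2 / florencev2_roberta_vqa
#   zero_shot_counting       -> loca_zero_shot_counting
#   visual_prompt_counting   -> loca_visual_prompt_counting
image = T.load_image("jar.jpg")                            # placeholder path
detections = T.owl_v2("jar", image)                        # new default detector
answer = T.git_vqa_v2("How many jars are there?", image)   # replaces image_question_answering
count = T.loca_zero_shot_counting(image)                   # replaces zero_shot_counting
```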