vision-agent 0.2.29__tar.gz → 0.2.31__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. vision_agent-0.2.31/PKG-INFO +175 -0
  2. vision_agent-0.2.31/README.md +141 -0
  3. {vision_agent-0.2.29 → vision_agent-0.2.31}/pyproject.toml +1 -1
  4. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/__init__.py +2 -2
  5. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent.py +2 -2
  6. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent_coder.py +8 -8
  7. vision_agent-0.2.29/vision_agent/agent/vision_agent_v2.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter.py +12 -12
  8. vision_agent-0.2.29/vision_agent/agent/vision_agent_v2_prompts.py → vision_agent-0.2.31/vision_agent/agent/data_interpreter_prompts.py +3 -3
  9. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/easytool.py +8 -8
  10. vision_agent-0.2.29/vision_agent/agent/vision_agent.py → vision_agent-0.2.31/vision_agent/agent/easytool_v2.py +20 -20
  11. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/reflexion.py +8 -8
  12. vision_agent-0.2.29/vision_agent/agent/vision_agent_v3.py → vision_agent-0.2.31/vision_agent/agent/vision_agent.py +78 -17
  13. vision_agent-0.2.29/vision_agent/agent/vision_agent_v3_prompts.py → vision_agent-0.2.31/vision_agent/agent/vision_agent_prompts.py +13 -5
  14. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/llm/llm.py +3 -4
  15. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/lmm/lmm.py +6 -6
  16. vision_agent-0.2.31/vision_agent/tools/__init__.py +24 -0
  17. vision_agent-0.2.29/PKG-INFO +0 -226
  18. vision_agent-0.2.29/README.md +0 -192
  19. vision_agent-0.2.29/vision_agent/tools/__init__.py +0 -25
  20. {vision_agent-0.2.29 → vision_agent-0.2.31}/LICENSE +0 -0
  21. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/__init__.py +0 -0
  22. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/agent_coder_prompts.py +0 -0
  23. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/easytool_prompts.py +0 -0
  24. vision_agent-0.2.29/vision_agent/agent/vision_agent_prompts.py → vision_agent-0.2.31/vision_agent/agent/easytool_v2_prompts.py +0 -0
  25. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/agent/reflexion_prompts.py +0 -0
  26. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/fonts/__init__.py +0 -0
  27. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  28. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/llm/__init__.py +0 -0
  29. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/lmm/__init__.py +0 -0
  30. vision_agent-0.2.29/vision_agent/tools/tools.py → vision_agent-0.2.31/vision_agent/tools/easytool_tools.py +0 -0
  31. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/tools/prompts.py +0 -0
  32. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/tools/tool_utils.py +0 -0
  33. vision_agent-0.2.29/vision_agent/tools/tools_v2.py → vision_agent-0.2.31/vision_agent/tools/tools.py +0 -0
  34. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/__init__.py +0 -0
  35. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/execute.py +0 -0
  36. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/image_utils.py +0 -0
  37. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/sim.py +0 -0
  38. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/type_defs.py +0 -0
  39. {vision_agent-0.2.29 → vision_agent-0.2.31}/vision_agent/utils/video.py +0 -0
@@ -0,0 +1,175 @@
+ Metadata-Version: 2.1
+ Name: vision-agent
+ Version: 0.2.31
+ Summary: Toolset for Vision Agent
+ Author: Landing AI
+ Author-email: dev@landing.ai
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+ Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+ Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+ Requires-Dist: numpy (>=1.21.0,<2.0.0)
+ Requires-Dist: openai (>=1.0.0,<2.0.0)
+ Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+ Requires-Dist: requests (>=2.0.0,<3.0.0)
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+ Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+ Project-URL: Homepage, https://landing.ai
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
+ Project-URL: repository, https://github.com/landing-ai/vision-agent
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+ # 🔍🤖 Vision Agent
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ Vision Agent is a library that helps you utilize agent frameworks to generate code to
+ solve your vision task. Many current vision problems can easily take hours or days to
+ solve: you need to find the right model, figure out how to use it, and program it to
+ accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+ allowing users to describe their problem in text and have the agent framework generate
+ code to solve the task for them. Check out our Discord for updates and roadmaps!
+
+ ## Documentation
+
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+ ## Getting Started
+ ### Installation
+ To get started, you can install the library using pip:
+
+ ```bash
+ pip install vision-agent
+ ```
+
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
+ using Azure OpenAI, please see the Azure setup section):
+
+ ```bash
+ export OPENAI_API_KEY="your-api-key"
+ ```
+
+ ### Vision Agent
+ You can interact with the agent as you would with any LLM or LMM model:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgent
+ >>> agent = VisionAgent()
+ >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+ ```
+
+ Which produces the following code:
+ ```python
+ from vision_agent.tools import load_image, grounding_sam
+
+ def calculate_filled_percentage(image_path: str) -> float:
+     # Step 1: Load the image
+     image = load_image(image_path)
+
+     # Step 2: Segment the jar
+     jar_segments = grounding_sam(prompt="jar", image=image)
+
+     # Step 3: Segment the coffee beans
+     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+     # Step 4: Calculate the area of the segmented jar
+     jar_area = 0
+     for segment in jar_segments:
+         jar_area += segment['mask'].sum()
+
+     # Step 5: Calculate the area of the segmented coffee beans
+     coffee_beans_area = 0
+     for segment in coffee_beans_segments:
+         coffee_beans_area += segment['mask'].sum()
+
+     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+     if jar_area == 0:
+         return 0.0  # To avoid division by zero
+     filled_percentage = (coffee_beans_area / jar_area) * 100
+
+     # Step 7: Return the computed percentage
+     return filled_percentage
+ ```
+
+ To better understand how the model came up with its answer, you can run it in debug
+ mode by passing in the verbose argument:
+
+ ```python
+ >>> agent = VisionAgent(verbose=2)
+ ```
+
+ You can also have it return more information by calling `chat_with_workflow`:
+
+ ```python
+ >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+ >>> print(results)
+ {
+     "code": "from vision_agent.tools import ...",
+     "test": "calculate_filled_percentage('jar.jpg')",
+     "test_result": "...",
+     "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+     "working_memory": ...,
+ }
+ ```
+
+ With this you can examine more detailed information such as the testing code, testing
+ results, plan, or working memory it used to complete the task.
+
+ ### Tools
+ There are a variety of tools for the model or the user to use. Some are executed locally
+ while others are hosted for you. You can also ask an LLM directly to build a tool for
+ you. For example:
+
+ ```python
+ >>> import vision_agent as va
+ >>> llm = va.llm.OpenAILLM()
+ >>> detector = llm.generate_detector("Can you build a jar detector for me?")
+ >>> detector("jar.jpg")
+ [{"labels": ["jar",],
+   "scores": [0.99],
+   "bboxes": [
+     [0.58, 0.2, 0.72, 0.45],
+   ]
+ }]
+ ```
+
+ ### Azure Setup
+ If you want to use Azure OpenAI models, you can set the environment variables:
+
+ ```bash
+ export AZURE_OPENAI_API_KEY="your-api-key"
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ ```
+
+ You can then run Vision Agent using the Azure OpenAI models:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.VisionAgent(
+ >>> task_model=va.llm.AzureOpenAILLM(),
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
+ >>> )
+ ```
+
+
@@ -0,0 +1,141 @@
+ <div align="center">
+ <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+
+ # 🔍🤖 Vision Agent
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ Vision Agent is a library that helps you utilize agent frameworks to generate code to
+ solve your vision task. Many current vision problems can easily take hours or days to
+ solve: you need to find the right model, figure out how to use it, and program it to
+ accomplish the task you want. Vision Agent aims to provide an in-seconds experience by
+ allowing users to describe their problem in text and have the agent framework generate
+ code to solve the task for them. Check out our Discord for updates and roadmaps!
+
+ ## Documentation
+
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+
+ ## Getting Started
+ ### Installation
+ To get started, you can install the library using pip:
+
+ ```bash
+ pip install vision-agent
+ ```
+
+ Ensure you have an OpenAI API key and set it as an environment variable (if you are
+ using Azure OpenAI, please see the Azure setup section):
+
+ ```bash
+ export OPENAI_API_KEY="your-api-key"
+ ```
+
+ ### Vision Agent
+ You can interact with the agent as you would with any LLM or LMM model:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgent
+ >>> agent = VisionAgent()
+ >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+ ```
+
+ Which produces the following code:
+ ```python
+ from vision_agent.tools import load_image, grounding_sam
+
+ def calculate_filled_percentage(image_path: str) -> float:
+     # Step 1: Load the image
+     image = load_image(image_path)
+
+     # Step 2: Segment the jar
+     jar_segments = grounding_sam(prompt="jar", image=image)
+
+     # Step 3: Segment the coffee beans
+     coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image)
+
+     # Step 4: Calculate the area of the segmented jar
+     jar_area = 0
+     for segment in jar_segments:
+         jar_area += segment['mask'].sum()
+
+     # Step 5: Calculate the area of the segmented coffee beans
+     coffee_beans_area = 0
+     for segment in coffee_beans_segments:
+         coffee_beans_area += segment['mask'].sum()
+
+     # Step 6: Compute the percentage of the jar area that is filled with coffee beans
+     if jar_area == 0:
+         return 0.0  # To avoid division by zero
+     filled_percentage = (coffee_beans_area / jar_area) * 100
+
+     # Step 7: Return the computed percentage
+     return filled_percentage
+ ```
+
+ To better understand how the model came up with its answer, you can run it in debug
+ mode by passing in the verbose argument:
+
+ ```python
+ >>> agent = VisionAgent(verbose=2)
+ ```
+
+ You can also have it return more information by calling `chat_with_workflow`:
+
+ ```python
+ >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg")
+ >>> print(results)
+ {
+     "code": "from vision_agent.tools import ...",
+     "test": "calculate_filled_percentage('jar.jpg')",
+     "test_result": "...",
+     "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+     "working_memory": ...,
+ }
+ ```
+
+ With this you can examine more detailed information such as the testing code, testing
+ results, plan, or working memory it used to complete the task.
+
+ ### Tools
+ There are a variety of tools for the model or the user to use. Some are executed locally
+ while others are hosted for you. You can also ask an LLM directly to build a tool for
+ you. For example:
+
+ ```python
+ >>> import vision_agent as va
+ >>> llm = va.llm.OpenAILLM()
+ >>> detector = llm.generate_detector("Can you build a jar detector for me?")
+ >>> detector("jar.jpg")
+ [{"labels": ["jar",],
+   "scores": [0.99],
+   "bboxes": [
+     [0.58, 0.2, 0.72, 0.45],
+   ]
+ }]
+ ```
+
+ ### Azure Setup
+ If you want to use Azure OpenAI models, you can set the environment variables:
+
+ ```bash
+ export AZURE_OPENAI_API_KEY="your-api-key"
+ export AZURE_OPENAI_ENDPOINT="your-endpoint"
+ ```
+
+ You can then run Vision Agent using the Azure OpenAI models:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.VisionAgent(
+ >>> task_model=va.llm.AzureOpenAILLM(),
+ >>> answer_model=va.lmm.AzureOpenAILMM(),
+ >>> reflection_model=va.lmm.AzureOpenAILMM(),
+ >>> )
+ ```
+
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "vision-agent"
- version = "0.2.29"
+ version = "0.2.31"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -1,7 +1,7 @@
  from .agent import Agent
  from .agent_coder import AgentCoder
+ from .data_interpreter import DataInterpreter
  from .easytool import EasyTool
+ from .easytool_v2 import EasyToolV2
  from .reflexion import Reflexion
  from .vision_agent import VisionAgent
- from .vision_agent_v2 import VisionAgentV2
- from .vision_agent_v3 import VisionAgentV3
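For users upgrading from 0.2.29, the agent implementations are largely the same but several are re-exported under new names (matching the file renames in the list above). A minimal before/after import sketch, assuming nothing beyond the renames shown in this hunk:

```python
# 0.2.29 names (no longer exported in 0.2.31):
# from vision_agent.agent import VisionAgentV2, VisionAgentV3

# 0.2.31 names, per the hunk above:
from vision_agent.agent import DataInterpreter  # formerly VisionAgentV2
from vision_agent.agent import VisionAgent      # formerly VisionAgentV3
from vision_agent.agent import EasyToolV2       # formerly VisionAgent (now easytool_v2.py)
```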
@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from pathlib import Path
- from typing import Dict, List, Optional, Union, Any
+ from typing import Any, Dict, List, Optional, Union
 
 
  class Agent(ABC):
@@ -8,7 +8,7 @@ class Agent(ABC):
      def __call__(
          self,
          input: Union[List[Dict[str, str]], str],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
      ) -> str:
          pass
 
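The abstract `Agent.__call__` now takes `media` instead of `image`, and the concrete agents below follow the same rename. A hedged sketch of what calling code looks like after upgrading (the prompt text and file name are illustrative; the `media` keyword and the `VisionAgent` entry point come from this changeset's diffs and README):

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()

# 0.2.29 keyword (removed in 0.2.31):
# code = agent("How many jars are in the image?", image="jars.jpg")

# 0.2.31 keyword, matching the renamed parameter above:
code = agent("How many jars are in the image?", media="jars.jpg")
```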
@@ -3,7 +3,7 @@ import logging
  import os
  import sys
  from pathlib import Path
- from typing import Dict, List, Optional, Union, Any
+ from typing import Any, Dict, List, Optional, Union
 
  from rich.console import Console
  from rich.syntax import Syntax
@@ -18,7 +18,7 @@ from vision_agent.agent.agent_coder_prompts import (
  )
  from vision_agent.llm import LLM, OpenAILLM
  from vision_agent.lmm import LMM, OpenAILMM
- from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING
+ from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING
  from vision_agent.utils import Execute
 
  IMPORT_HELPER = """
@@ -38,7 +38,7 @@ import numpy as np
  import string
  from typing import *
  from collections import *
- from vision_agent.tools.tools_v2 import *
+ from vision_agent.tools import *
  """
  logging.basicConfig(stream=sys.stdout)
  _LOGGER = logging.getLogger(__name__)
@@ -150,20 +150,20 @@ class AgentCoder(Agent):
      def __call__(
          self,
          input: Union[List[Dict[str, str]], str],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
      ) -> str:
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
-         return self.chat(input, image)
+         return self.chat(input, media)
 
      def chat(
          self,
          input: List[Dict[str, str]],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
      ) -> str:
          question = input[0]["content"]
-         if image:
-             question += f" Input file path: {os.path.abspath(image)}"
+         if media:
+             question += f" Input file path: {os.path.abspath(media)}"
 
          code = ""
          feedback = ""
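As the hunks above show, AgentCoder now pulls tool docstrings and the wildcard tool import from `vision_agent.tools` rather than `vision_agent.tools.tools_v2` (the module was renamed to `tools.py`, and the old `tools.py` became `easytool_tools.py`; see items 30 and 33 in the file list). A hedged sketch of the corresponding change in user code, with tool names taken from the 0.2.31 README:

```python
# 0.2.29 import paths:
# from vision_agent.tools.tools_v2 import load_image, grounding_sam
# from vision_agent.tools import TOOLS

# 0.2.31 import paths:
from vision_agent.tools import load_image, grounding_sam
from vision_agent.tools.easytool_tools import TOOLS  # legacy EasyTool tool registry
```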
@@ -10,7 +10,7 @@ from rich.syntax import Syntax
  from tabulate import tabulate
 
  from vision_agent.agent import Agent
- from vision_agent.agent.vision_agent_v2_prompts import (
+ from vision_agent.agent.data_interpreter_prompts import (
      CODE,
      CODE_SYS_MSG,
      DEBUG,
@@ -25,7 +25,7 @@ from vision_agent.agent.vision_agent_v2_prompts import (
      USER_REQ_SUBTASK_WM_CONTEXT,
  )
  from vision_agent.llm import LLM, OpenAILLM
- from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
+ from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF
  from vision_agent.utils import Execute, Sim
 
  logging.basicConfig(level=logging.INFO)
@@ -331,11 +331,11 @@ def run_plan(
      return current_code, current_test, plan, working_memory
 
 
- class VisionAgentV2(Agent):
-     """Vision Agent is an AI agentic framework geared towards outputting Python code to
-     solve vision tasks. It is inspired by MetaGPT's Data Interpreter
-     https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
-     generate code:
+ class DataInterpreter(Agent):
+     """This version of Data Interpreter is an AI agentic framework geared towards
+     outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data
+     Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has
+     several key features to help it generate code:
 
      - A planner to generate a plan of tasks to solve a user requirement. The planner
      can output code tasks or test tasks, where test tasks are used to verify the code.
@@ -379,29 +379,29 @@ class VisionAgentV2(Agent):
      def __call__(
          self,
          input: Union[List[Dict[str, str]], str],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
          plan: Optional[List[Dict[str, Any]]] = None,
      ) -> str:
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
-         results = self.chat_with_workflow(input, image, plan)
+         results = self.chat_with_workflow(input, media, plan)
          return results["code"]  # type: ignore
 
      @traceable
      def chat_with_workflow(
          self,
          chat: List[Dict[str, str]],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
          plan: Optional[List[Dict[str, Any]]] = None,
      ) -> Dict[str, Any]:
          if len(chat) == 0:
              raise ValueError("Input cannot be empty.")
 
-         if image is not None:
+         if media is not None:
              # append file names to all user messages
              for chat_i in chat:
                  if chat_i["role"] == "user":
-                     chat_i["content"] += f" Image name {image}"
+                     chat_i["content"] += f" Image name {media}"
 
          working_code = ""
          if plan is not None:
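Since `VisionAgentV2` is now `DataInterpreter` and both entry points take `media` rather than `image`, upgraded code would look roughly like the sketch below. This is based only on the signatures visible in this hunk; constructor arguments are left at their defaults, which the diff does not show:

```python
from vision_agent.agent import DataInterpreter

agent = DataInterpreter()  # was VisionAgentV2() in 0.2.29

# __call__ and chat_with_workflow now take media= instead of image=
code = agent("Count the cars in the parking lot", media="lot.jpg")
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the cars in the parking lot"}],
    media="lot.jpg",
)
```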
@@ -74,15 +74,15 @@ CODE = """
 
  # Constraints
  - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
- - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+ - Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import.
  - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
- - Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file.
+ - Use the `save_json` function from `vision_agent.tools` to save your output as a json file.
  - Write clean, readable, and well-documented code.
 
  # Output
  While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. Output code in the following format:
  ```python
- from vision_agent.tools.tools_v2 imoprt *
+ from vision_agent.tools imoprt *
 
  # your code goes here
  ```
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
  from vision_agent.llm import LLM, OpenAILLM
  from vision_agent.lmm import LMM
- from vision_agent.tools import TOOLS
+ from vision_agent.tools.easytool_tools import TOOLS
 
  from .agent import Agent
  from .easytool_prompts import (
@@ -272,7 +272,7 @@ class EasyTool(Agent):
      def __call__(
          self,
          input: Union[List[Dict[str, str]], str],
-         image: Optional[Union[str, Path]] = None,
+         media: Optional[Union[str, Path]] = None,
      ) -> str:
          """Invoke the vision agent.
 
@@ -285,14 +285,14 @@ class EasyTool(Agent):
          """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
-         return self.chat(input, image=image)
+         return self.chat(input, media=media)
 
      def chat_with_workflow(
-         self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+         self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
      ) -> Tuple[str, List[Dict]]:
          question = chat[0]["content"]
-         if image:
-             question += f" Image name: {image}"
+         if media:
+             question += f" Image name: {media}"
          tasks = task_decompose(
              self.task_model,
              question,
@@ -340,7 +340,7 @@ class EasyTool(Agent):
          return answer_summarize(self.answer_model, question, answers), all_tool_results
 
      def chat(
-         self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+         self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None
      ) -> str:
-         answer, _ = self.chat_with_workflow(chat, image=image)
+         answer, _ = self.chat_with_workflow(chat, media=media)
          return answer
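EasyTool keeps its task-decomposition workflow but follows the same keyword rename. A sketch of the updated call, assuming the default constructor still works as it did in 0.2.29; the return shape (an answer string plus a list of tool results) comes from the `chat_with_workflow` signature above:

```python
from vision_agent.agent import EasyTool

agent = EasyTool()
answer, all_tool_results = agent.chat_with_workflow(
    [{"role": "user", "content": "How many jars are on the shelf?"}],
    media="shelf.jpg",  # was image= in 0.2.29
)
```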