vision-agent 0.2.229__tar.gz → 0.2.230__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent-0.2.230/PKG-INFO +156 -0
- vision_agent-0.2.230/README.md +110 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/pyproject.toml +1 -1
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/.sim_tools/df.csv +10 -8
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/agent_utils.py +10 -9
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent.py +3 -4
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_prompts.py +6 -6
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_v2.py +41 -26
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_prompts.py +6 -6
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -50
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_v2.py +10 -12
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_prompts.py +11 -11
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_prompts_v2.py +18 -3
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_v2.py +29 -30
- vision_agent-0.2.230/vision_agent/configs/__init__.py +1 -0
- vision_agent-0.2.230/vision_agent/configs/anthropic_config.py +150 -0
- vision_agent-0.2.230/vision_agent/configs/anthropic_openai_config.py +150 -0
- vision_agent-0.2.230/vision_agent/configs/config.py +150 -0
- vision_agent-0.2.230/vision_agent/configs/openai_config.py +160 -0
- vision_agent-0.2.230/vision_agent/lmm/__init__.py +2 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/lmm/lmm.py +63 -9
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/planner_tools.py +60 -40
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tools.py +10 -8
- vision_agent-0.2.229/PKG-INFO +0 -562
- vision_agent-0.2.229/README.md +0 -516
- vision_agent-0.2.229/vision_agent/lmm/__init__.py +0 -2
- {vision_agent-0.2.229 → vision_agent-0.2.230}/LICENSE +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/video.py +0 -0
- {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/video_tracking.py +0 -0
vision_agent-0.2.230/PKG-INFO
ADDED
@@ -0,0 +1,156 @@
+Metadata-Version: 2.1
+Name: vision-agent
+Version: 0.2.230
+Summary: Toolset for Vision Agent
+Author: Landing AI
+Author-email: dev@landing.ai
+Requires-Python: >=3.9,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+Requires-Dist: av (>=11.0.0,<12.0.0)
+Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
+Requires-Dist: e2b-code-interpreter (==0.0.11a37)
+Requires-Dist: flake8 (>=7.0.0,<8.0.0)
+Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+Requires-Dist: libcst (>=1.5.0,<2.0.0)
+Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+Requires-Dist: numpy (>=1.21.0,<2.0.0)
+Requires-Dist: openai (>=1.0.0,<2.0.0)
+Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
+Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
+Requires-Dist: pandas (>=2.0.0,<3.0.0)
+Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
+Requires-Dist: pydantic (==2.7.4)
+Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: pytube (==15.0.0)
+Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
+Requires-Dist: scipy (>=1.13.0,<1.14.0)
+Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: tenacity (>=8.3.0,<9.0.0)
+Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+Project-URL: Homepage, https://landing.ai
+Project-URL: documentation, https://github.com/landing-ai/vision-agent
+Project-URL: repository, https://github.com/landing-ai/vision-agent
+Description-Content-Type: text/markdown
+
+<div align="center">
+<picture>
+<source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+<source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
+<img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+</picture>
+
+[](https://discord.gg/wPdN8RCYew)
+
+[](https://badge.fury.io/py/vision-agent)
+
+</div>
+
+## VisionAgent
+VisionAgent is a library that helps you utilize agent frameworks to generate code to
+solve your vision task. Check out our discord for updates and roadmaps! The fastest
+way to test out VisionAgent is to use our web application which you can find [here](https://va.landing.ai/).
+
+## Installation
+```bash
+pip install vision-agent
+```
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
+```
+
+---
+**NOTE**
+We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
+for VisionAgent. If you want to use a different LLM provider or only one, see
+'Using Other LLM Providers' below.
+---
+
+## Documentation
+
+[VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+## Examples
+### Counting cans in an image
+You can run VisionAgent in a local Jupyter Notebook [Counting cans in an image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
+
+### Generating code
+You can use VisionAgent to generate code to count the number of people in an image:
+```python
+from vision_agent.agent import VisionAgentCoderV2
+from vision_agent.agent.types import AgentMessage
+
+agent = VisionAgentCoderV2(verbose=True)
+code_context = agent.generate_code(
+    [
+        AgentMessage(
+            role="user",
+            content="Count the number of people in this image",
+            media=["people.png"]
+        )
+    ]
+)
+
+with open("generated_code.py", "w") as f:
+    f.write(code_context.code + "\n" + code_context.test)
+```
+
+### Using the tools directly
+VisionAgent produces code that utilizes our tools. You can also use the tools directly.
+For example if you wanted to detect people in an image and visualize the results:
+```python
+import vision_agent.tools as T
+import matplotlib.pyplot as plt
+
+image = T.load_image("people.png")
+dets = T.countgd_object_detection("person", image)
+# visualize the countgd bounding boxes on the image
+viz = T.overlay_bounding_boxes(image, dets)
+
+# save the visualization to a file
+T.save_image(viz, "people_detected.png")
+
+# display the visualization
+plt.imshow(viz)
+plt.show()
+```
+
+You can also use the tools for running on video files:
+```python
+import vision_agent.tools as T
+
+frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
+# extract the frames from the frames_and_ts list
+frames = [f["frame"] for f in frames_and_ts]
+
+# run the countgd tracking on the frames
+tracks = T.countgd_sam2_video_tracking("person", frames)
+# visualize the countgd tracking results on the frames and save the video
+viz = T.overlay_segmentation_masks(frames, tracks)
+T.save_video(viz, "people_detected.mp4")
+```
+
+## Using Other LLM Providers
+You can use other LLM providers by changing `config.py` in the `vision_agent/configs`
+directory. For example to change to Anthropic simply just run:
+```bash
+cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
+```
+
+**NOTE**
+VisionAgent moves fast and we are constantly updating and changing the library. If you
+have any questions or need help, please reach out to us on our discord channel.
+---
+
vision_agent-0.2.230/README.md
ADDED
@@ -0,0 +1,110 @@
The 110 added lines are identical to the Markdown description embedded in the PKG-INFO above (everything from `<div align="center">` through the closing `---`).
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/.sim_tools/df.csv
RENAMED
@@ -244,7 +244,8 @@ desc,doc,name
     1.0.

     Parameters:
-        prompt (str): The prompt to ground to the image.
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to ground the prompt to.
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.
@@ -281,7 +282,8 @@ desc,doc,name
     is useful for tracking and counting without duplicating counts.

     Parameters:
-        prompt (str): The prompt to ground to the
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
@@ -317,14 +319,14 @@ desc,doc,name
     ]
 ",florence2_sam2_video_tracking
 "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
-    'florence2_object_detection' is a tool that can detect multiple
-
-
-
-    confidence scores of 1.0.
+    'florence2_object_detection' is a tool that can detect multiple objects given a
+    text prompt which can be object names or caption. You can optionally separate the
+    object names in the text with commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated confidence scores of 1.0.

     Parameters:
-        prompt (str): The prompt to ground to the image.
+        prompt (str): The prompt to ground to the image. Use exclusive categories that
+            do not overlap such as 'person, car' and NOT 'person, athlete'.
         image (np.ndarray): The image to used to detect objects
         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
             fine-tuned model ID here to use it.
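The docstring additions above codify a prompting rule rather than a code change. A minimal sketch of the recommended prompt style using the tools from the README (the image path is a hypothetical placeholder):

```python
import vision_agent.tools as T

image = T.load_image("street.png")  # hypothetical example image

# Recommended: exclusive categories that do not overlap, so each object maps to one label.
dets = T.countgd_object_detection("person, car", image)

# Discouraged by the updated docstrings: overlapping categories such as
# "person, athlete", where the same object can come back under both labels.
print([d["label"] for d in dets])
```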
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/agent_utils.py
RENAMED
@@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str:
     chat = copy.deepcopy(chat)
     prompt = ""
     for chat_i in chat:
-        if chat_i.role == "user":
-
-
-
+        if chat_i.role == "user" or chat_i.role == "coder":
+            if "<final_code>" in chat_i.role:
+                prompt += f"OBSERVATION: {chat_i.content}\n\n"
+            elif chat_i.role == "user":
+                prompt += f"USER: {chat_i.content}\n\n"
         elif chat_i.role == "conversation":
             prompt += f"AGENT: {chat_i.content}\n\n"
     return prompt
@@ -332,26 +333,26 @@ def strip_function_calls( # noqa: C901
    def __init__(self, exclusions: List[str]):
        # Store exclusions to skip removing certain function calls
        self.exclusions = exclusions
-        self.in_function_or_class =
+        self.in_function_or_class: List[bool] = []

    def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
-        self.in_function_or_class
+        self.in_function_or_class.append(True)
        return True

    def leave_FunctionDef(
        self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
    ) -> cst.BaseStatement:
-        self.in_function_or_class
+        self.in_function_or_class.pop()
        return updated_node

    def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
-        self.in_function_or_class
+        self.in_function_or_class.append(True)
        return True

    def leave_ClassDef(
        self, node: cst.ClassDef, updated_node: cst.ClassDef
    ) -> cst.BaseStatement:
-        self.in_function_or_class
+        self.in_function_or_class.pop()
        return updated_node

    def leave_Expr(
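The `strip_function_calls` fix above turns `in_function_or_class` into an explicit stack: push on entering a function or class, pop on leaving, so the transformer knows whether it is currently at module level. A minimal, self-contained sketch of the same pattern with libcst (not the package's actual transformer):

```python
from typing import List, Optional

import libcst as cst


class TopLevelCallCollector(cst.CSTTransformer):
    """Record calls made at module level by tracking function/class nesting."""

    def __init__(self) -> None:
        self.in_function_or_class: List[bool] = []
        self.top_level_calls: List[str] = []

    def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
        self.in_function_or_class.append(True)  # entering a nested scope
        return True

    def leave_FunctionDef(
        self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
    ) -> cst.BaseStatement:
        self.in_function_or_class.pop()  # leaving the nested scope
        return updated_node

    def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
        self.in_function_or_class.append(True)
        return True

    def leave_ClassDef(
        self, node: cst.ClassDef, updated_node: cst.ClassDef
    ) -> cst.BaseStatement:
        self.in_function_or_class.pop()
        return updated_node

    def leave_Call(self, original_node: cst.Call, updated_node: cst.Call) -> cst.Call:
        # Only record calls when the stack is empty, i.e. at module level.
        if not self.in_function_or_class:
            self.top_level_calls.append(cst.Module(body=[]).code_for_node(updated_node))
        return updated_node


collector = TopLevelCallCollector()
module = cst.parse_module("def f():\n    g()\n\nf()\n")
module.visit(collector)
print(collector.top_level_calls)  # ['f()']
```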
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent.py
RENAMED
@@ -291,10 +291,9 @@ class VisionAgent(Agent):
             verbosity (int): The verbosity level of the agent.
             callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                 function to send intermediate update messages.
-
-
-
-                object is provided it will use that.
+            code_sandbox_runtime (Optional[str]): For string values it can be one of:
+                None, "local" or "e2b". If None, it will read from the environment
+                variable "CODE_SANDBOX_RUNTIME".
         """

         self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
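The rewritten docstring describes the `code_sandbox_runtime` fallback behavior. A short usage sketch, assuming `VisionAgent` is importable from `vision_agent.agent` the same way the README imports `VisionAgentCoderV2`:

```python
import os

from vision_agent.agent import VisionAgent

# Option 1: pass the runtime explicitly ("local" or "e2b").
agent = VisionAgent(code_sandbox_runtime="local")

# Option 2: leave code_sandbox_runtime as None and configure it via the
# CODE_SANDBOX_RUNTIME environment variable, as documented above.
os.environ["CODE_SANDBOX_RUNTIME"] = "e2b"
agent = VisionAgent()
```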
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -44,22 +44,22 @@ Can you write a program to check if each person is wearing a helmet? First detec

 ## Subtasks

-This plan uses the
--Use
+This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
+-Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
 -Process the detections to match helmets with people based on bounding box proximity
 -Count people with and without helmets based on the matching results
 -Return a dictionary with the counts


 **Tool Tests and Outputs**:
-After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using
+After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.

 **Tool Output Thoughts**:
 ```python
 ...
 ```
 ----- stdout -----
-Plan 1 -
+Plan 1 - owlv2_object_detection:

 [{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]

@@ -67,12 +67,12 @@ Plan 1 - owl_v2_image:

 **Input Code Snippet**:
 ```python
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, owlv2_object_detection

 def check_helmets(image_path):
     image = load_image(image_path)
     # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
-    detections =
+    detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
     height, width = image.shape[:2]

     # Separate people and helmets
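The plan's only non-tool step is associating helmets with people by bounding-box proximity. A rough sketch of one such matching rule over the detection format shown in the stdout output above (the prompt's real `check_helmets` may use a different heuristic):

```python
def count_helmet_wearers(detections):
    """Count people with and without helmets from mixed person/helmet detections."""
    people = [d for d in detections if d["label"] == "person"]
    helmets = [d for d in detections if d["label"] == "helmet"]

    def center(bbox):
        x1, y1, x2, y2 = bbox
        return (x1 + x2) / 2, (y1 + y2) / 2

    with_helmet = 0
    for person in people:
        px1, py1, px2, py2 = person["bbox"]
        # A helmet is assigned to a person if its center lies inside (or just above)
        # the person's box; a simple proximity rule, not the prompt's exact logic.
        if any(
            px1 <= cx <= px2 and py1 - 0.05 <= cy <= py2
            for cx, cy in (center(h["bbox"]) for h in helmets)
        ):
            with_helmet += 1

    return {"with_helmet": with_helmet, "without_helmet": len(people) - with_helmet}


# Example with detections taken from the owlv2_object_detection output shown above
# (normalized [x1, y1, x2, y2] coordinates).
dets = [
    {"label": "helmet", "score": 0.3, "bbox": [0.8, 0.43, 0.81, 0.46]},
    {"label": "person", "score": 0.31, "bbox": [0.78, 0.43, 0.82, 0.57]},
    {"label": "person", "score": 0.3, "bbox": [0.1, 0.82, 0.24, 1.0]},
]
print(count_helmet_wearers(dets))  # {'with_helmet': 1, 'without_helmet': 1}
```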
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_v2.py
RENAMED
@@ -26,7 +26,8 @@ from vision_agent.agent.types import (
 )
 from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
 from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
-from vision_agent.
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM
 from vision_agent.lmm.types import Message
 from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils.execute import (
@@ -36,6 +37,7 @@ from vision_agent.utils.execute import (
 )
 from vision_agent.utils.sim import Sim, get_tool_recommender

+CONFIG = Config()
 _CONSOLE = Console()


@@ -185,23 +187,17 @@ def debug_code(
     return code, test, debug_info


-def
-    coder: LMM,
+def test_code(
     tester: LMM,
     debugger: LMM,
     chat: List[AgentMessage],
     plan: str,
+    code: str,
     tool_docs: str,
     code_interpreter: CodeInterpreter,
     media_list: List[Union[str, Path]],
     verbose: bool,
 ) -> CodeContext:
-    code = write_code(
-        coder=coder,
-        chat=chat,
-        tool_docs=tool_docs,
-        plan=plan,
-    )
     try:
         code = strip_function_calls(code)
     except Exception:
@@ -257,6 +253,36 @@ def write_and_test_code(
     )


+def write_and_test_code(
+    coder: LMM,
+    tester: LMM,
+    debugger: LMM,
+    chat: List[AgentMessage],
+    plan: str,
+    tool_docs: str,
+    code_interpreter: CodeInterpreter,
+    media_list: List[Union[str, Path]],
+    verbose: bool,
+) -> CodeContext:
+    code = write_code(
+        coder=coder,
+        chat=chat,
+        tool_docs=tool_docs,
+        plan=plan,
+    )
+    return test_code(
+        tester,
+        debugger,
+        chat,
+        plan,
+        code,
+        tool_docs,
+        code_interpreter,
+        media_list,
+        verbose,
+    )
+
+
 class VisionAgentCoderV2(AgentCoder):
     """VisionAgentCoderV2 is an agent that will write vision code for you."""

@@ -300,21 +326,9 @@ class VisionAgentCoderV2(AgentCoder):
             )
         )

-        self.coder = (
-
-
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
-        self.tester = (
-            tester
-            if tester is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
-        self.debugger = (
-            debugger
-            if debugger is not None
-            else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-        )
+        self.coder = coder if coder is not None else CONFIG.create_coder()
+        self.tester = tester if tester is not None else CONFIG.create_tester()
+        self.debugger = debugger if debugger is not None else CONFIG.create_debugger()
         if tool_recommender is not None:
             if isinstance(tool_recommender, str):
                 self.tool_recommender = Sim.load(tool_recommender)
@@ -440,12 +454,13 @@ class VisionAgentCoderV2(AgentCoder):
         ) as code_interpreter:
             int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
             tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
-
-
+
+            code_context = test_code(
                 tester=self.tester,
                 debugger=self.debugger,
                 chat=int_chat,
                 plan=format_plan_v2(plan_context),
+                code=plan_context.code,
                 tool_docs=tool_docs,
                 code_interpreter=code_interpreter,
                 media_list=media_list,
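With this change, VisionAgentCoderV2 gets its default coder, tester and debugger from the new `vision_agent.configs.Config` object instead of hard-coded `AnthropicLMM` instances, while explicit LMMs can still be passed in. An illustrative sketch of both construction paths (the keyword arguments mirror the attributes set in `__init__` above, and the `AnthropicLMM` import path from `vision_agent.lmm` is an assumption):

```python
from vision_agent.agent import VisionAgentCoderV2
from vision_agent.lmm import AnthropicLMM

# Defaults: coder/tester/debugger come from vision_agent.configs.Config
# (CONFIG.create_coder(), CONFIG.create_tester(), CONFIG.create_debugger()).
agent = VisionAgentCoderV2(verbose=True)

# Explicit override: pass your own LMM instances, the same model the removed
# hard-coded defaults used to construct.
claude = AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
agent = VisionAgentCoderV2(coder=claude, tester=claude, debugger=claude)
```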
{vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
@@ -55,27 +55,27 @@ This is the documentation for the functions you have access to. You may call any
 --- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

 ```python
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
 image = load_image("image.jpg")
-owl_v2_out =
+owl_v2_out = owlv2_object_detection("person", image)

-f2s2_out =
+f2s2_out = florence2_sam2_instance_segmentation("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

 cgd_out = countgd_object_detection("person", image)

-final_out = {{"
+final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---

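For context, the prompt's example gathers all three tool outputs into `final_out` so the planner can compare candidate tools on the same image. A small follow-up sketch (not part of the prompt file) comparing them by detection count:

```python
import vision_agent.tools as T

image = T.load_image("image.jpg")
final_out = {
    "owlv2_object_detection": T.owlv2_object_detection("person", image),
    "florence2_sam2_instance_segmentation": T.florence2_sam2_instance_segmentation("person", image),
    "countgd_object_detection": T.countgd_object_detection("person", image),
}
# Number of 'person' detections returned by each candidate tool.
print({name: len(dets) for name, dets in final_out.items()})
```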