vision-agent 1.0.4__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-1.0.4 → vision_agent-1.0.7}/PKG-INFO +31 -3
- {vision_agent-1.0.4 → vision_agent-1.0.7}/README.md +30 -2
- {vision_agent-1.0.4 → vision_agent-1.0.7}/pyproject.toml +1 -1
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/.sim_tools/df.csv +46 -47
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent-1.0.7/vision_agent/agent/__init__.py +4 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_planner_v2.py +3 -2
- vision_agent-1.0.4/vision_agent/configs/anthropic_openai_config.py → vision_agent-1.0.7/vision_agent/configs/anthropic_config.py +12 -13
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/config.py +14 -15
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/openai_config.py +10 -10
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/lmm/lmm.py +2 -2
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/__init__.py +0 -6
- vision_agent-1.0.7/vision_agent/tools/meta_tools.py +200 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/planner_tools.py +13 -14
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/tools.py +16 -27
- vision_agent-1.0.4/vision_agent/agent/__init__.py +0 -20
- vision_agent-1.0.4/vision_agent/agent/vision_agent.py +0 -605
- vision_agent-1.0.4/vision_agent/agent/vision_agent_coder.py +0 -742
- vision_agent-1.0.4/vision_agent/agent/vision_agent_coder_prompts.py +0 -290
- vision_agent-1.0.4/vision_agent/agent/vision_agent_planner.py +0 -564
- vision_agent-1.0.4/vision_agent/agent/vision_agent_planner_prompts.py +0 -199
- vision_agent-1.0.4/vision_agent/agent/vision_agent_prompts.py +0 -312
- vision_agent-1.0.4/vision_agent/configs/anthropic_config.py +0 -150
- vision_agent-1.0.4/vision_agent/tools/meta_tools.py +0 -691
- {vision_agent-1.0.4 → vision_agent-1.0.7}/LICENSE +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/README.md +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/agent.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/clients/http.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/agent_types.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/lmm_types.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/tools_types.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/sim/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/sim/sim.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/agent.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/execute.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/tools.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/tools_doc.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/video.py +0 -0
- {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.7
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -65,10 +65,10 @@ pip install vision-agent
|
|
65
65
|
|
66
66
|
```bash
|
67
67
|
export ANTHROPIC_API_KEY="your-api-key"
|
68
|
-
export
|
68
|
+
export GEMINI_API_KEY="your-api-key"
|
69
69
|
```
|
70
70
|
|
71
|
-
> **_NOTE:_** We found using both Anthropic Claude-3.
|
71
|
+
> **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
|
72
72
|
|
73
73
|
You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
|
74
74
|
|
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
|
|
147
147
|
cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
|
148
148
|
```
|
149
149
|
|
150
|
+
You can also modify the existing `config.py` file yourself to use a different LLM
|
151
|
+
provider, for example if you wanted to change the planner from Anthropic inside
|
152
|
+
`config.py` to OpenAI you would replace this code:
|
153
|
+
```python
|
154
|
+
planner: Type[LMM] = Field(default=AnthropicLMM)
|
155
|
+
planner_kwargs: dict = Field(
|
156
|
+
default_factory=lambda: {
|
157
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
158
|
+
"temperature": 0.0,
|
159
|
+
"image_size": 768,
|
160
|
+
}
|
161
|
+
)
|
162
|
+
```
|
163
|
+
|
164
|
+
with this code:
|
165
|
+
|
166
|
+
```python
|
167
|
+
planner: Type[LMM] = Field(default=OpenAILMM)
|
168
|
+
planner_kwargs: dict = Field(
|
169
|
+
default_factory=lambda: {
|
170
|
+
"model_name": "gpt-4o-2024-11-20",
|
171
|
+
"temperature": 0.0,
|
172
|
+
"image_size": 768,
|
173
|
+
"image_detail": "low",
|
174
|
+
}
|
175
|
+
)
|
176
|
+
```
|
177
|
+
|
150
178
|
> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
|
151
179
|
|
@@ -23,10 +23,10 @@ pip install vision-agent
|
|
23
23
|
|
24
24
|
```bash
|
25
25
|
export ANTHROPIC_API_KEY="your-api-key"
|
26
|
-
export
|
26
|
+
export GEMINI_API_KEY="your-api-key"
|
27
27
|
```
|
28
28
|
|
29
|
-
> **_NOTE:_** We found using both Anthropic Claude-3.
|
29
|
+
> **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
|
30
30
|
|
31
31
|
You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
|
32
32
|
|
@@ -105,4 +105,32 @@ directory. For example to change to Anthropic simply just run:
|
|
105
105
|
cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
|
106
106
|
```
|
107
107
|
|
108
|
+
You can also modify the existing `config.py` file yourself to use a different LLM
|
109
|
+
provider, for example if you wanted to change the planner from Anthropic inside
|
110
|
+
`config.py` to OpenAI you would replace this code:
|
111
|
+
```python
|
112
|
+
planner: Type[LMM] = Field(default=AnthropicLMM)
|
113
|
+
planner_kwargs: dict = Field(
|
114
|
+
default_factory=lambda: {
|
115
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
116
|
+
"temperature": 0.0,
|
117
|
+
"image_size": 768,
|
118
|
+
}
|
119
|
+
)
|
120
|
+
```
|
121
|
+
|
122
|
+
with this code:
|
123
|
+
|
124
|
+
```python
|
125
|
+
planner: Type[LMM] = Field(default=OpenAILMM)
|
126
|
+
planner_kwargs: dict = Field(
|
127
|
+
default_factory=lambda: {
|
128
|
+
"model_name": "gpt-4o-2024-11-20",
|
129
|
+
"temperature": 0.0,
|
130
|
+
"image_size": 768,
|
131
|
+
"image_detail": "low",
|
132
|
+
}
|
133
|
+
)
|
134
|
+
```
|
135
|
+
|
108
136
|
> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
|
@@ -1,15 +1,15 @@
|
|
1
1
|
desc,doc,name
|
2
|
-
"'
|
3
|
-
'
|
4
|
-
|
5
|
-
|
6
|
-
boxes with normalized coordinates, label names and
|
2
|
+
"'glee_object_detection' is a tool that can detect multiple objects given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","glee_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
3
|
+
'glee_object_detection' is a tool that can detect multiple objects given a
|
4
|
+
text prompt such as object names or referring expressions on images. It's
|
5
|
+
particularly good at detecting specific objects given detailed descriptive prompts.
|
6
|
+
It returns a list of bounding boxes with normalized coordinates, label names and
|
7
|
+
associated probability scores.
|
7
8
|
|
8
9
|
Parameters:
|
9
|
-
prompt (str): The prompt to ground to the image
|
10
|
+
prompt (str): The prompt to ground to the image, only supports a single prompt
|
11
|
+
with no commas or periods.
|
10
12
|
image (np.ndarray): The image to ground the prompt to.
|
11
|
-
box_threshold (float, optional): The threshold for the box detection. Defaults
|
12
|
-
to 0.10.
|
13
13
|
|
14
14
|
Returns:
|
15
15
|
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
@@ -20,24 +20,23 @@ desc,doc,name
|
|
20
20
|
|
21
21
|
Example
|
22
22
|
-------
|
23
|
-
>>>
|
23
|
+
>>> glee_object_detection(""person holding a box"", image)
|
24
24
|
[
|
25
|
-
{'score': 0.99, 'label': '
|
26
|
-
{'score': 0.98, 'label': '
|
25
|
+
{'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
26
|
+
{'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
27
27
|
]
|
28
|
-
",
|
29
|
-
"'
|
30
|
-
'
|
31
|
-
instances
|
32
|
-
|
33
|
-
returns a list of bounding boxes with normalized
|
34
|
-
and associated probability scores.
|
28
|
+
",glee_object_detection
|
29
|
+
"'glee_sam2_instance_segmentation' is a tool that can detect multiple instances given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","glee_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
30
|
+
'glee_sam2_instance_segmentation' is a tool that can detect multiple
|
31
|
+
instances given a text prompt such as object names or referring expressions on
|
32
|
+
images. It's particularly good at detecting specific objects given detailed
|
33
|
+
descriptive prompts. It returns a list of bounding boxes with normalized
|
34
|
+
coordinates, label names, masks and associated probability scores.
|
35
35
|
|
36
36
|
Parameters:
|
37
|
-
prompt (str): The object that needs to be counted
|
37
|
+
prompt (str): The object that needs to be counted, only supports a single
|
38
|
+
prompt with no commas or periods.
|
38
39
|
image (np.ndarray): The image that contains multiple instances of the object.
|
39
|
-
box_threshold (float, optional): The threshold for detection. Defaults
|
40
|
-
to 0.10.
|
41
40
|
|
42
41
|
Returns:
|
43
42
|
List[Dict[str, Any]]: A list of dictionaries containing the score, label,
|
@@ -49,11 +48,11 @@ desc,doc,name
|
|
49
48
|
|
50
49
|
Example
|
51
50
|
-------
|
52
|
-
>>>
|
51
|
+
>>> glee_sam2_instance_segmentation(""a large blue flower"", image)
|
53
52
|
[
|
54
53
|
{
|
55
54
|
'score': 0.49,
|
56
|
-
'label': 'flower',
|
55
|
+
'label': 'a large blue flower',
|
57
56
|
'bbox': [0.1, 0.11, 0.35, 0.4],
|
58
57
|
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
59
58
|
[0, 0, 0, ..., 0, 0, 0],
|
@@ -62,21 +61,21 @@ desc,doc,name
|
|
62
61
|
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
|
63
62
|
},
|
64
63
|
]
|
65
|
-
",
|
66
|
-
"'
|
67
|
-
'
|
68
|
-
objects in a video given a text prompt such as
|
69
|
-
expressions.
|
70
|
-
a list of bounding boxes, label names, masks and
|
71
|
-
is useful for tracking and counting without
|
64
|
+
",glee_sam2_instance_segmentation
|
65
|
+
"'glee_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as object names or referring expressions. It's particularly good at detecting specific objects given detailed descriptive prompts and returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","glee_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
|
66
|
+
'glee_sam2_video_tracking' is a tool that can track and segment multiple
|
67
|
+
objects in a video given a text prompt such as object names or referring
|
68
|
+
expressions. It's particularly good at detecting specific objects given detailed
|
69
|
+
descriptive prompts and returns a list of bounding boxes, label names, masks and
|
70
|
+
associated probability scores and is useful for tracking and counting without
|
71
|
+
duplicating counts.
|
72
72
|
|
73
73
|
Parameters:
|
74
|
-
prompt (str): The prompt to ground to the image
|
74
|
+
prompt (str): The prompt to ground to the image, only supports a single prompt
|
75
|
+
with no commas or periods.
|
75
76
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
76
|
-
|
77
|
-
to
|
78
|
-
chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
|
79
|
-
new objects.
|
77
|
+
chunk_length (Optional[int]): The number of frames to re-run agentic object detection
|
78
|
+
to find new objects.
|
80
79
|
|
81
80
|
Returns:
|
82
81
|
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
|
@@ -90,11 +89,11 @@ desc,doc,name
|
|
90
89
|
|
91
90
|
Example
|
92
91
|
-------
|
93
|
-
>>>
|
92
|
+
>>> glee_sam2_video_tracking(""a runner with yellow shoes"", frames)
|
94
93
|
[
|
95
94
|
[
|
96
95
|
{
|
97
|
-
'label': '0:
|
96
|
+
'label': '0: a runner with yellow shoes',
|
98
97
|
'bbox': [0.1, 0.11, 0.35, 0.4],
|
99
98
|
'mask': array([[0, 0, 0, ..., 0, 0, 0],
|
100
99
|
[0, 0, 0, ..., 0, 0, 0],
|
@@ -105,7 +104,7 @@ desc,doc,name
|
|
105
104
|
],
|
106
105
|
...
|
107
106
|
]
|
108
|
-
",
|
107
|
+
",glee_sam2_video_tracking
|
109
108
|
"'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
110
109
|
'countgd_object_detection' is a tool that can detect multiple instances of an
|
111
110
|
object given a text prompt. It is particularly useful when trying to detect and
|
@@ -417,8 +416,8 @@ desc,doc,name
|
|
417
416
|
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
418
417
|
]
|
419
418
|
",ocr
|
420
|
-
'
|
421
|
-
'
|
419
|
+
'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
|
420
|
+
'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
|
422
421
|
images including regular images or images of documents or presentations. It can be
|
423
422
|
very useful for document QA or OCR text extraction. It returns text as an answer to
|
424
423
|
the question.
|
@@ -432,11 +431,11 @@ desc,doc,name
|
|
432
431
|
|
433
432
|
Example
|
434
433
|
-------
|
435
|
-
>>>
|
434
|
+
>>> qwen25_vl_images_vqa('Give a summary of the document', images)
|
436
435
|
'The document talks about the history of the United States of America and its...'
|
437
|
-
",
|
438
|
-
'
|
439
|
-
'
|
436
|
+
",qwen25_vl_images_vqa
|
437
|
+
'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen25_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
|
438
|
+
'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
|
440
439
|
including regular videos or videos of documents or presentations. It returns text
|
441
440
|
as an answer to the question.
|
442
441
|
|
@@ -449,10 +448,10 @@ desc,doc,name
|
|
449
448
|
|
450
449
|
Example
|
451
450
|
-------
|
452
|
-
>>>
|
451
|
+
>>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
|
453
452
|
'Lionel Messi'
|
454
|
-
",
|
455
|
-
'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = '
|
453
|
+
",qwen25_vl_video_vqa
|
454
|
+
'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
|
456
455
|
'activity_recognition' is a tool that can recognize activities in a video given a
|
457
456
|
text prompt. It can be used to identify where specific activities or actions
|
458
457
|
happen in a video and returns a list of 0s and 1s to indicate the activity.
|
Binary file
|
{vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
@@ -9,21 +9,21 @@ PLAN = """
|
|
9
9
|
**Example Planning**: Here are some examples of how you can search for a plan, in the examples the user output is denoted by USER, your output is denoted by AGENT and the observations after your code execution are denoted by OBSERVATION:
|
10
10
|
{examples}
|
11
11
|
|
12
|
-
**Current Planning**:
|
12
|
+
**Current Planning**: Below are your previous thoughts, code and observations from the planning process:
|
13
13
|
--- START CURRENT PLANNING ---
|
14
14
|
{planning}
|
15
15
|
--- END CURRENT PLANNING ---
|
16
16
|
|
17
17
|
**Instructions**:
|
18
|
-
1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You
|
18
|
+
1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You have <count> number of turns to complete the user's request.
|
19
19
|
2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
|
20
|
-
3.
|
20
|
+
3. Output <finalize_plan> when you have written your final code, you do not need to wait until all turns are used to do this. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION.
|
21
21
|
4. Only load/save files from {media_list} unless you specifically saved the file previously.
|
22
|
-
5. Ensure you always call `suggestion` and `
|
22
|
+
5. Ensure you always call `suggestion` and `vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
|
23
23
|
6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
|
24
24
|
7. Be sure to print results returned for tools so you can see the output.
|
25
25
|
8. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
|
26
|
-
9. DO NOT over index on
|
26
|
+
9. DO NOT over index on the vqa model. If tool output is close to vqa's output do not improve the tool output. If you have >10 detected objects, or the objects are very small, trust the tool output over the vqa model.
|
27
27
|
10. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
|
28
28
|
|
29
29
|
<thinking>Your thought process...</thinking>
|
@@ -39,13 +39,13 @@ USER: Count the number of pedestrians in the image.
|
|
39
39
|
AGENT: <thinking>I need to gather more information, I will ask for a description of the image and a solution to the problem to work towards as well as a suggestion on how best to solve it.</thinking>
|
40
40
|
<execute_python>
|
41
41
|
image = load_image('drone.jpg')
|
42
|
-
|
42
|
+
vqa('Can you describe this image? How many pedestrians do you count in the image?', [image])
|
43
43
|
suggestion('How can I count the number of pedestrians in the image?', [image])
|
44
44
|
</execute_python>
|
45
45
|
|
46
|
-
OBSERVATION: [
|
46
|
+
OBSERVATION: [vqa output]
|
47
47
|
From this aerial view of a busy urban street, it's difficult to clearly see or count individual pedestrians. The image shows a bird's eye view of a city intersection with multiple lanes of traffic, parked cars, sidewalks, and some green spaces. While there may be people in cars or on the sidewalks, they are not distinctly visible from this height and perspective. The focus of the image is more on the urban infrastructure, traffic patterns, and overall city layout rather than on individuals.
|
48
|
-
[end of
|
48
|
+
[end of vqa output]
|
49
49
|
|
50
50
|
[suggestions]
|
51
51
|
[suggestion 0]
|
@@ -161,7 +161,7 @@ detections = merge_bounding_box_list(detection_from_subdivided_images)
|
|
161
161
|
[end of suggestion]
|
162
162
|
<count>9</count>
|
163
163
|
|
164
|
-
AGENT: <thinking>Following the suggested plan, I will subdivide the image into smaller parts and find the right tool on the smaller parts. I will also re-check with
|
164
|
+
AGENT: <thinking>Following the suggested plan, I will subdivide the image into smaller parts and find the right tool on the smaller parts. I will also re-check with vqa on the subdivided images.</thinking>
|
165
165
|
<execute_python>
|
166
166
|
def subdivide_image(image):
|
167
167
|
height, width, _ = image.shape
|
@@ -176,14 +176,14 @@ def subdivide_image(image):
|
|
176
176
|
return [top_left, top_right, bottom_left, bottom_right]
|
177
177
|
|
178
178
|
images = subdivide_image(image)
|
179
|
-
|
179
|
+
vqa('How many pedestrians do you see in these images?', images)
|
180
180
|
get_tool_for_task('Count the number of pedestrians in the images', images)
|
181
181
|
</execute_python>
|
182
182
|
|
183
183
|
OBSERVATION:
|
184
|
-
[
|
184
|
+
[vqa_output]
|
185
185
|
In these aerial images, I can see approximately 5-6 pedestrians walking in various locations, including near the retail signage area and along the sidewalks.
|
186
|
-
[end of
|
186
|
+
[end of vqa_output]
|
187
187
|
|
188
188
|
[get_tool_for_task output]
|
189
189
|
After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool with the prompt 'pedestrian' to count the number of pedestrians in the image.
|
@@ -269,7 +269,7 @@ print(len(detections))
|
|
269
269
|
OBSERVATION: 24
|
270
270
|
<count>7</count>
|
271
271
|
|
272
|
-
AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from
|
272
|
+
AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from vqa model's answer of 5-6 pedestrians. I will use countgd_object_detection's answer since it is specifically designed to count objects.</thinking>
|
273
273
|
<finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the countgd_object_detection tool on each region. The bounding boxes were then merged to count the unique pedestrians.</finalize_plan>
|
274
274
|
--- END EXAMPLE1 ---
|
275
275
|
"""
|
@@ -283,13 +283,13 @@ AGENT: <thinking>I need to gather more information, I will ask for a description
|
|
283
283
|
<execute_python>
|
284
284
|
frames = extract_frames_and_timestamps('video.mp4', 1)
|
285
285
|
frames = [f["frame"] for f in frames]
|
286
|
-
|
286
|
+
vqa('Can you describe this video? How many boxes are there and do they have tape on them?', frames[:5])
|
287
287
|
suggestion('How can I identify boxes with tape on them in the video?', frames[:5])
|
288
288
|
</execute_python>
|
289
289
|
|
290
|
-
OBSERVATION: [
|
290
|
+
OBSERVATION: [vqa output]
|
291
291
|
In this video, there appear to be 2 cardboard boxes moving along a roller conveyor system. The boxes are brown/kraft colored and appear to have tape sealing them. You can see clear packing tape on the top and sides of the boxes.
|
292
|
-
[end of
|
292
|
+
[end of vqa output]
|
293
293
|
|
294
294
|
[suggestions]
|
295
295
|
[suggestion 0]
|
@@ -400,20 +400,18 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
|
|
400
400
|
|
401
401
|
OBSERVATION:
|
402
402
|
[get_tool_for_task output]
|
403
|
-
|
403
|
+
glee_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
|
404
404
|
|
405
|
-
'
|
406
|
-
text prompt such as
|
407
|
-
|
408
|
-
normalized coordinates, label names and
|
405
|
+
'glee_object_detection' is a tool that can detect multiple objects given a
|
406
|
+
text prompt such as object names or referring expressions on images. It's
|
407
|
+
particularly good at detecting specific objects given detailed descriptive prompts.
|
408
|
+
It returns a list of bounding boxes with normalized coordinates, label names and
|
409
|
+
associated probability scores.
|
409
410
|
|
410
411
|
Parameters:
|
411
|
-
prompt (str): The prompt to ground to the image
|
412
|
+
prompt (str): The prompt to ground to the image, only supports a single prompt
|
413
|
+
with no commas or periods.
|
412
414
|
image (np.ndarray): The image to ground the prompt to.
|
413
|
-
box_threshold (float, optional): The threshold for the box detection. Defaults
|
414
|
-
to 0.10.
|
415
|
-
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
416
|
-
fine-tuned model ID here to use it.
|
417
415
|
|
418
416
|
Returns:
|
419
417
|
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
@@ -424,10 +422,10 @@ Returns:
|
|
424
422
|
|
425
423
|
Example
|
426
424
|
-------
|
427
|
-
>>>
|
425
|
+
>>> glee_object_detection("person holding a box", image)
|
428
426
|
[
|
429
|
-
{'score': 0.99, 'label': '
|
430
|
-
{'score': 0.98, 'label': '
|
427
|
+
{'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
428
|
+
{'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5]},
|
431
429
|
]
|
432
430
|
[end of get_tool_for_task output]
|
433
431
|
<count>7</count>
|
@@ -445,7 +443,7 @@ for frame, frame_predictions in zip(frames, track_predictions):
|
|
445
443
|
int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
|
446
444
|
:,
|
447
445
|
]
|
448
|
-
detections =
|
446
|
+
detections = glee_object_detection("tape", crop)
|
449
447
|
obj_to_info[obj["label"]].extend(detections)
|
450
448
|
|
451
449
|
|
@@ -512,7 +510,7 @@ PICK_PLAN = """
|
|
512
510
|
CATEGORIZE_TOOL_REQUEST = """
|
513
511
|
You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
|
514
512
|
- "VQA" - answering questions about an image or video, can be used for most tasks, should generally be included.
|
515
|
-
- "object detection
|
513
|
+
- "object detection" - detecting objects or counting objects from a text prompt in an image.
|
516
514
|
- "instance segmentation" - segmenting objects in an image given a text prompt.
|
517
515
|
- "classification" - classifying objects in an image given a text prompt.
|
518
516
|
- "segmentation" - segmenting objects in an image or video given a text prompt.
|
@@ -569,27 +567,27 @@ Count the number of pedestrians across all the images.
|
|
569
567
|
|
570
568
|
<code>
|
571
569
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
572
|
-
from vision_agent.tools import load_image,
|
570
|
+
from vision_agent.tools import load_image, agentic_object_detection, glee_object_detection, countgd_object_detection
|
573
571
|
|
574
572
|
# process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
|
575
|
-
def
|
573
|
+
def process_agentic(image_paths):
|
576
574
|
try:
|
577
575
|
results = []
|
578
576
|
for image_path in image_paths:
|
579
577
|
image = load_image(image_path)
|
580
|
-
results.extend(
|
578
|
+
results.extend(agentic_object_detection("person", image))
|
581
579
|
except Exception as e:
|
582
|
-
results = f"Encountered error when executing
|
580
|
+
results = f"Encountered error when executing process_agentic: {str(e)}"
|
583
581
|
return results
|
584
582
|
|
585
|
-
def
|
583
|
+
def process_glee(image_paths):
|
586
584
|
try:
|
587
585
|
results = []
|
588
586
|
for image_path in image_paths:
|
589
587
|
image = load_image(image_path)
|
590
|
-
results.extend(
|
588
|
+
results.extend(glee_object_detection("person", image))
|
591
589
|
except Exception as e:
|
592
|
-
results = f"Encountered error when executing
|
590
|
+
results = f"Encountered error when executing process_glee: {str(e)}"
|
593
591
|
return results
|
594
592
|
|
595
593
|
def process_countgd(image_paths):
|
@@ -606,8 +604,8 @@ image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
|
|
606
604
|
|
607
605
|
with ThreadPoolExecutor() as executor:
|
608
606
|
futures = {{
|
609
|
-
executor.submit(
|
610
|
-
executor.submit(
|
607
|
+
executor.submit(process_agentic, image_paths): "agentic_object_detection",
|
608
|
+
executor.submit(process_glee, image_paths): "glee_object_detection",
|
611
609
|
executor.submit(process_countgd, image_paths): "countgd_object_detection",
|
612
610
|
}}
|
613
611
|
|
@@ -632,7 +630,7 @@ Count the number of people in the video.
|
|
632
630
|
<code>
|
633
631
|
import numpy as np
|
634
632
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
635
|
-
from vision_agent.tools import extract_frames_and_timestamps,
|
633
|
+
from vision_agent.tools import extract_frames_and_timestamps, countgd_sam2_video_tracking, glee_sam2_video_tracking
|
636
634
|
|
637
635
|
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
638
636
|
frames = extract_frames_and_timestamps("video.mp4", 1)
|
@@ -649,18 +647,18 @@ def remove_arrays(o):
|
|
649
647
|
else:
|
650
648
|
return o
|
651
649
|
|
652
|
-
def
|
650
|
+
def process_countgd_sam2_video_tracking(frames):
|
653
651
|
try:
|
654
652
|
# run with chunk_length=1 to ensure best results
|
655
|
-
results =
|
653
|
+
results = countgd_sam2_video_tracking("person", frames, chunk_length=1)
|
656
654
|
except Exception as e:
|
657
|
-
results = f"Encountered error when executing
|
655
|
+
results = f"Encountered error when executing process_countgd_sam2_video_tracking: {str(e)}"
|
658
656
|
return results
|
659
657
|
|
660
|
-
def
|
658
|
+
def process_glee_sam2_video_tracking(frames):
|
661
659
|
try:
|
662
660
|
# run with chunk_length=1 to ensure best results
|
663
|
-
results =
|
661
|
+
results = glee_sam2_video_tracking("person", frames, chunk_length=1)
|
664
662
|
except Exception as e:
|
665
663
|
results = f"Encountered error when executing process_glee_sam2_video_tracking: {str(e)}"
|
666
664
|
return results
|
@@ -668,8 +666,8 @@ def process_florence2_sam2_video_tracking(frames):
|
|
668
666
|
|
669
667
|
with ThreadPoolExecutor() as executor:
|
670
668
|
futures = {{
|
671
|
-
executor.submit(
|
672
|
-
executor.submit(
|
669
|
+
executor.submit(process_countgd_sam2_video_tracking, frames): "countgd_sam2_video_tracking",
|
670
|
+
executor.submit(process_glee_sam2_video_tracking, frames): "glee_sam2_video_tracking",
|
673
671
|
}}
|
674
672
|
final_results = {{}}
|
675
673
|
for future in as_completed(futures):
|
@@ -701,7 +699,7 @@ PICK_TOOL = """
|
|
701
699
|
**Instructions**:
|
702
700
|
1. Re-read the user request, plans, tool outputs and examine the image.
|
703
701
|
2. Given the user request, try to solve the problem yourself.
|
704
|
-
3. Pick which tool output best matches your solution and the user request, DO NOT focus on other factors.
|
702
|
+
3. Pick which tool output best matches your solution and the user request, DO NOT focus on other factors like confidence score.
|
705
703
|
4. DO NOT modify confidence thresholds unless the tool output is completely wrong.
|
706
704
|
5. Remember for videos that in order to count objects in a video some sort of tracking is needed, or else you will overcount the objects.
|
707
705
|
6. Return the following JSON format inside <json> tags using the exact tool name for best_tool:
|
@@ -717,7 +715,7 @@ PICK_TOOL = """
|
|
717
715
|
FINALIZE_PLAN = """
|
718
716
|
**Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
|
719
717
|
|
720
|
-
**Documentation**: You can use these tools to help
|
718
|
+
**Documentation**: You can use these tools to help complete the code (they are imported `from vision_agent.tools import *`):
|
721
719
|
{tool_desc}
|
722
720
|
|
723
721
|
**Planning**: Here is the chain of thoughts, executions and observations from the planning agent:
|
@@ -726,13 +724,14 @@ FINALIZE_PLAN = """
|
|
726
724
|
**Instructions**:
|
727
725
|
1. Summarize the plan that the planning agent found.
|
728
726
|
2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
|
729
|
-
3.
|
730
|
-
4.
|
731
|
-
5.
|
732
|
-
6. Ensure
|
733
|
-
7.
|
734
|
-
8.
|
735
|
-
9.
|
727
|
+
3. Ensure the function takes in a single argument for the image or video path, all other arguments must be keyword arguments with default values.
|
728
|
+
4. Only use tools obtained from calling `get_tool_for_task` and the ones provided in the documentation.
|
729
|
+
5. Do not include {excluded_tools} tools in your instructions.
|
730
|
+
6. Ensure the function is well documented and easy to understand, DO NOT escape quotes in the function documentation.
|
731
|
+
7. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentation masks are produced, and save it to a file with `save_image` or `save_video`.
|
732
|
+
8. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
|
733
|
+
9. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
|
734
|
+
10. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
|
736
735
|
|
737
736
|
<json>
|
738
737
|
{{
|
@@ -744,7 +743,7 @@ FINALIZE_PLAN = """
|
|
744
743
|
</json>
|
745
744
|
|
746
745
|
<code>
|
747
|
-
#
|
746
|
+
# Your function code here
|
748
747
|
</code>
|
749
748
|
"""
|
750
749
|
|
@@ -73,7 +73,7 @@ class DefaultPlanningImports:
|
|
73
73
|
"import cv2",
|
74
74
|
"from typing import *",
|
75
75
|
"from vision_agent.tools import *",
|
76
|
-
"from vision_agent.tools.planner_tools import
|
76
|
+
"from vision_agent.tools.planner_tools import vqa, suggestion, get_tool_for_task",
|
77
77
|
"from pillow_heif import register_heif_opener",
|
78
78
|
"register_heif_opener()",
|
79
79
|
"import matplotlib.pyplot as plt",
|
@@ -228,6 +228,7 @@ def execute_code_action(
|
|
228
228
|
|
229
229
|
count = 1
|
230
230
|
while not execution.success and count <= 3:
|
231
|
+
start = time.time()
|
231
232
|
prompt = FIX_BUG.format(chat_history=get_planning(chat), code=code, error=obs)
|
232
233
|
response = cast(str, model.chat([{"role": "user", "content": prompt}]))
|
233
234
|
new_code = extract_tag(response, "code", extract_markdown="python")
|
@@ -243,7 +244,7 @@ def execute_code_action(
|
|
243
244
|
if verbose:
|
244
245
|
print_code(f"Fixing Bug Round {count}:", code)
|
245
246
|
_CONSOLE.print(
|
246
|
-
f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
247
|
+
f"[bold cyan]Code Execution Output ({end - start:.2f}s):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
|
247
248
|
)
|
248
249
|
count += 1
|
249
250
|
|