vision-agent 0.2.236__py3-none-any.whl → 0.2.237__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +57 -80
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/agent.py +2 -2
- vision_agent/agent/vision_agent.py +3 -2
- vision_agent/agent/vision_agent_coder.py +13 -19
- vision_agent/agent/vision_agent_coder_v2.py +17 -17
- vision_agent/agent/vision_agent_planner.py +16 -21
- vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
- vision_agent/agent/vision_agent_planner_v2.py +29 -15
- vision_agent/agent/vision_agent_v2.py +12 -12
- vision_agent/clients/landing_public_api.py +1 -1
- vision_agent/configs/config.py +17 -3
- vision_agent/lmm/__init__.py +0 -1
- vision_agent/lmm/lmm.py +4 -3
- vision_agent/models/__init__.py +11 -0
- vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
- vision_agent/sim/__init__.py +8 -0
- vision_agent/{utils → sim}/sim.py +3 -3
- vision_agent/tools/__init__.py +10 -23
- vision_agent/tools/meta_tools.py +4 -5
- vision_agent/tools/planner_tools.py +127 -37
- vision_agent/tools/tools.py +388 -302
- vision_agent/utils/__init__.py +0 -1
- vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
- vision_agent/utils/tools_doc.py +87 -0
- vision_agent/utils/video.py +15 -0
- vision_agent/utils/video_tracking.py +38 -5
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/METADATA +2 -2
- vision_agent-0.2.237.dist-info/RECORD +55 -0
- vision_agent-0.2.236.dist-info/RECORD +0 -52
- /vision_agent/{agent/types.py → models/agent_types.py} +0 -0
- /vision_agent/{tools → models}/tools_types.py +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -65,7 +65,7 @@ desc,doc,name
|
|
65
65
|
},
|
66
66
|
]
|
67
67
|
",owlv2_sam2_instance_segmentation
|
68
|
-
"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] =
|
68
|
+
"'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
|
69
69
|
'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
|
70
70
|
objects in a video given a text prompt such as category names or referring
|
71
71
|
expressions. The categories in the text prompt are separated by commas. It returns
|
@@ -75,6 +75,8 @@ desc,doc,name
|
|
75
75
|
Parameters:
|
76
76
|
prompt (str): The prompt to ground to the image.
|
77
77
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
78
|
+
box_threshold (float, optional): The threshold for the box detection. Defaults
|
79
|
+
to 0.10.
|
78
80
|
chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
|
79
81
|
new objects.
|
80
82
|
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
@@ -175,7 +177,7 @@ desc,doc,name
|
|
175
177
|
},
|
176
178
|
]
|
177
179
|
",countgd_sam2_instance_segmentation
|
178
|
-
"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] =
|
180
|
+
"'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
|
179
181
|
'countgd_sam2_video_tracking' is a tool that can track and segment multiple
|
180
182
|
objects in a video given a text prompt such as category names or referring
|
181
183
|
expressions. The categories in the text prompt are separated by commas. It returns
|
@@ -185,6 +187,8 @@ desc,doc,name
|
|
185
187
|
Parameters:
|
186
188
|
prompt (str): The prompt to ground to the image.
|
187
189
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
190
|
+
box_threshold (float, optional): The threshold for detection. Defaults
|
191
|
+
to 0.23.
|
188
192
|
chunk_length (Optional[int]): The number of frames to re-run countgd to find
|
189
193
|
new objects.
|
190
194
|
|
@@ -236,6 +240,34 @@ desc,doc,name
|
|
236
240
|
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
237
241
|
]
|
238
242
|
",florence2_ocr
|
243
|
+
"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
244
|
+
'florence2_object_detection' is a tool that can detect multiple objects given a
|
245
|
+
text prompt which can be object names or caption. You can optionally separate the
|
246
|
+
object names in the text with commas. It returns a list of bounding boxes with
|
247
|
+
normalized coordinates, label names and associated confidence scores of 1.0.
|
248
|
+
|
249
|
+
Parameters:
|
250
|
+
prompt (str): The prompt to ground to the image. Use exclusive categories that
|
251
|
+
do not overlap such as 'person, car' and NOT 'person, athlete'.
|
252
|
+
image (np.ndarray): The image used to detect objects
|
253
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
254
|
+
fine-tuned model ID here to use it.
|
255
|
+
|
256
|
+
Returns:
|
257
|
+
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
258
|
+
bounding box of the detected objects with normalized coordinates between 0
|
259
|
+
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
260
|
+
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
261
|
+
bounding box. The scores are always 1.0 and cannot be thresholded
|
262
|
+
|
263
|
+
Example
|
264
|
+
-------
|
265
|
+
>>> florence2_object_detection('person looking at a coyote', image)
|
266
|
+
[
|
267
|
+
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
268
|
+
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
|
269
|
+
]
|
270
|
+
",florence2_object_detection
|
239
271
|
"'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
240
272
|
'florence2_sam2_instance_segmentation' is a tool that can segment multiple
|
241
273
|
objects given a text prompt such as category names or referring expressions. The
|
@@ -274,7 +306,7 @@ desc,doc,name
|
|
274
306
|
},
|
275
307
|
]
|
276
308
|
",florence2_sam2_instance_segmentation
|
277
|
-
"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] =
|
309
|
+
"'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
|
278
310
|
'florence2_sam2_video_tracking' is a tool that can track and segment multiple
|
279
311
|
objects in a video given a text prompt such as category names or referring
|
280
312
|
expressions. The categories in the text prompt are separated by commas. It returns
|
@@ -318,34 +350,6 @@ desc,doc,name
|
|
318
350
|
...
|
319
351
|
]
|
320
352
|
",florence2_sam2_video_tracking
|
321
|
-
"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
322
|
-
'florence2_object_detection' is a tool that can detect multiple objects given a
|
323
|
-
text prompt which can be object names or caption. You can optionally separate the
|
324
|
-
object names in the text with commas. It returns a list of bounding boxes with
|
325
|
-
normalized coordinates, label names and associated confidence scores of 1.0.
|
326
|
-
|
327
|
-
Parameters:
|
328
|
-
prompt (str): The prompt to ground to the image. Use exclusive categories that
|
329
|
-
do not overlap such as 'person, car' and NOT 'person, athlete'.
|
330
|
-
image (np.ndarray): The image to used to detect objects
|
331
|
-
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
332
|
-
fine-tuned model ID here to use it.
|
333
|
-
|
334
|
-
Returns:
|
335
|
-
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
336
|
-
bounding box of the detected objects with normalized coordinates between 0
|
337
|
-
and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
|
338
|
-
top-left and xmax and ymax are the coordinates of the bottom-right of the
|
339
|
-
bounding box. The scores are always 1.0 and cannot be thresholded
|
340
|
-
|
341
|
-
Example
|
342
|
-
-------
|
343
|
-
>>> florence2_object_detection('person looking at a coyote', image)
|
344
|
-
[
|
345
|
-
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
346
|
-
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
347
|
-
]
|
348
|
-
",florence2_object_detection
|
349
353
|
'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
|
350
354
|
'claude35_text_extraction' is a tool that can extract text from an image. It
|
351
355
|
returns the extracted text as a string and can be used as an alternative to OCR if
|
@@ -458,6 +462,28 @@ desc,doc,name
|
|
458
462
|
>>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
|
459
463
|
'Lionel Messi'
|
460
464
|
",qwen2_vl_video_vqa
|
465
|
+
'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 10) -> List[float]:
|
466
|
+
'activity_recognition' is a tool that can recognize activities in a video given a
|
467
|
+
text prompt. It can be used to identify where specific activities or actions
|
468
|
+
happen in a video and returns a list of 0s and 1s to indicate the activity.
|
469
|
+
|
470
|
+
Parameters:
|
471
|
+
prompt (str): The event you want to identify, should be phrased as a question,
|
472
|
+
for example, ""Did a goal happen?"".
|
473
|
+
frames (List[np.ndarray]): The reference frames used for the question
|
474
|
+
model (str): The model to use for the inference. Valid values are
|
475
|
+
'claude-35', 'gpt-4o', 'qwen2vl'.
|
476
|
+
chunk_length_frames (int): length of each chunk in frames
|
477
|
+
|
478
|
+
Returns:
|
479
|
+
List[float]: A list of floats with a value of 1.0 if the activity is detected in
|
480
|
+
the chunk_length_frames of the video.
|
481
|
+
|
482
|
+
Example
|
483
|
+
-------
|
484
|
+
>>> activity_recognition('Did a goal happen?', frames)
|
485
|
+
[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
486
|
+
",activity_recognition
|
461
487
|
'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
|
462
488
|
'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
|
463
489
|
depth image from a given RGB image. The returned depth image is monochrome and
|
@@ -514,30 +540,6 @@ desc,doc,name
|
|
514
540
|
>>> vit_nsfw_classification(image)
|
515
541
|
{""label"": ""normal"", ""scores"": 0.68},
|
516
542
|
",vit_nsfw_classification
|
517
|
-
'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
|
518
|
-
'video_temporal_localization' will run qwen2vl on each chunk_length_frames
|
519
|
-
value selected for the video. It can detect multiple objects independently per
|
520
|
-
chunk_length_frames given a text prompt such as a referring expression
|
521
|
-
but does not track objects across frames.
|
522
|
-
It returns a list of floats with a value of 1.0 if the objects are found in a given
|
523
|
-
chunk_length_frames of the video.
|
524
|
-
|
525
|
-
Parameters:
|
526
|
-
prompt (str): The question about the video
|
527
|
-
frames (List[np.ndarray]): The reference frames used for the question
|
528
|
-
model (str): The model to use for the inference. Valid values are
|
529
|
-
'qwen2vl', 'gpt4o'.
|
530
|
-
chunk_length_frames (int): length of each chunk in frames
|
531
|
-
|
532
|
-
Returns:
|
533
|
-
List[float]: A list of floats with a value of 1.0 if the objects to be found
|
534
|
-
are present in the chunk_length_frames of the video.
|
535
|
-
|
536
|
-
Example
|
537
|
-
-------
|
538
|
-
>>> video_temporal_localization('Did a goal happened?', frames)
|
539
|
-
[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
540
|
-
",video_temporal_localization
|
541
543
|
"'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
|
542
544
|
'flux_image_inpainting' performs image inpainting to fill the masked regions,
|
543
545
|
given by mask, in the image, given image based on the text prompt and surrounding
|
@@ -728,28 +730,3 @@ desc,doc,name
|
|
728
730
|
}],
|
729
731
|
)
|
730
732
|
",overlay_segmentation_masks
|
731
|
-
'overlay_heat_map' is a utility function that displays a heat map on an image.,"overlay_heat_map(image: numpy.ndarray, heat_map: Dict[str, Any], alpha: float = 0.8) -> numpy.ndarray:
|
732
|
-
'overlay_heat_map' is a utility function that displays a heat map on an image.
|
733
|
-
|
734
|
-
Parameters:
|
735
|
-
image (np.ndarray): The image to display the heat map on.
|
736
|
-
heat_map (Dict[str, Any]): A dictionary containing the heat map under the key
|
737
|
-
'heat_map'.
|
738
|
-
alpha (float, optional): The transparency of the overlay. Defaults to 0.8.
|
739
|
-
|
740
|
-
Returns:
|
741
|
-
np.ndarray: The image with the heat map displayed.
|
742
|
-
|
743
|
-
Example
|
744
|
-
-------
|
745
|
-
>>> image_with_heat_map = overlay_heat_map(
|
746
|
-
image,
|
747
|
-
{
|
748
|
-
'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
|
749
|
-
[0, 0, 0, ..., 0, 0, 0],
|
750
|
-
...,
|
751
|
-
[0, 0, 0, ..., 0, 0, 0],
|
752
|
-
[0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
|
753
|
-
},
|
754
|
-
)
|
755
|
-
",overlay_heat_map
|
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
|
vision_agent/agent/agent.py
CHANGED
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
|
|
2
2
|
from pathlib import Path
|
3
3
|
from typing import Any, Dict, List, Optional, Union
|
4
4
|
|
5
|
-
from vision_agent.
|
5
|
+
from vision_agent.models import (
|
6
6
|
AgentMessage,
|
7
7
|
CodeContext,
|
8
8
|
InteractionContext,
|
9
|
+
Message,
|
9
10
|
PlanContext,
|
10
11
|
)
|
11
|
-
from vision_agent.lmm.types import Message
|
12
12
|
from vision_agent.utils.execute import CodeInterpreter
|
13
13
|
|
14
14
|
|
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
6
6
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
|
7
7
|
|
8
8
|
from vision_agent.agent import Agent
|
9
|
-
from vision_agent.agent.agent_utils import extract_json, extract_tag
|
10
9
|
from vision_agent.agent.vision_agent_prompts import (
|
11
10
|
EXAMPLES_CODE1,
|
12
11
|
EXAMPLES_CODE2,
|
@@ -14,7 +13,8 @@ from vision_agent.agent.vision_agent_prompts import (
|
|
14
13
|
EXAMPLES_CODE3_EXTRA2,
|
15
14
|
VA_CODE,
|
16
15
|
)
|
17
|
-
from vision_agent.lmm import LMM, AnthropicLMM,
|
16
|
+
from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
|
17
|
+
from vision_agent.models import Message
|
18
18
|
from vision_agent.tools.meta_tools import (
|
19
19
|
META_TOOL_DOCSTRING,
|
20
20
|
Artifacts,
|
@@ -22,6 +22,7 @@ from vision_agent.tools.meta_tools import (
|
|
22
22
|
use_extra_vision_agent_args,
|
23
23
|
)
|
24
24
|
from vision_agent.utils import CodeInterpreterFactory
|
25
|
+
from vision_agent.utils.agent import extract_json, extract_tag
|
25
26
|
from vision_agent.utils.execute import CodeInterpreter, Execution
|
26
27
|
|
27
28
|
logging.basicConfig(level=logging.INFO)
|
@@ -9,16 +9,6 @@ from tabulate import tabulate
|
|
9
9
|
|
10
10
|
import vision_agent.tools as T
|
11
11
|
from vision_agent.agent.agent import Agent
|
12
|
-
from vision_agent.agent.agent_utils import (
|
13
|
-
_MAX_TABULATE_COL_WIDTH,
|
14
|
-
DefaultImports,
|
15
|
-
extract_code,
|
16
|
-
extract_tag,
|
17
|
-
format_feedback,
|
18
|
-
print_code,
|
19
|
-
remove_installs_from_code,
|
20
|
-
strip_function_calls,
|
21
|
-
)
|
22
12
|
from vision_agent.agent.vision_agent_coder_prompts import (
|
23
13
|
CODE,
|
24
14
|
FIX_BUG,
|
@@ -32,16 +22,20 @@ from vision_agent.agent.vision_agent_planner import (
|
|
32
22
|
OpenAIVisionAgentPlanner,
|
33
23
|
PlanContext,
|
34
24
|
)
|
35
|
-
from vision_agent.lmm import
|
36
|
-
|
37
|
-
AnthropicLMM,
|
38
|
-
AzureOpenAILMM,
|
39
|
-
Message,
|
40
|
-
OllamaLMM,
|
41
|
-
OpenAILMM,
|
42
|
-
)
|
25
|
+
from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
|
26
|
+
from vision_agent.models import Message
|
43
27
|
from vision_agent.tools.meta_tools import get_diff
|
44
28
|
from vision_agent.utils import CodeInterpreterFactory, Execution
|
29
|
+
from vision_agent.utils.agent import (
|
30
|
+
_MAX_TABULATE_COL_WIDTH,
|
31
|
+
DefaultImports,
|
32
|
+
extract_code,
|
33
|
+
extract_tag,
|
34
|
+
format_feedback,
|
35
|
+
print_code,
|
36
|
+
remove_installs_from_code,
|
37
|
+
strip_function_calls,
|
38
|
+
)
|
45
39
|
from vision_agent.utils.execute import CodeInterpreter
|
46
40
|
|
47
41
|
logging.basicConfig(stream=sys.stdout)
|
@@ -490,7 +484,7 @@ class VisionAgentCoder(Agent):
|
|
490
484
|
tool_info=tool_doc,
|
491
485
|
tool_output=tool_output_str,
|
492
486
|
plan_thoughts=plan_thoughts_str,
|
493
|
-
tool_utils=T.
|
487
|
+
tool_utils=T.get_utilties_docstring(),
|
494
488
|
working_memory=working_memory,
|
495
489
|
coder=self.coder,
|
496
490
|
tester=self.tester,
|
@@ -5,9 +5,22 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
|
5
5
|
from rich.console import Console
|
6
6
|
from rich.markup import escape
|
7
7
|
|
8
|
-
import vision_agent.tools.tools as T
|
9
8
|
from vision_agent.agent import AgentCoder, AgentPlanner
|
10
|
-
from vision_agent.agent.
|
9
|
+
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
10
|
+
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
11
|
+
from vision_agent.configs import Config
|
12
|
+
from vision_agent.lmm import LMM
|
13
|
+
from vision_agent.models import (
|
14
|
+
AgentMessage,
|
15
|
+
CodeContext,
|
16
|
+
InteractionContext,
|
17
|
+
Message,
|
18
|
+
PlanContext,
|
19
|
+
)
|
20
|
+
from vision_agent.sim import Sim, get_tool_recommender
|
21
|
+
from vision_agent.tools.meta_tools import get_diff
|
22
|
+
from vision_agent.tools.tools import get_utilties_docstring
|
23
|
+
from vision_agent.utils.agent import (
|
11
24
|
DefaultImports,
|
12
25
|
add_media_to_chat,
|
13
26
|
capture_media_from_exec,
|
@@ -18,24 +31,11 @@ from vision_agent.agent.agent_utils import (
|
|
18
31
|
print_code,
|
19
32
|
strip_function_calls,
|
20
33
|
)
|
21
|
-
from vision_agent.agent.types import (
|
22
|
-
AgentMessage,
|
23
|
-
CodeContext,
|
24
|
-
InteractionContext,
|
25
|
-
PlanContext,
|
26
|
-
)
|
27
|
-
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
28
|
-
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
29
|
-
from vision_agent.configs import Config
|
30
|
-
from vision_agent.lmm import LMM
|
31
|
-
from vision_agent.lmm.types import Message
|
32
|
-
from vision_agent.tools.meta_tools import get_diff
|
33
34
|
from vision_agent.utils.execute import (
|
34
35
|
CodeInterpreter,
|
35
36
|
CodeInterpreterFactory,
|
36
37
|
Execution,
|
37
38
|
)
|
38
|
-
from vision_agent.utils.sim import Sim, get_tool_recommender
|
39
39
|
|
40
40
|
CONFIG = Config()
|
41
41
|
_CONSOLE = Console()
|
@@ -207,7 +207,7 @@ def test_code(
|
|
207
207
|
test = write_test(
|
208
208
|
tester=tester,
|
209
209
|
chat=chat,
|
210
|
-
tool_util_docs=
|
210
|
+
tool_util_docs=get_utilties_docstring(),
|
211
211
|
code=code,
|
212
212
|
media_list=media_list,
|
213
213
|
)
|
@@ -227,7 +227,7 @@ def test_code(
|
|
227
227
|
while (not result.success or len(result.logs.stdout) == 0) and count < 3:
|
228
228
|
code, test, debug_info = debug_code(
|
229
229
|
debugger,
|
230
|
-
|
230
|
+
get_utilties_docstring() + "\n" + tool_docs,
|
231
231
|
plan,
|
232
232
|
code,
|
233
233
|
test,
|
@@ -9,15 +9,6 @@ from tabulate import tabulate
|
|
9
9
|
|
10
10
|
import vision_agent.tools as T
|
11
11
|
from vision_agent.agent import Agent
|
12
|
-
from vision_agent.agent.agent_utils import (
|
13
|
-
_MAX_TABULATE_COL_WIDTH,
|
14
|
-
DefaultImports,
|
15
|
-
extract_code,
|
16
|
-
extract_json,
|
17
|
-
format_feedback,
|
18
|
-
format_plans,
|
19
|
-
print_code,
|
20
|
-
)
|
21
12
|
from vision_agent.agent.vision_agent_planner_prompts import (
|
22
13
|
PICK_PLAN,
|
23
14
|
PLAN,
|
@@ -25,20 +16,24 @@ from vision_agent.agent.vision_agent_planner_prompts import (
|
|
25
16
|
TEST_PLANS,
|
26
17
|
USER_REQ,
|
27
18
|
)
|
28
|
-
from vision_agent.lmm import
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
19
|
+
from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
|
20
|
+
from vision_agent.models import Message
|
21
|
+
from vision_agent.sim import AzureSim, OllamaSim, Sim
|
22
|
+
from vision_agent.utils.agent import (
|
23
|
+
_MAX_TABULATE_COL_WIDTH,
|
24
|
+
DefaultImports,
|
25
|
+
extract_code,
|
26
|
+
extract_json,
|
27
|
+
format_feedback,
|
28
|
+
format_plans,
|
29
|
+
print_code,
|
35
30
|
)
|
36
31
|
from vision_agent.utils.execute import (
|
37
32
|
CodeInterpreter,
|
38
33
|
CodeInterpreterFactory,
|
39
34
|
Execution,
|
40
35
|
)
|
41
|
-
from vision_agent.utils.
|
36
|
+
from vision_agent.utils.tools_doc import get_tool_descriptions_by_names
|
42
37
|
|
43
38
|
_LOGGER = logging.getLogger(__name__)
|
44
39
|
|
@@ -348,7 +343,7 @@ class VisionAgentPlanner(Agent):
|
|
348
343
|
_LOGGER.setLevel(logging.INFO)
|
349
344
|
|
350
345
|
self.tool_recommender = (
|
351
|
-
Sim(T.
|
346
|
+
Sim(T.get_tools_df(), sim_key="desc")
|
352
347
|
if tool_recommender is None
|
353
348
|
else tool_recommender
|
354
349
|
)
|
@@ -414,7 +409,7 @@ class VisionAgentPlanner(Agent):
|
|
414
409
|
|
415
410
|
plans = write_plans(
|
416
411
|
chat,
|
417
|
-
|
412
|
+
get_tool_descriptions_by_names(
|
418
413
|
custom_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
419
414
|
),
|
420
415
|
format_feedback(working_memory),
|
@@ -537,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
|
|
537
532
|
else planner
|
538
533
|
),
|
539
534
|
tool_recommender=(
|
540
|
-
OllamaSim(T.
|
535
|
+
OllamaSim(T.get_tools_df(), sim_key="desc")
|
541
536
|
if tool_recommender is None
|
542
537
|
else tool_recommender
|
543
538
|
),
|
@@ -559,7 +554,7 @@ class AzureVisionAgentPlanner(VisionAgentPlanner):
|
|
559
554
|
super().__init__(
|
560
555
|
planner=(AzureOpenAILMM(temperature=0.0) if planner is None else planner),
|
561
556
|
tool_recommender=(
|
562
|
-
AzureSim(T.
|
557
|
+
AzureSim(T.get_tools_df(), sim_key="desc")
|
563
558
|
if tool_recommender is None
|
564
559
|
else tool_recommender
|
565
560
|
),
|
@@ -9,21 +9,22 @@ PLAN = """
|
|
9
9
|
**Example Planning**: Here are some examples of how you can search for a plan, in the examples the user output is denoted by USER, your output is denoted by AGENT and the observations after your code execution are denoted by OBSERVATION:
|
10
10
|
{examples}
|
11
11
|
|
12
|
-
**Current Planning**:
|
13
|
-
--- START PLANNING ---
|
12
|
+
**Current Planning**: This is the plan you are currently working on
|
13
|
+
--- START CURRENT PLANNING ---
|
14
14
|
{planning}
|
15
|
-
--- END PLANNING ---
|
15
|
+
--- END CURRENT PLANNING ---
|
16
16
|
|
17
17
|
**Instructions**:
|
18
18
|
1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
|
19
19
|
2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
|
20
20
|
3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
|
21
21
|
4. Only load/save files from {media_list} unless you specifically saved the file previously.
|
22
|
-
5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
|
22
|
+
5. Ensure you always call `suggestion` and `claude35_vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
|
23
23
|
6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
|
24
|
-
7.
|
25
|
-
8. DO NOT
|
26
|
-
9.
|
24
|
+
7. Be sure to print results returned for tools so you can see the output.
|
25
|
+
8. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
|
26
|
+
9. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool output, tools are often better at things like counting and detecting small objects.
|
27
|
+
10. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
|
27
28
|
|
28
29
|
<thinking>Your thought process...</thinking>
|
29
30
|
<execute_python>Your code here</execute_python>
|
@@ -334,23 +335,21 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5])
|
|
334
335
|
|
335
336
|
OBSERVATION:
|
336
337
|
[get_tool_for_task output]
|
337
|
-
For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs:
|
338
|
+
For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: countgd_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
|
338
339
|
|
339
340
|
Tool Documentation:
|
340
|
-
def
|
341
|
-
'
|
341
|
+
def countgd_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
|
342
|
+
'countgd_sam2_video_tracking' is a tool that can track and segment multiple
|
342
343
|
objects in a video given a text prompt such as category names or referring
|
343
344
|
expressions. The categories in the text prompt are separated by commas. It returns
|
344
345
|
a list of bounding boxes, label names, masks and associated probability scores and
|
345
346
|
is useful for tracking and counting without duplicating counts.
|
346
347
|
|
347
348
|
Parameters:
|
348
|
-
prompt (str): The prompt to ground to the
|
349
|
+
prompt (str): The prompt to ground to the image.
|
349
350
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
350
|
-
chunk_length (Optional[int]): The number of frames to re-run
|
351
|
+
chunk_length (Optional[int]): The number of frames to re-run countgd to find
|
351
352
|
new objects.
|
352
|
-
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
353
|
-
fine-tuned model ID here to use it.
|
354
353
|
|
355
354
|
Returns:
|
356
355
|
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
|
@@ -364,7 +363,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
|
|
364
363
|
|
365
364
|
Example
|
366
365
|
-------
|
367
|
-
>>>
|
366
|
+
>>> countgd_sam2_video_tracking("car, dinosaur", frames)
|
368
367
|
[
|
369
368
|
[
|
370
369
|
{
|
@@ -382,7 +381,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
|
|
382
381
|
|
383
382
|
AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
|
384
383
|
<execute_python>
|
385
|
-
track_predictions =
|
384
|
+
track_predictions = countgd_sam2_video_tracking("box", frames)
|
386
385
|
|
387
386
|
# only use first 5 frames to reduce processing time
|
388
387
|
crops = []
|
@@ -512,16 +511,16 @@ PICK_PLAN = """
|
|
512
511
|
|
513
512
|
CATEGORIZE_TOOL_REQUEST = """
|
514
513
|
You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
|
514
|
+
- "VQA" - answering questions about an image or video, can be used for most tasks, should generally be included.
|
515
515
|
- "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
|
516
516
|
- "instance segmentation" - segmenting objects in an image given a text prompt.
|
517
517
|
- "classification" - classifying objects in an image given a text prompt.
|
518
518
|
- "segmentation" - segmenting objects in an image or video given a text prompt.
|
519
519
|
- "OCR" - extracting text from an image.
|
520
|
-
- "VQA" - answering questions about an image or video, can also be used for text extraction.
|
521
520
|
- "DocQA" - answering questions about a document or extracting information from a document.
|
522
521
|
- "video object tracking" - tracking objects in a video.
|
523
522
|
- "depth and pose estimation" - estimating the depth or pose of objects in an image.
|
524
|
-
- "
|
523
|
+
- "activity recognition" - identifying time period(s) an event occurs in a video.
|
525
524
|
- "inpainting" - filling in masked parts of an image.
|
526
525
|
|
527
526
|
Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than fewer.
|
@@ -718,7 +717,7 @@ PICK_TOOL = """
|
|
718
717
|
FINALIZE_PLAN = """
|
719
718
|
**Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
|
720
719
|
|
721
|
-
**Documentation**: You can use these tools to help you visualize or save the output:
|
720
|
+
**Documentation**: You can use these tools to help you visualize or save the output (they are imported `from vision_agent.tools import *`):
|
722
721
|
{tool_desc}
|
723
722
|
|
724
723
|
**Planning**: Here is the chain of thoughts, executions and observations from the planning agent:
|
@@ -730,7 +729,7 @@ FINALIZE_PLAN = """
|
|
730
729
|
3. Only use tools obtained from calling `get_tool_for_task`.
|
731
730
|
4. Do not include {excluded_tools} tools in your instructions.
|
732
731
|
5. Ensure the function is well documented and easy to understand.
|
733
|
-
6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks
|
732
|
+
6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentation masks are produced, and save it to a file with `save_image` or `save_video`.
|
734
733
|
7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
|
735
734
|
8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
|
736
735
|
9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
|
@@ -13,17 +13,6 @@ from rich.markup import escape
|
|
13
13
|
import vision_agent.tools as T
|
14
14
|
import vision_agent.tools.planner_tools as pt
|
15
15
|
from vision_agent.agent import AgentPlanner
|
16
|
-
from vision_agent.agent.agent_utils import (
|
17
|
-
add_media_to_chat,
|
18
|
-
capture_media_from_exec,
|
19
|
-
convert_message_to_agentmessage,
|
20
|
-
extract_json,
|
21
|
-
extract_tag,
|
22
|
-
print_code,
|
23
|
-
print_table,
|
24
|
-
remove_installs_from_code,
|
25
|
-
)
|
26
|
-
from vision_agent.agent.types import AgentMessage, InteractionContext, PlanContext
|
27
16
|
from vision_agent.agent.vision_agent_planner_prompts_v2 import (
|
28
17
|
CRITIQUE_PLAN,
|
29
18
|
EXAMPLE_PLAN1,
|
@@ -34,17 +23,29 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
|
|
34
23
|
PLAN,
|
35
24
|
)
|
36
25
|
from vision_agent.configs import Config
|
37
|
-
from vision_agent.lmm import LMM
|
38
|
-
from vision_agent.
|
26
|
+
from vision_agent.lmm import LMM
|
27
|
+
from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
|
28
|
+
from vision_agent.tools.planner_tools import check_function_call
|
29
|
+
from vision_agent.utils.agent import (
|
30
|
+
add_media_to_chat,
|
31
|
+
capture_media_from_exec,
|
32
|
+
convert_message_to_agentmessage,
|
33
|
+
extract_json,
|
34
|
+
extract_tag,
|
35
|
+
print_code,
|
36
|
+
print_table,
|
37
|
+
remove_installs_from_code,
|
38
|
+
)
|
39
39
|
from vision_agent.utils.execute import (
|
40
40
|
CodeInterpreter,
|
41
41
|
CodeInterpreterFactory,
|
42
42
|
Execution,
|
43
43
|
)
|
44
|
+
from vision_agent.utils.tools_doc import get_tool_documentation
|
44
45
|
|
45
46
|
logging.basicConfig(level=logging.INFO)
|
46
47
|
CONFIG = Config()
|
47
|
-
UTIL_DOCSTRING =
|
48
|
+
UTIL_DOCSTRING = get_tool_documentation(
|
48
49
|
[
|
49
50
|
T.load_image,
|
50
51
|
T.extract_frames_and_timestamps,
|
@@ -360,6 +361,16 @@ def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
|
|
360
361
|
return max_steps
|
361
362
|
|
362
363
|
|
364
|
+
def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
|
365
|
+
return_str = "[get_tool_for_task output]\n"
|
366
|
+
if tool_thoughts.strip() != "":
|
367
|
+
return_str += f"{tool_thoughts}\n\n"
|
368
|
+
return_str += (
|
369
|
+
f"Tool Documentation:\n{tool_docstring}\n[end of get_tool_for_task output]\n"
|
370
|
+
)
|
371
|
+
return return_str
|
372
|
+
|
373
|
+
|
363
374
|
def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]:
|
364
375
|
chat = copy.deepcopy(chat)
|
365
376
|
new_chat = []
|
@@ -371,7 +382,10 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
|
|
371
382
|
try:
|
372
383
|
response = json.loads(chat[i + 1].content)
|
373
384
|
function_name = response["function_name"]
|
374
|
-
|
385
|
+
tools_df = T.get_tools_df()
|
386
|
+
tool_doc = format_tool_output(
|
387
|
+
"", tools_df[tools_df["name"] == function_name]["doc"].values[0]
|
388
|
+
)
|
375
389
|
if "box_threshold" in response:
|
376
390
|
tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
|
377
391
|
new_chat.append(AgentMessage(role="observation", content=tool_doc))
|