vision-agent 0.2.236__py3-none-any.whl → 0.2.238__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. vision_agent/.sim_tools/df.csv +57 -80
  2. vision_agent/.sim_tools/embs.npy +0 -0
  3. vision_agent/agent/agent.py +2 -2
  4. vision_agent/agent/vision_agent.py +3 -2
  5. vision_agent/agent/vision_agent_coder.py +13 -19
  6. vision_agent/agent/vision_agent_coder_v2.py +17 -17
  7. vision_agent/agent/vision_agent_planner.py +16 -21
  8. vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
  9. vision_agent/agent/vision_agent_planner_v2.py +29 -15
  10. vision_agent/agent/vision_agent_v2.py +12 -12
  11. vision_agent/clients/landing_public_api.py +1 -1
  12. vision_agent/configs/anthropic_openai_config.py +17 -3
  13. vision_agent/configs/config.py +17 -3
  14. vision_agent/lmm/__init__.py +0 -1
  15. vision_agent/lmm/lmm.py +4 -3
  16. vision_agent/models/__init__.py +11 -0
  17. vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
  18. vision_agent/sim/__init__.py +9 -0
  19. vision_agent/{utils → sim}/sim.py +3 -3
  20. vision_agent/tools/__init__.py +10 -23
  21. vision_agent/tools/meta_tools.py +4 -5
  22. vision_agent/tools/planner_tools.py +148 -37
  23. vision_agent/tools/tools.py +388 -302
  24. vision_agent/utils/__init__.py +0 -1
  25. vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
  26. vision_agent/utils/image_utils.py +18 -7
  27. vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
  28. vision_agent/utils/tools_doc.py +87 -0
  29. vision_agent/utils/video.py +15 -0
  30. vision_agent/utils/video_tracking.py +38 -5
  31. {vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/METADATA +2 -3
  32. vision_agent-0.2.238.dist-info/RECORD +55 -0
  33. vision_agent-0.2.236.dist-info/RECORD +0 -52
  34. /vision_agent/{agent/types.py → models/agent_types.py} +0 -0
  35. /vision_agent/{tools → models}/tools_types.py +0 -0
  36. {vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/LICENSE +0 -0
  37. {vision_agent-0.2.236.dist-info → vision_agent-0.2.238.dist-info}/WHEEL +0 -0
@@ -65,7 +65,7 @@ desc,doc,name
65
65
  },
66
66
  ]
67
67
  ",owlv2_sam2_instance_segmentation
68
- "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
68
+ "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
69
69
  'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
70
70
  objects in a video given a text prompt such as category names or referring
71
71
  expressions. The categories in the text prompt are separated by commas. It returns
@@ -75,6 +75,8 @@ desc,doc,name
75
75
  Parameters:
76
76
  prompt (str): The prompt to ground to the image.
77
77
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
78
+ box_threshold (float, optional): The threshold for the box detection. Defaults
79
+ to 0.10.
78
80
  chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
79
81
  new objects.
80
82
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
@@ -175,7 +177,7 @@ desc,doc,name
175
177
  },
176
178
  ]
177
179
  ",countgd_sam2_instance_segmentation
178
- "'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
180
+ "'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
179
181
  'countgd_sam2_video_tracking' is a tool that can track and segment multiple
180
182
  objects in a video given a text prompt such as category names or referring
181
183
  expressions. The categories in the text prompt are separated by commas. It returns
@@ -185,6 +187,8 @@ desc,doc,name
185
187
  Parameters:
186
188
  prompt (str): The prompt to ground to the image.
187
189
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
190
+ box_threshold (float, optional): The threshold for detection. Defaults
191
+ to 0.23.
188
192
  chunk_length (Optional[int]): The number of frames to re-run countgd to find
189
193
  new objects.
190
194
 
@@ -236,6 +240,34 @@ desc,doc,name
236
240
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
237
241
  ]
238
242
  ",florence2_ocr
243
+ "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
244
+ 'florence2_object_detection' is a tool that can detect multiple objects given a
245
+ text prompt which can be object names or caption. You can optionally separate the
246
+ object names in the text with commas. It returns a list of bounding boxes with
247
+ normalized coordinates, label names and associated confidence scores of 1.0.
248
+
249
+ Parameters:
250
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
251
+ do not overlap such as 'person, car' and NOT 'person, athlete'.
252
+ image (np.ndarray): The image used to detect objects
253
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
254
+ fine-tuned model ID here to use it.
255
+
256
+ Returns:
257
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
258
+ bounding box of the detected objects with normalized coordinates between 0
259
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
260
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
261
+ bounding box. The scores are always 1.0 and cannot be thresholded
262
+
263
+ Example
264
+ -------
265
+ >>> florence2_object_detection('person looking at a coyote', image)
266
+ [
267
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
268
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
269
+ ]
270
+ ",florence2_object_detection
239
271
  "'florence2_sam2_instance_segmentation' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores of 1.0.","florence2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
240
272
  'florence2_sam2_instance_segmentation' is a tool that can segment multiple
241
273
  objects given a text prompt such as category names or referring expressions. The
@@ -274,7 +306,7 @@ desc,doc,name
274
306
  },
275
307
  ]
276
308
  ",florence2_sam2_instance_segmentation
277
- "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
309
+ "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 25, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
278
310
  'florence2_sam2_video_tracking' is a tool that can track and segment multiple
279
311
  objects in a video given a text prompt such as category names or referring
280
312
  expressions. The categories in the text prompt are separated by commas. It returns
@@ -318,34 +350,6 @@ desc,doc,name
318
350
  ...
319
351
  ]
320
352
  ",florence2_sam2_video_tracking
321
- "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
322
- 'florence2_object_detection' is a tool that can detect multiple objects given a
323
- text prompt which can be object names or caption. You can optionally separate the
324
- object names in the text with commas. It returns a list of bounding boxes with
325
- normalized coordinates, label names and associated confidence scores of 1.0.
326
-
327
- Parameters:
328
- prompt (str): The prompt to ground to the image. Use exclusive categories that
329
- do not overlap such as 'person, car' and NOT 'person, athlete'.
330
- image (np.ndarray): The image to used to detect objects
331
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
332
- fine-tuned model ID here to use it.
333
-
334
- Returns:
335
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
336
- bounding box of the detected objects with normalized coordinates between 0
337
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
338
- top-left and xmax and ymax are the coordinates of the bottom-right of the
339
- bounding box. The scores are always 1.0 and cannot be thresholded
340
-
341
- Example
342
- -------
343
- >>> florence2_object_detection('person looking at a coyote', image)
344
- [
345
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
346
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
347
- ]
348
- ",florence2_object_detection
349
353
  'claude35_text_extraction' is a tool that can extract text from an image. It returns the extracted text as a string and can be used as an alternative to OCR if you do not need to know the exact bounding box of the text.,"claude35_text_extraction(image: numpy.ndarray) -> str:
350
354
  'claude35_text_extraction' is a tool that can extract text from an image. It
351
355
  returns the extracted text as a string and can be used as an alternative to OCR if
@@ -458,6 +462,28 @@ desc,doc,name
458
462
  >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
459
463
  'Lionel Messi'
460
464
  ",qwen2_vl_video_vqa
465
+ 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 10) -> List[float]:
466
+ 'activity_recognition' is a tool that can recognize activities in a video given a
467
+ text prompt. It can be used to identify where specific activities or actions
468
+ happen in a video and returns a list of 0s and 1s to indicate the activity.
469
+
470
+ Parameters:
471
+ prompt (str): The event you want to identify, should be phrased as a question,
472
+ for example, ""Did a goal happen?"".
473
+ frames (List[np.ndarray]): The reference frames used for the question
474
+ model (str): The model to use for the inference. Valid values are
475
+ 'claude-35', 'gpt-4o', 'qwen2vl'.
476
+ chunk_length_frames (int): length of each chunk in frames
477
+
478
+ Returns:
479
+ List[float]: A list of floats with a value of 1.0 if the activity is detected in
480
+ the chunk_length_frames of the video.
481
+
482
+ Example
483
+ -------
484
+ >>> activity_recognition('Did a goal happen?', frames)
485
+ [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
486
+ ",activity_recognition
461
487
  'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
462
488
  'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
463
489
  depth image from a given RGB image. The returned depth image is monochrome and
@@ -514,30 +540,6 @@ desc,doc,name
514
540
  >>> vit_nsfw_classification(image)
515
541
  {""label"": ""normal"", ""scores"": 0.68},
516
542
  ",vit_nsfw_classification
517
- 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
518
- 'video_temporal_localization' will run qwen2vl on each chunk_length_frames
519
- value selected for the video. It can detect multiple objects independently per
520
- chunk_length_frames given a text prompt such as a referring expression
521
- but does not track objects across frames.
522
- It returns a list of floats with a value of 1.0 if the objects are found in a given
523
- chunk_length_frames of the video.
524
-
525
- Parameters:
526
- prompt (str): The question about the video
527
- frames (List[np.ndarray]): The reference frames used for the question
528
- model (str): The model to use for the inference. Valid values are
529
- 'qwen2vl', 'gpt4o'.
530
- chunk_length_frames (int): length of each chunk in frames
531
-
532
- Returns:
533
- List[float]: A list of floats with a value of 1.0 if the objects to be found
534
- are present in the chunk_length_frames of the video.
535
-
536
- Example
537
- -------
538
- >>> video_temporal_localization('Did a goal happened?', frames)
539
- [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
540
- ",video_temporal_localization
541
543
  "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
542
544
  'flux_image_inpainting' performs image inpainting to fill the masked regions,
543
545
  given by mask, in the image, given image based on the text prompt and surrounding
@@ -728,28 +730,3 @@ desc,doc,name
728
730
  }],
729
731
  )
730
732
  ",overlay_segmentation_masks
731
- 'overlay_heat_map' is a utility function that displays a heat map on an image.,"overlay_heat_map(image: numpy.ndarray, heat_map: Dict[str, Any], alpha: float = 0.8) -> numpy.ndarray:
732
- 'overlay_heat_map' is a utility function that displays a heat map on an image.
733
-
734
- Parameters:
735
- image (np.ndarray): The image to display the heat map on.
736
- heat_map (Dict[str, Any]): A dictionary containing the heat map under the key
737
- 'heat_map'.
738
- alpha (float, optional): The transparency of the overlay. Defaults to 0.8.
739
-
740
- Returns:
741
- np.ndarray: The image with the heat map displayed.
742
-
743
- Example
744
- -------
745
- >>> image_with_heat_map = overlay_heat_map(
746
- image,
747
- {
748
- 'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
749
- [0, 0, 0, ..., 0, 0, 0],
750
- ...,
751
- [0, 0, 0, ..., 0, 0, 0],
752
- [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
753
- },
754
- )
755
- ",overlay_heat_map
Binary file
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
2
2
  from pathlib import Path
3
3
  from typing import Any, Dict, List, Optional, Union
4
4
 
5
- from vision_agent.agent.types import (
5
+ from vision_agent.models import (
6
6
  AgentMessage,
7
7
  CodeContext,
8
8
  InteractionContext,
9
+ Message,
9
10
  PlanContext,
10
11
  )
11
- from vision_agent.lmm.types import Message
12
12
  from vision_agent.utils.execute import CodeInterpreter
13
13
 
14
14
 
@@ -6,7 +6,6 @@ from pathlib import Path
6
6
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
7
 
8
8
  from vision_agent.agent import Agent
9
- from vision_agent.agent.agent_utils import extract_json, extract_tag
10
9
  from vision_agent.agent.vision_agent_prompts import (
11
10
  EXAMPLES_CODE1,
12
11
  EXAMPLES_CODE2,
@@ -14,7 +13,8 @@ from vision_agent.agent.vision_agent_prompts import (
14
13
  EXAMPLES_CODE3_EXTRA2,
15
14
  VA_CODE,
16
15
  )
17
- from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
16
+ from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
17
+ from vision_agent.models import Message
18
18
  from vision_agent.tools.meta_tools import (
19
19
  META_TOOL_DOCSTRING,
20
20
  Artifacts,
@@ -22,6 +22,7 @@ from vision_agent.tools.meta_tools import (
22
22
  use_extra_vision_agent_args,
23
23
  )
24
24
  from vision_agent.utils import CodeInterpreterFactory
25
+ from vision_agent.utils.agent import extract_json, extract_tag
25
26
  from vision_agent.utils.execute import CodeInterpreter, Execution
26
27
 
27
28
  logging.basicConfig(level=logging.INFO)
@@ -9,16 +9,6 @@ from tabulate import tabulate
9
9
 
10
10
  import vision_agent.tools as T
11
11
  from vision_agent.agent.agent import Agent
12
- from vision_agent.agent.agent_utils import (
13
- _MAX_TABULATE_COL_WIDTH,
14
- DefaultImports,
15
- extract_code,
16
- extract_tag,
17
- format_feedback,
18
- print_code,
19
- remove_installs_from_code,
20
- strip_function_calls,
21
- )
22
12
  from vision_agent.agent.vision_agent_coder_prompts import (
23
13
  CODE,
24
14
  FIX_BUG,
@@ -32,16 +22,20 @@ from vision_agent.agent.vision_agent_planner import (
32
22
  OpenAIVisionAgentPlanner,
33
23
  PlanContext,
34
24
  )
35
- from vision_agent.lmm import (
36
- LMM,
37
- AnthropicLMM,
38
- AzureOpenAILMM,
39
- Message,
40
- OllamaLMM,
41
- OpenAILMM,
42
- )
25
+ from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
26
+ from vision_agent.models import Message
43
27
  from vision_agent.tools.meta_tools import get_diff
44
28
  from vision_agent.utils import CodeInterpreterFactory, Execution
29
+ from vision_agent.utils.agent import (
30
+ _MAX_TABULATE_COL_WIDTH,
31
+ DefaultImports,
32
+ extract_code,
33
+ extract_tag,
34
+ format_feedback,
35
+ print_code,
36
+ remove_installs_from_code,
37
+ strip_function_calls,
38
+ )
45
39
  from vision_agent.utils.execute import CodeInterpreter
46
40
 
47
41
  logging.basicConfig(stream=sys.stdout)
@@ -490,7 +484,7 @@ class VisionAgentCoder(Agent):
490
484
  tool_info=tool_doc,
491
485
  tool_output=tool_output_str,
492
486
  plan_thoughts=plan_thoughts_str,
493
- tool_utils=T.UTILITIES_DOCSTRING,
487
+ tool_utils=T.get_utilties_docstring(),
494
488
  working_memory=working_memory,
495
489
  coder=self.coder,
496
490
  tester=self.tester,
@@ -5,9 +5,22 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
5
5
  from rich.console import Console
6
6
  from rich.markup import escape
7
7
 
8
- import vision_agent.tools.tools as T
9
8
  from vision_agent.agent import AgentCoder, AgentPlanner
10
- from vision_agent.agent.agent_utils import (
9
+ from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
10
+ from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
11
+ from vision_agent.configs import Config
12
+ from vision_agent.lmm import LMM
13
+ from vision_agent.models import (
14
+ AgentMessage,
15
+ CodeContext,
16
+ InteractionContext,
17
+ Message,
18
+ PlanContext,
19
+ )
20
+ from vision_agent.sim import Sim, get_tool_recommender
21
+ from vision_agent.tools.meta_tools import get_diff
22
+ from vision_agent.tools.tools import get_utilties_docstring
23
+ from vision_agent.utils.agent import (
11
24
  DefaultImports,
12
25
  add_media_to_chat,
13
26
  capture_media_from_exec,
@@ -18,24 +31,11 @@ from vision_agent.agent.agent_utils import (
18
31
  print_code,
19
32
  strip_function_calls,
20
33
  )
21
- from vision_agent.agent.types import (
22
- AgentMessage,
23
- CodeContext,
24
- InteractionContext,
25
- PlanContext,
26
- )
27
- from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
28
- from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
29
- from vision_agent.configs import Config
30
- from vision_agent.lmm import LMM
31
- from vision_agent.lmm.types import Message
32
- from vision_agent.tools.meta_tools import get_diff
33
34
  from vision_agent.utils.execute import (
34
35
  CodeInterpreter,
35
36
  CodeInterpreterFactory,
36
37
  Execution,
37
38
  )
38
- from vision_agent.utils.sim import Sim, get_tool_recommender
39
39
 
40
40
  CONFIG = Config()
41
41
  _CONSOLE = Console()
@@ -207,7 +207,7 @@ def test_code(
207
207
  test = write_test(
208
208
  tester=tester,
209
209
  chat=chat,
210
- tool_util_docs=T.UTILITIES_DOCSTRING,
210
+ tool_util_docs=get_utilties_docstring(),
211
211
  code=code,
212
212
  media_list=media_list,
213
213
  )
@@ -227,7 +227,7 @@ def test_code(
227
227
  while (not result.success or len(result.logs.stdout) == 0) and count < 3:
228
228
  code, test, debug_info = debug_code(
229
229
  debugger,
230
- T.UTILITIES_DOCSTRING + "\n" + tool_docs,
230
+ get_utilties_docstring() + "\n" + tool_docs,
231
231
  plan,
232
232
  code,
233
233
  test,
@@ -9,15 +9,6 @@ from tabulate import tabulate
9
9
 
10
10
  import vision_agent.tools as T
11
11
  from vision_agent.agent import Agent
12
- from vision_agent.agent.agent_utils import (
13
- _MAX_TABULATE_COL_WIDTH,
14
- DefaultImports,
15
- extract_code,
16
- extract_json,
17
- format_feedback,
18
- format_plans,
19
- print_code,
20
- )
21
12
  from vision_agent.agent.vision_agent_planner_prompts import (
22
13
  PICK_PLAN,
23
14
  PLAN,
@@ -25,20 +16,24 @@ from vision_agent.agent.vision_agent_planner_prompts import (
25
16
  TEST_PLANS,
26
17
  USER_REQ,
27
18
  )
28
- from vision_agent.lmm import (
29
- LMM,
30
- AnthropicLMM,
31
- AzureOpenAILMM,
32
- Message,
33
- OllamaLMM,
34
- OpenAILMM,
19
+ from vision_agent.lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
20
+ from vision_agent.models import Message
21
+ from vision_agent.sim import AzureSim, OllamaSim, Sim
22
+ from vision_agent.utils.agent import (
23
+ _MAX_TABULATE_COL_WIDTH,
24
+ DefaultImports,
25
+ extract_code,
26
+ extract_json,
27
+ format_feedback,
28
+ format_plans,
29
+ print_code,
35
30
  )
36
31
  from vision_agent.utils.execute import (
37
32
  CodeInterpreter,
38
33
  CodeInterpreterFactory,
39
34
  Execution,
40
35
  )
41
- from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
36
+ from vision_agent.utils.tools_doc import get_tool_descriptions_by_names
42
37
 
43
38
  _LOGGER = logging.getLogger(__name__)
44
39
 
@@ -348,7 +343,7 @@ class VisionAgentPlanner(Agent):
348
343
  _LOGGER.setLevel(logging.INFO)
349
344
 
350
345
  self.tool_recommender = (
351
- Sim(T.TOOLS_DF, sim_key="desc")
346
+ Sim(T.get_tools_df(), sim_key="desc")
352
347
  if tool_recommender is None
353
348
  else tool_recommender
354
349
  )
@@ -414,7 +409,7 @@ class VisionAgentPlanner(Agent):
414
409
 
415
410
  plans = write_plans(
416
411
  chat,
417
- T.get_tool_descriptions_by_names(
412
+ get_tool_descriptions_by_names(
418
413
  custom_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
419
414
  ),
420
415
  format_feedback(working_memory),
@@ -537,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
537
532
  else planner
538
533
  ),
539
534
  tool_recommender=(
540
- OllamaSim(T.TOOLS_DF, sim_key="desc")
535
+ OllamaSim(T.get_tools_df(), sim_key="desc")
541
536
  if tool_recommender is None
542
537
  else tool_recommender
543
538
  ),
@@ -559,7 +554,7 @@ class AzureVisionAgentPlanner(VisionAgentPlanner):
559
554
  super().__init__(
560
555
  planner=(AzureOpenAILMM(temperature=0.0) if planner is None else planner),
561
556
  tool_recommender=(
562
- AzureSim(T.TOOLS_DF, sim_key="desc")
557
+ AzureSim(T.get_tools_df(), sim_key="desc")
563
558
  if tool_recommender is None
564
559
  else tool_recommender
565
560
  ),
@@ -9,21 +9,22 @@ PLAN = """
9
9
  **Example Planning**: Here are some examples of how you can search for a plan, in the examples the user output is denoted by USER, your output is denoted by AGENT and the observations after your code execution are denoted by OBSERVATION:
10
10
  {examples}
11
11
 
12
- **Current Planning**:
13
- --- START PLANNING ---
12
+ **Current Planning**: This is the plan you are currently working on
13
+ --- START CURRENT PLANNING ---
14
14
  {planning}
15
- --- END PLANNING ---
15
+ --- END CURRENT PLANNING ---
16
16
 
17
17
  **Instructions**:
18
18
  1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
19
19
  2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
20
20
  3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
21
21
  4. Only load/save files from {media_list} unless you specifically saved the file previously.
22
- 5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
22
+ 5. Ensure you always call `suggestion` and `claude35_vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
23
23
  6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
24
- 7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
25
- 8. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool.
26
- 9. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
24
+ 7. Be sure to print results returned for tools so you can see the output.
25
+ 8. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
26
+ 9. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool output, tools are often better at things like counting and detecting small objects.
27
+ 10. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
27
28
 
28
29
  <thinking>Your thought process...</thinking>
29
30
  <execute_python>Your code here</execute_python>
@@ -334,23 +335,21 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5])
334
335
 
335
336
  OBSERVATION:
336
337
  [get_tool_for_task output]
337
- For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
338
+ For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: countgd_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
338
339
 
339
340
  Tool Documentation:
340
- def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
341
- 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
341
+ def countgd_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
342
+ 'countgd_sam2_video_tracking' is a tool that can track and segment multiple
342
343
  objects in a video given a text prompt such as category names or referring
343
344
  expressions. The categories in the text prompt are separated by commas. It returns
344
345
  a list of bounding boxes, label names, masks and associated probability scores and
345
346
  is useful for tracking and counting without duplicating counts.
346
347
 
347
348
  Parameters:
348
- prompt (str): The prompt to ground to the video.
349
+ prompt (str): The prompt to ground to the image.
349
350
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
350
- chunk_length (Optional[int]): The number of frames to re-run florence2 to find
351
+ chunk_length (Optional[int]): The number of frames to re-run countgd to find
351
352
  new objects.
352
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
353
- fine-tuned model ID here to use it.
354
353
 
355
354
  Returns:
356
355
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -364,7 +363,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
364
363
 
365
364
  Example
366
365
  -------
367
- >>> florence2_sam2_video_tracking("car, dinosaur", frames)
366
+ >>> countgd_sam2_video_tracking("car, dinosaur", frames)
368
367
  [
369
368
  [
370
369
  {
@@ -382,7 +381,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
382
381
 
383
382
  AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
384
383
  <execute_python>
385
- track_predictions = florence2_sam2_video_tracking("box", frames)
384
+ track_predictions = countgd_sam2_video_tracking("box", frames)
386
385
 
387
386
  # only use first 5 frames to reduce processing time
388
387
  crops = []
@@ -512,16 +511,16 @@ PICK_PLAN = """
512
511
 
513
512
  CATEGORIZE_TOOL_REQUEST = """
514
513
  You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
514
+ - "VQA" - answering questions about an image or video, can be used for most tasks, should generally be included.
515
515
  - "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
516
516
  - "instance segmentation" - segmenting objects in an image given a text prompt.
517
517
  - "classification" - classifying objects in an image given a text prompt.
518
518
  - "segmentation" - segmenting objects in an image or video given a text prompt.
519
519
  - "OCR" - extracting text from an image.
520
- - "VQA" - answering questions about an image or video, can also be used for text extraction.
521
520
  - "DocQA" - answering questions about a document or extracting information from a document.
522
521
  - "video object tracking" - tracking objects in a video.
523
522
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.
524
- - "temporal localization" - localizing the time period an event occurs in a video.
523
+ - "activity recognition" - identifying time period(s) an event occurs in a video.
525
524
  - "inpainting" - filling in masked parts of an image.
526
525
 
527
526
  Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than fewer.
@@ -718,7 +717,7 @@ PICK_TOOL = """
718
717
  FINALIZE_PLAN = """
719
718
  **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
720
719
 
721
- **Documentation**: You can use these tools to help you visualize or save the output:
720
+ **Documentation**: You can use these tools to help you visualize or save the output (they are imported `from vision_agent.tools import *`):
722
721
  {tool_desc}
723
722
 
724
723
  **Planning**: Here is the chain of thoughts, executions and observations from the planning agent:
@@ -730,7 +729,7 @@ FINALIZE_PLAN = """
730
729
  3. Only use tools obtained from calling `get_tool_for_task`.
731
730
  4. Do not include {excluded_tools} tools in your instructions.
732
731
  5. Ensure the function is well documented and easy to understand.
733
- 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
732
+ 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentation masks are produced, and save it to a file with `save_image` or `save_video`.
734
733
  7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
735
734
  8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
736
735
  9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
@@ -13,17 +13,6 @@ from rich.markup import escape
13
13
  import vision_agent.tools as T
14
14
  import vision_agent.tools.planner_tools as pt
15
15
  from vision_agent.agent import AgentPlanner
16
- from vision_agent.agent.agent_utils import (
17
- add_media_to_chat,
18
- capture_media_from_exec,
19
- convert_message_to_agentmessage,
20
- extract_json,
21
- extract_tag,
22
- print_code,
23
- print_table,
24
- remove_installs_from_code,
25
- )
26
- from vision_agent.agent.types import AgentMessage, InteractionContext, PlanContext
27
16
  from vision_agent.agent.vision_agent_planner_prompts_v2 import (
28
17
  CRITIQUE_PLAN,
29
18
  EXAMPLE_PLAN1,
@@ -34,17 +23,29 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
34
23
  PLAN,
35
24
  )
36
25
  from vision_agent.configs import Config
37
- from vision_agent.lmm import LMM, Message
38
- from vision_agent.tools.planner_tools import check_function_call, get_tool_documentation
26
+ from vision_agent.lmm import LMM
27
+ from vision_agent.models import AgentMessage, InteractionContext, Message, PlanContext
28
+ from vision_agent.tools.planner_tools import check_function_call
29
+ from vision_agent.utils.agent import (
30
+ add_media_to_chat,
31
+ capture_media_from_exec,
32
+ convert_message_to_agentmessage,
33
+ extract_json,
34
+ extract_tag,
35
+ print_code,
36
+ print_table,
37
+ remove_installs_from_code,
38
+ )
39
39
  from vision_agent.utils.execute import (
40
40
  CodeInterpreter,
41
41
  CodeInterpreterFactory,
42
42
  Execution,
43
43
  )
44
+ from vision_agent.utils.tools_doc import get_tool_documentation
44
45
 
45
46
  logging.basicConfig(level=logging.INFO)
46
47
  CONFIG = Config()
47
- UTIL_DOCSTRING = T.get_tool_documentation(
48
+ UTIL_DOCSTRING = get_tool_documentation(
48
49
  [
49
50
  T.load_image,
50
51
  T.extract_frames_and_timestamps,
@@ -360,6 +361,16 @@ def get_steps(chat: List[AgentMessage], max_steps: int) -> int:
360
361
  return max_steps
361
362
 
362
363
 
364
+ def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
365
+ return_str = "[get_tool_for_task output]\n"
366
+ if tool_thoughts.strip() != "":
367
+ return_str += f"{tool_thoughts}\n\n"
368
+ return_str += (
369
+ f"Tool Documentation:\n{tool_docstring}\n[end of get_tool_for_task output]\n"
370
+ )
371
+ return return_str
372
+
373
+
363
374
  def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]:
364
375
  chat = copy.deepcopy(chat)
365
376
  new_chat = []
@@ -371,7 +382,10 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
371
382
  try:
372
383
  response = json.loads(chat[i + 1].content)
373
384
  function_name = response["function_name"]
374
- tool_doc = get_tool_documentation(function_name)
385
+ tools_df = T.get_tools_df()
386
+ tool_doc = format_tool_output(
387
+ "", tools_df[tools_df["name"] == function_name]["doc"].values[0]
388
+ )
375
389
  if "box_threshold" in response:
376
390
  tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
377
391
  new_chat.append(AgentMessage(role="observation", content=tool_doc))