vision-agent 1.0.4__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {vision_agent-1.0.4 → vision_agent-1.0.7}/PKG-INFO +31 -3
  2. {vision_agent-1.0.4 → vision_agent-1.0.7}/README.md +30 -2
  3. {vision_agent-1.0.4 → vision_agent-1.0.7}/pyproject.toml +1 -1
  4. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/.sim_tools/df.csv +46 -47
  5. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/.sim_tools/embs.npy +0 -0
  6. vision_agent-1.0.7/vision_agent/agent/__init__.py +4 -0
  7. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
  8. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_planner_v2.py +3 -2
  9. vision_agent-1.0.4/vision_agent/configs/anthropic_openai_config.py → vision_agent-1.0.7/vision_agent/configs/anthropic_config.py +12 -13
  10. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/config.py +14 -15
  11. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/openai_config.py +10 -10
  12. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/lmm/lmm.py +2 -2
  13. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/__init__.py +0 -6
  14. vision_agent-1.0.7/vision_agent/tools/meta_tools.py +200 -0
  15. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/planner_tools.py +13 -14
  16. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/tools.py +16 -27
  17. vision_agent-1.0.4/vision_agent/agent/__init__.py +0 -20
  18. vision_agent-1.0.4/vision_agent/agent/vision_agent.py +0 -605
  19. vision_agent-1.0.4/vision_agent/agent/vision_agent_coder.py +0 -742
  20. vision_agent-1.0.4/vision_agent/agent/vision_agent_coder_prompts.py +0 -290
  21. vision_agent-1.0.4/vision_agent/agent/vision_agent_planner.py +0 -564
  22. vision_agent-1.0.4/vision_agent/agent/vision_agent_planner_prompts.py +0 -199
  23. vision_agent-1.0.4/vision_agent/agent/vision_agent_prompts.py +0 -312
  24. vision_agent-1.0.4/vision_agent/configs/anthropic_config.py +0 -150
  25. vision_agent-1.0.4/vision_agent/tools/meta_tools.py +0 -691
  26. {vision_agent-1.0.4 → vision_agent-1.0.7}/LICENSE +0 -0
  27. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/__init__.py +0 -0
  28. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/README.md +0 -0
  29. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/agent.py +0 -0
  30. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  31. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  32. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  33. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/agent/vision_agent_v2.py +0 -0
  34. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/clients/__init__.py +0 -0
  35. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/clients/http.py +0 -0
  36. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/configs/__init__.py +0 -0
  37. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/fonts/__init__.py +0 -0
  38. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  39. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/lmm/__init__.py +0 -0
  40. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/__init__.py +0 -0
  41. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/agent_types.py +0 -0
  42. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/lmm_types.py +0 -0
  43. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/models/tools_types.py +0 -0
  44. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/sim/__init__.py +0 -0
  45. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/sim/sim.py +0 -0
  46. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/tools/prompts.py +0 -0
  47. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/__init__.py +0 -0
  48. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/agent.py +0 -0
  49. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/exceptions.py +0 -0
  50. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/execute.py +0 -0
  51. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/image_utils.py +0 -0
  52. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/tools.py +0 -0
  53. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/tools_doc.py +0 -0
  54. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/video.py +0 -0
  55. {vision_agent-1.0.4 → vision_agent-1.0.7}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 1.0.4
3
+ Version: 1.0.7
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -65,10 +65,10 @@ pip install vision-agent
65
65
 
66
66
  ```bash
67
67
  export ANTHROPIC_API_KEY="your-api-key"
68
- export OPENAI_API_KEY="your-api-key"
68
+ export GEMINI_API_KEY="your-api-key"
69
69
  ```
70
70
 
71
- > **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
71
+ > **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
72
72
 
73
73
  You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
74
74
 
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
147
147
  cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
148
148
  ```
149
149
 
150
+ You can also modify the existing `config.py` file yourself to use a different LLM
151
+ provider, for example if you wanted to change the planner from Anthropic inside
152
+ `config.py` to OpenAI you would replace this code:
153
+ ```python
154
+ planner: Type[LMM] = Field(default=AnthropicLMM)
155
+ planner_kwargs: dict = Field(
156
+ default_factory=lambda: {
157
+ "model_name": "claude-3-7-sonnet-20250219",
158
+ "temperature": 0.0,
159
+ "image_size": 768,
160
+ }
161
+ )
162
+ ```
163
+
164
+ with this code:
165
+
166
+ ```python
167
+ planner: Type[LMM] = Field(default=OpenAILMM)
168
+ planner_kwargs: dict = Field(
169
+ default_factory=lambda: {
170
+ "model_name": "gpt-4o-2024-11-20",
171
+ "temperature": 0.0,
172
+ "image_size": 768,
173
+ "image_detail": "low",
174
+ }
175
+ )
176
+ ```
177
+
150
178
  > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
151
179
 
@@ -23,10 +23,10 @@ pip install vision-agent
23
23
 
24
24
  ```bash
25
25
  export ANTHROPIC_API_KEY="your-api-key"
26
- export OPENAI_API_KEY="your-api-key"
26
+ export GEMINI_API_KEY="your-api-key"
27
27
  ```
28
28
 
29
- > **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
29
+ > **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
30
30
 
31
31
  You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
32
32
 
@@ -105,4 +105,32 @@ directory. For example to change to Anthropic simply just run:
105
105
  cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
106
106
  ```
107
107
 
108
+ You can also modify the existing `config.py` file yourself to use a different LLM
109
+ provider, for example if you wanted to change the planner from Anthropic inside
110
+ `config.py` to OpenAI you would replace this code:
111
+ ```python
112
+ planner: Type[LMM] = Field(default=AnthropicLMM)
113
+ planner_kwargs: dict = Field(
114
+ default_factory=lambda: {
115
+ "model_name": "claude-3-7-sonnet-20250219",
116
+ "temperature": 0.0,
117
+ "image_size": 768,
118
+ }
119
+ )
120
+ ```
121
+
122
+ with this code:
123
+
124
+ ```python
125
+ planner: Type[LMM] = Field(default=OpenAILMM)
126
+ planner_kwargs: dict = Field(
127
+ default_factory=lambda: {
128
+ "model_name": "gpt-4o-2024-11-20",
129
+ "temperature": 0.0,
130
+ "image_size": 768,
131
+ "image_detail": "low",
132
+ }
133
+ )
134
+ ```
135
+
108
136
  > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "1.0.4"
7
+ version = "1.0.7"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -1,15 +1,15 @@
1
1
  desc,doc,name
2
- "'owlv2_object_detection' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","owlv2_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
3
- 'owlv2_object_detection' is a tool that can detect and count multiple objects
4
- given a text prompt such as category names or referring expressions on images. The
5
- categories in text prompt are separated by commas. It returns a list of bounding
6
- boxes with normalized coordinates, label names and associated probability scores.
2
+ "'glee_object_detection' is a tool that can detect multiple objects given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores.","glee_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
3
+ 'glee_object_detection' is a tool that can detect multiple objects given a
4
+ text prompt such as object names or referring expressions on images. It's
5
+ particularly good at detecting specific objects given detailed descriptive prompts.
6
+ It returns a list of bounding boxes with normalized coordinates, label names and
7
+ associated probability scores.
7
8
 
8
9
  Parameters:
9
- prompt (str): The prompt to ground to the image.
10
+ prompt (str): The prompt to ground to the image, only supports a single prompt
11
+ with no commas or periods.
10
12
  image (np.ndarray): The image to ground the prompt to.
11
- box_threshold (float, optional): The threshold for the box detection. Defaults
12
- to 0.10.
13
13
 
14
14
  Returns:
15
15
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -20,24 +20,23 @@ desc,doc,name
20
20
 
21
21
  Example
22
22
  -------
23
- >>> owlv2_object_detection(""car, dinosaur"", image)
23
+ >>> glee_object_detection(""person holding a box"", image)
24
24
  [
25
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
26
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
25
+ {'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
26
+ {'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5},
27
27
  ]
28
- ",owlv2_object_detection
29
- "'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple instances of objects given a text prompt such as category names or referring expressions on images. The categories in text prompt are separated by commas. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","owlv2_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.1) -> List[Dict[str, Any]]:
30
- 'owlv2_sam2_instance_segmentation' is a tool that can detect and count multiple
31
- instances of objects given a text prompt such as category names or referring
32
- expressions on images. The categories in text prompt are separated by commas. It
33
- returns a list of bounding boxes with normalized coordinates, label names, masks
34
- and associated probability scores.
28
+ ",glee_object_detection
29
+ "'glee_sam2_instance_segmentation' is a tool that can detect multiple instances given a text prompt such as object names or referring expressions on images. It's particularly good at detecting specific objects given detailed descriptive prompts. It returns a list of bounding boxes with normalized coordinates, label names, masks and associated probability scores.","glee_sam2_instance_segmentation(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
30
+ 'glee_sam2_instance_segmentation' is a tool that can detect multiple
31
+ instances given a text prompt such as object names or referring expressions on
32
+ images. It's particularly good at detecting specific objects given detailed
33
+ descriptive prompts. It returns a list of bounding boxes with normalized
34
+ coordinates, label names, masks and associated probability scores.
35
35
 
36
36
  Parameters:
37
- prompt (str): The object that needs to be counted.
37
+ prompt (str): The object that needs to be counted, only supports a single
38
+ prompt with no commas or periods.
38
39
  image (np.ndarray): The image that contains multiple instances of the object.
39
- box_threshold (float, optional): The threshold for detection. Defaults
40
- to 0.10.
41
40
 
42
41
  Returns:
43
42
  List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -49,11 +48,11 @@ desc,doc,name
49
48
 
50
49
  Example
51
50
  -------
52
- >>> owlv2_sam2_instance_segmentation(""flower"", image)
51
+ >>> glee_sam2_instance_segmentation(""a large blue flower"", image)
53
52
  [
54
53
  {
55
54
  'score': 0.49,
56
- 'label': 'flower',
55
+ 'label': 'a large blue flower',
57
56
  'bbox': [0.1, 0.11, 0.35, 0.4],
58
57
  'mask': array([[0, 0, 0, ..., 0, 0, 0],
59
58
  [0, 0, 0, ..., 0, 0, 0],
@@ -62,21 +61,21 @@ desc,doc,name
62
61
  [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
63
62
  },
64
63
  ]
65
- ",owlv2_sam2_instance_segmentation
66
- "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.1, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
67
- 'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
68
- objects in a video given a text prompt such as category names or referring
69
- expressions. The categories in the text prompt are separated by commas. It returns
70
- a list of bounding boxes, label names, masks and associated probability scores and
71
- is useful for tracking and counting without duplicating counts.
64
+ ",glee_sam2_instance_segmentation
65
+ "'glee_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as object names or referring expressions. It's particularly good at detecting specific objects given detailed descriptive prompts and returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","glee_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], box_threshold: float = 0.23, chunk_length: Optional[int] = 25) -> List[List[Dict[str, Any]]]:
66
+ 'glee_sam2_video_tracking' is a tool that can track and segment multiple
67
+ objects in a video given a text prompt such as object names or referring
68
+ expressions. It's particularly good at detecting specific objects given detailed
69
+ descriptive prompts and returns a list of bounding boxes, label names, masks and
70
+ associated probability scores and is useful for tracking and counting without
71
+ duplicating counts.
72
72
 
73
73
  Parameters:
74
- prompt (str): The prompt to ground to the image.
74
+ prompt (str): The prompt to ground to the image, only supports a single prompt
75
+ with no commas or periods.
75
76
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
76
- box_threshold (float, optional): The threshold for the box detection. Defaults
77
- to 0.10.
78
- chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
79
- new objects.
77
+ chunk_length (Optional[int]): The number of frames to re-run agentic object detection to
78
+ to find new objects.
80
79
 
81
80
  Returns:
82
81
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
@@ -90,11 +89,11 @@ desc,doc,name
90
89
 
91
90
  Example
92
91
  -------
93
- >>> owlv2_sam2_video_tracking(""car, dinosaur"", frames)
92
+ >>> glee_sam2_video_tracking(""a runner with yellow shoes"", frames)
94
93
  [
95
94
  [
96
95
  {
97
- 'label': '0: dinosaur',
96
+ 'label': '0: a runner with yellow shoes',
98
97
  'bbox': [0.1, 0.11, 0.35, 0.4],
99
98
  'mask': array([[0, 0, 0, ..., 0, 0, 0],
100
99
  [0, 0, 0, ..., 0, 0, 0],
@@ -105,7 +104,7 @@ desc,doc,name
105
104
  ],
106
105
  ...
107
106
  ]
108
- ",owlv2_sam2_video_tracking
107
+ ",glee_sam2_video_tracking
109
108
  "'countgd_object_detection' is a tool that can detect multiple instances of an object given a text prompt. It is particularly useful when trying to detect and count a large number of objects. You can optionally separate object names in the prompt with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores.","countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
110
109
  'countgd_object_detection' is a tool that can detect multiple instances of an
111
110
  object given a text prompt. It is particularly useful when trying to detect and
@@ -417,8 +416,8 @@ desc,doc,name
417
416
  {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
418
417
  ]
419
418
  ",ocr
420
- 'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen2_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
421
- 'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
419
+ 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It can be very useful for document QA or OCR text extraction. It returns text as an answer to the question.,"qwen25_vl_images_vqa(prompt: str, images: List[numpy.ndarray]) -> str:
420
+ 'qwen25_vl_images_vqa' is a tool that can answer any questions about arbitrary
422
421
  images including regular images or images of documents or presentations. It can be
423
422
  very useful for document QA or OCR text extraction. It returns text as an answer to
424
423
  the question.
@@ -432,11 +431,11 @@ desc,doc,name
432
431
 
433
432
  Example
434
433
  -------
435
- >>> qwen2_vl_images_vqa('Give a summary of the document', images)
434
+ >>> qwen25_vl_images_vqa('Give a summary of the document', images)
436
435
  'The document talks about the history of the United States of America and its...'
437
- ",qwen2_vl_images_vqa
438
- 'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen2_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
439
- 'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
436
+ ",qwen25_vl_images_vqa
437
+ 'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos including regular videos or videos of documents or presentations. It returns text as an answer to the question.,"qwen25_vl_video_vqa(prompt: str, frames: List[numpy.ndarray]) -> str:
438
+ 'qwen25_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
440
439
  including regular videos or videos of documents or presentations. It returns text
441
440
  as an answer to the question.
442
441
 
@@ -449,10 +448,10 @@ desc,doc,name
449
448
 
450
449
  Example
451
450
  -------
452
- >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
451
+ >>> qwen25_vl_video_vqa('Which football player made the goal?', frames)
453
452
  'Lionel Messi'
454
- ",qwen2_vl_video_vqa
455
- 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 10) -> List[float]:
453
+ ",qwen25_vl_video_vqa
454
+ 'activity_recognition' is a tool that can recognize activities in a video given a text prompt. It can be used to identify where specific activities or actions happen in a video and returns a list of 0s and 1s to indicate the activity.,"activity_recognition(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen25vl', chunk_length_frames: int = 10) -> List[float]:
456
455
  'activity_recognition' is a tool that can recognize activities in a video given a
457
456
  text prompt. It can be used to identify where specific activities or actions
458
457
  happen in a video and returns a list of 0s and 1s to indicate the activity.
@@ -0,0 +1,4 @@
1
+ from .agent import Agent, AgentCoder, AgentPlanner
2
+ from .vision_agent_coder_v2 import VisionAgentCoderV2
3
+ from .vision_agent_planner_v2 import VisionAgentPlannerV2
4
+ from .vision_agent_v2 import VisionAgentV2
@@ -9,21 +9,21 @@ PLAN = """
9
9
  **Example Planning**: Here are some examples of how you can search for a plan, in the examples the user output is denoted by USER, your output is denoted by AGENT and the observations after your code execution are denoted by OBSERVATION:
10
10
  {examples}
11
11
 
12
- **Current Planning**: This is the plan you are currently working on
12
+ **Current Planning**: Below are your previous thoughts, code and observations from the planning process:
13
13
  --- START CURRENT PLANNING ---
14
14
  {planning}
15
15
  --- END CURRENT PLANNING ---
16
16
 
17
17
  **Instructions**:
18
- 1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You can <count> number of turns to complete the user's request.
18
+ 1. Read over the user request and context provided and output <thinking> tags to indicate your thought process. You have <count> number of turns to complete the user's request.
19
19
  2. You can execute python code in the ipython notebook using <execute_python> tags. Only output one <execute_python> tag at a time.
20
- 3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
20
+ 3. Output <finalize_plan> when you have written your final code, you do not need to wait until all turns are used to do this. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION.
21
21
  4. Only load/save files from {media_list} unless you specifically saved the file previously.
22
- 5. Ensure you always call `suggestion` and `claude35_vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
22
+ 5. Ensure you always call `suggestion` and `vqa` initially and `get_tool_for_task` to get the right tool for the subtask.
23
23
  6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
24
24
  7. Be sure to print results returned for tools so you can see the output.
25
25
  8. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
26
- 9. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool output, tools are often better at things like counting and detecting small objects.
26
+ 9. DO NOT over index on the vqa model. If tool output is close to vqa's output do not improve the tool output. If you have >10 detected objects, or the objects are very small, trust the tool output over the vqa model.
27
27
  10. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
28
28
 
29
29
  <thinking>Your thought process...</thinking>
@@ -39,13 +39,13 @@ USER: Count the number of pedestrians in the image.
39
39
  AGENT: <thinking>I need to gather more information, I will ask for a description of the image and a solution to the problem to work towards as well as a suggestion on how best to solve it.</thinking>
40
40
  <execute_python>
41
41
  image = load_image('drone.jpg')
42
- claude35_vqa('Can you describe this image? How many pedestrians do you count in the image?', [image])
42
+ vqa('Can you describe this image? How many pedestrians do you count in the image?', [image])
43
43
  suggestion('How can I count the number of pedestrians in the image?', [image])
44
44
  </execute_python>
45
45
 
46
- OBSERVATION: [claude35_vqa output]
46
+ OBSERVATION: [vqa output]
47
47
  From this aerial view of a busy urban street, it's difficult to clearly see or count individual pedestrians. The image shows a bird's eye view of a city intersection with multiple lanes of traffic, parked cars, sidewalks, and some green spaces. While there may be people in cars or on the sidewalks, they are not distinctly visible from this height and perspective. The focus of the image is more on the urban infrastructure, traffic patterns, and overall city layout rather than on individuals.
48
- [end of claude35_vqa output]
48
+ [end of vqa output]
49
49
 
50
50
  [suggestions]
51
51
  [suggestion 0]
@@ -161,7 +161,7 @@ detections = merge_bounding_box_list(detection_from_subdivided_images)
161
161
  [end of suggestion]
162
162
  <count>9</count>
163
163
 
164
- AGENT: <thinking>Following the suggested plan, I will subdivide the image into smaller parts and find the right tool on the smaller parts. I will also re-check with claude35_vqa on the subdivided images.</thinking>
164
+ AGENT: <thinking>Following the suggested plan, I will subdivide the image into smaller parts and find the right tool on the smaller parts. I will also re-check with vqa on the subdivided images.</thinking>
165
165
  <execute_python>
166
166
  def subdivide_image(image):
167
167
  height, width, _ = image.shape
@@ -176,14 +176,14 @@ def subdivide_image(image):
176
176
  return [top_left, top_right, bottom_left, bottom_right]
177
177
 
178
178
  images = subdivide_image(image)
179
- claude35_vqa('How many pedestrians do you see in these images?', images)
179
+ vqa('How many pedestrians do you see in these images?', images)
180
180
  get_tool_for_task('Count the number of pedestrians in the images', images)
181
181
  </execute_python>
182
182
 
183
183
  OBSERVATION:
184
- [claude35_vqa_output]
184
+ [vqa_output]
185
185
  In these aerial images, I can see approximately 5-6 pedestrians walking in various locations, including near the retail signage are and along the sidewalks.
186
- [end of claude35_vqa_output]
186
+ [end of vqa_output]
187
187
 
188
188
  [get_tool_for_task output]
189
189
  After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool with the prompt 'pedistrian' to count the number of pedestrians in the image.
@@ -269,7 +269,7 @@ print(len(detections))
269
269
  OBSERVATION: 24
270
270
  <count>7</count>
271
271
 
272
- AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from claude35_vqa's answer of 5-6 pedestrians. I will use countgd_object_detection's answer since it is specifically designed to count objects.</thinking>
272
+ AGENT: <thinking>countgd_object_detection detected 24 pedestrians in the image by subdividing the image into smaller regions, this differs from vqa model's answer of 5-6 pedestrians. I will use countgd_object_detection's answer since it is specifically designed to count objects.</thinking>
273
273
  <finalize_plan>This plan successfully detected 24 pedestrians in the image by subdividing the image into smaller regions and using the countgd_object_detection tool on each region. The bounding boxes were then merged to count the unique pedestrians.</finalize_plan>
274
274
  --- END EXAMPLE1 ---
275
275
  """
@@ -283,13 +283,13 @@ AGENT: <thinking>I need to gather more information, I will ask for a description
283
283
  <execute_python>
284
284
  frames = extract_frames_and_timestamps('video.mp4', 1)
285
285
  frames = [f["frame"] for f in frames]
286
- claude35_vqa('Can you describe this video? How many boxes are there and do they have tape on them?', frames[:5])
286
+ vqa('Can you describe this video? How many boxes are there and do they have tape on them?', frames[:5])
287
287
  suggestion('How can I identify boxes with tape on them in the video?', frames[:5])
288
288
  </execute_python>
289
289
 
290
- OBSERVATION: [claude35_vqa output]
290
+ OBSERVATION: [vqa output]
291
291
  In this video, there appear to be 2 cardboard boxes moving along a roller conveyor system. The boxes are brown/kraft colored and appear to have tape sealing them. You can see clear packing tape on the top and sides of the boxes.
292
- [end of claude35_vqa output]
292
+ [end of vqa output]
293
293
 
294
294
  [suggestions]
295
295
  [suggestion 0]
@@ -400,20 +400,18 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
400
400
 
401
401
  OBSERVATION:
402
402
  [get_tool_for_task output]
403
- owlv2_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
403
+ glee_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
404
404
 
405
- 'owlv2_object_detection' is a tool that can detect and count multiple objects given a
406
- text prompt such as category names or referring expressions on images. The categories
407
- in text prompt are separated by commas. It returns a list of bounding boxes with
408
- normalized coordinates, label names and associated probability scores.
405
+ 'glee_object_detection' is a tool that can detect multiple objects given a
406
+ text prompt such as object names or referring expressions on images. It's
407
+ particularly good at detecting specific objects given detailed descriptive prompts.
408
+ It returns a list of bounding boxes with normalized coordinates, label names and
409
+ associated probability scores.
409
410
 
410
411
  Parameters:
411
- prompt (str): The prompt to ground to the image.
412
+ prompt (str): The prompt to ground to the image, only supports a single prompt
413
+ with no commas or periods.
412
414
  image (np.ndarray): The image to ground the prompt to.
413
- box_threshold (float, optional): The threshold for the box detection. Defaults
414
- to 0.10.
415
- fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
416
- fine-tuned model ID here to use it.
417
415
 
418
416
  Returns:
419
417
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -424,10 +422,10 @@ Returns:
424
422
 
425
423
  Example
426
424
  -------
427
- >>> owlv2_object_detection("car, dinosaur", image)
425
+ >>> glee_object_detection("person holding a box", image)
428
426
  [
429
- {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
430
- {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
427
+ {'score': 0.99, 'label': 'person holding a box', 'bbox': [0.1, 0.11, 0.35, 0.4]},
428
+ {'score': 0.98, 'label': 'person holding a box', 'bbox': [0.2, 0.21, 0.45, 0.5]},
431
429
  ]
432
430
  [end of get_tool_for_task output]
433
431
  <count>7</count>
@@ -445,7 +443,7 @@ for frame, frame_predictions in zip(frames, track_predictions):
445
443
  int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
446
444
  :,
447
445
  ]
448
- detections = owlv2_object_detection("tape", crop)
446
+ detections = glee_object_detection("tape", crop)
449
447
  obj_to_info[obj["label"]].extend(detections)
450
448
 
451
449
 
@@ -512,7 +510,7 @@ PICK_PLAN = """
512
510
  CATEGORIZE_TOOL_REQUEST = """
513
511
  You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
514
512
  - "VQA" - answering questions about an image or video, can be used for most tasks, should generally be included.
515
- - "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
513
+ - "object detection" - detecting objects or counting objects from a text prompt in an image.
516
514
  - "instance segmentation" - segmenting objects in an image given a text prompt.
517
515
  - "classification" - classifying objects in an image given a text prompt.
518
516
  - "segmentation" - segmenting objects in an image or video given a text prompt.
@@ -569,27 +567,27 @@ Count the number of pedestrians across all the images.
569
567
 
570
568
  <code>
571
569
  from concurrent.futures import ThreadPoolExecutor, as_completed
572
- from vision_agent.tools import load_image, owlv2_object_detection, florence2_object_detection, countgd_object_detection
570
+ from vision_agent.tools import load_image, agentic_object_detection, glee_object_detection, countgd_object_detection
573
571
 
574
572
  # process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
575
- def process_owlv2(image_paths):
573
+ def process_agentic(image_paths):
576
574
  try:
577
575
  results = []
578
576
  for image_path in image_paths:
579
577
  image = load_image(image_path)
580
- results.extend(owlv2_object_detection("person", image))
578
+ results.extend(agentic_object_detection("person", image))
581
579
  except Exception as e:
582
- results = f"Encountered error when executing process_owlv2: {str(e)}"
580
+ results = f"Encountered error when executing process_agentic: {str(e)}"
583
581
  return results
584
582
 
585
- def process_florence2(image_paths):
583
+ def process_glee(image_paths):
586
584
  try:
587
585
  results = []
588
586
  for image_path in image_paths:
589
587
  image = load_image(image_path)
590
- results.extend(florence2_object_detection("person", image))
588
+ results.extend(glee_object_detection("person", image))
591
589
  except Exception as e:
592
- results = f"Encountered error when executing process_florence2: {str(e)}"
590
+ results = f"Encountered error when executing process_glee: {str(e)}"
593
591
  return results
594
592
 
595
593
  def process_countgd(image_paths):
@@ -606,8 +604,8 @@ image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
606
604
 
607
605
  with ThreadPoolExecutor() as executor:
608
606
  futures = {{
609
- executor.submit(process_owlv2, image_paths): "owlv2_object_detection",
610
- executor.submit(process_florence2, image_paths): "florence2_phrase_grounding",
607
+ executor.submit(process_agentic, image_paths): "agentic_object_detection",
608
+ executor.submit(process_glee, image_paths): "glee_object_detection",
611
609
  executor.submit(process_countgd, image_paths): "countgd_object_detection",
612
610
  }}
613
611
 
@@ -632,7 +630,7 @@ Count the number of people in the video.
632
630
  <code>
633
631
  import numpy as np
634
632
  from concurrent.futures import ThreadPoolExecutor, as_completed
635
- from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking, florence2_sam2_video_tracking
633
+ from vision_agent.tools import extract_frames_and_timestamps, countgd_sam2_video_tracking, glee_sam2_video_tracking
636
634
 
637
635
  # sample at 1 FPS and use the first 10 frames to reduce processing time
638
636
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -649,18 +647,18 @@ def remove_arrays(o):
649
647
  else:
650
648
  return o
651
649
 
652
- def process_owlv2_sam2_video_tracking(frames):
650
+ def process_countgd_sam2_video_tracking(frames):
653
651
  try:
654
652
  # run with chunk_length=1 to ensure best results
655
- results = owlv2_sam2_video_tracking("person", frames, chunk_length=1)
653
+ results = countgd_sam2_video_tracking("person", frames, chunk_length=1)
656
654
  except Exception as e:
657
- results = f"Encountered error when executing process_owlv2_sam2_video_tracking: {str(e)}"
655
+ results = f"Encountered error when executing process_countgd_sam2_video_tracking: {str(e)}"
658
656
  return results
659
657
 
660
- def process_florence2_sam2_video_tracking(frames):
658
+ def process_glee_sam2_video_tracking(frames):
661
659
  try:
662
660
  # run with chunk_length=1 to ensure best results
663
- results = florence2_sam2_video_tracking("person", frames, chunk_length=1)
661
+ results = glee_sam2_video_tracking("person", frames, chunk_length=1)
664
662
  except Exception as e:
665
663
  results = f"Encountered error when executing process_glee_sam2_video_tracking: {str(e)}"
666
664
  return results
@@ -668,8 +666,8 @@ def process_florence2_sam2_video_tracking(frames):
668
666
 
669
667
  with ThreadPoolExecutor() as executor:
670
668
  futures = {{
671
- executor.submit(process_owlv2_sam2_video_tracking, frames): "owlv2_sam2_video_tracking",
672
- executor.submit(process_florence2_sam2_video_tracking, frames): "florence2_sam2_video_tracking",
669
+ executor.submit(process_countgd_sam2_video_tracking, frames): "countgd_sam2_video_tracking",
670
+ executor.submit(process_glee_sam2_video_tracking, frames): "glee_sam2_video_tracking",
673
671
  }}
674
672
  final_results = {{}}
675
673
  for future in as_completed(futures):
@@ -701,7 +699,7 @@ PICK_TOOL = """
701
699
  **Instructions**:
702
700
  1. Re-read the user request, plans, tool outputs and examine the image.
703
701
  2. Given the user request, try to solve the problem yourself.
704
- 3. Pick which tool output best matches your solution and the user request, DO NOT focus on other factors.
702
+ 3. Pick which tool output best matches your solution and the user request, DO NOT focus on other factors like confidence score.
705
703
  4. DO NOT modify confidence thresholds unless the tool output is completely wrong.
706
704
  5. Remember for videos that in order to count objects in a video some sort of tracking is needed, or else you will overcount the objects.
707
705
  6. Return the following JSON format inside <json> tags using the exact tool name for best_tool:
@@ -717,7 +715,7 @@ PICK_TOOL = """
717
715
  FINALIZE_PLAN = """
718
716
  **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
719
717
 
720
- **Documentation**: You can use these tools to help you visualize or save the output (they are imported `from vision_agent.tools import *`):
718
+ **Documentation**: You can use these tools to help complete the code (they are imported `from vision_agent.tools import *`):
721
719
  {tool_desc}
722
720
 
723
721
  **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
@@ -726,13 +724,14 @@ FINALIZE_PLAN = """
726
724
  **Instructions**:
727
725
  1. Summarize the plan that the planning agent found.
728
726
  2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
729
- 3. Only use tools obtained from calling `get_tool_for_task`.
730
- 4. Do not include {excluded_tools} tools in your instructions.
731
- 5. Ensure the function is well documented and easy to understand.
732
- 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentaiton masks are produced, and save it to a file with `save_image` or `save_video`.
733
- 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
734
- 8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
735
- 9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
727
+ 3. Ensure the function takes in a single argument for the image or video path, all other arguments must be keyword arguments with default values.
728
+ 4. Only use tools obtained from calling `get_tool_for_task` and the ones provided in the documentation.
729
+ 5. Do not include {excluded_tools} tools in your instructions.
730
+ 6. Ensure the function is well documented and easy to understand, DO NOT escape quotes in the function documentation.
731
+ 7. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks`, if bounding boxes or segmentation masks are produced, and save it to a file with `save_image` or `save_video`.
732
+ 8. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
733
+ 9. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
734
+ 10. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
736
735
 
737
736
  <json>
738
737
  {{
@@ -744,7 +743,7 @@ FINALIZE_PLAN = """
744
743
  </json>
745
744
 
746
745
  <code>
747
- # Code snippets here
746
+ # Your function code here
748
747
  </code>
749
748
  """
750
749
 
@@ -73,7 +73,7 @@ class DefaultPlanningImports:
73
73
  "import cv2",
74
74
  "from typing import *",
75
75
  "from vision_agent.tools import *",
76
- "from vision_agent.tools.planner_tools import claude35_vqa, suggestion, get_tool_for_task",
76
+ "from vision_agent.tools.planner_tools import vqa, suggestion, get_tool_for_task",
77
77
  "from pillow_heif import register_heif_opener",
78
78
  "register_heif_opener()",
79
79
  "import matplotlib.pyplot as plt",
@@ -228,6 +228,7 @@ def execute_code_action(
228
228
 
229
229
  count = 1
230
230
  while not execution.success and count <= 3:
231
+ start = time.time()
231
232
  prompt = FIX_BUG.format(chat_history=get_planning(chat), code=code, error=obs)
232
233
  response = cast(str, model.chat([{"role": "user", "content": prompt}]))
233
234
  new_code = extract_tag(response, "code", extract_markdown="python")
@@ -243,7 +244,7 @@ def execute_code_action(
243
244
  if verbose:
244
245
  print_code(f"Fixing Bug Round {count}:", code)
245
246
  _CONSOLE.print(
246
- f"[bold cyan]Code Execution Output:[/bold cyan] [yellow]{escape(obs)}[/yellow]"
247
+ f"[bold cyan]Code Execution Output ({end - start:.2f}s):[/bold cyan] [yellow]{escape(obs)}[/yellow]"
247
248
  )
248
249
  count += 1
249
250