vision-agent 0.2.229__tar.gz → 0.2.230__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. vision_agent-0.2.230/PKG-INFO +156 -0
  2. vision_agent-0.2.230/README.md +110 -0
  3. {vision_agent-0.2.229 → vision_agent-0.2.230}/pyproject.toml +1 -1
  4. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/.sim_tools/df.csv +10 -8
  5. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/agent_utils.py +10 -9
  6. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent.py +3 -4
  7. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_prompts.py +6 -6
  8. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_v2.py +41 -26
  9. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_prompts.py +6 -6
  10. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -50
  11. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner_v2.py +10 -12
  12. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_prompts.py +11 -11
  13. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_prompts_v2.py +18 -3
  14. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_v2.py +29 -30
  15. vision_agent-0.2.230/vision_agent/configs/__init__.py +1 -0
  16. vision_agent-0.2.230/vision_agent/configs/anthropic_config.py +150 -0
  17. vision_agent-0.2.230/vision_agent/configs/anthropic_openai_config.py +150 -0
  18. vision_agent-0.2.230/vision_agent/configs/config.py +150 -0
  19. vision_agent-0.2.230/vision_agent/configs/openai_config.py +160 -0
  20. vision_agent-0.2.230/vision_agent/lmm/__init__.py +2 -0
  21. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/lmm/lmm.py +63 -9
  22. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/planner_tools.py +60 -40
  23. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tools.py +10 -8
  24. vision_agent-0.2.229/PKG-INFO +0 -562
  25. vision_agent-0.2.229/README.md +0 -516
  26. vision_agent-0.2.229/vision_agent/lmm/__init__.py +0 -2
  27. {vision_agent-0.2.229 → vision_agent-0.2.230}/LICENSE +0 -0
  28. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/.sim_tools/embs.npy +0 -0
  29. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/__init__.py +0 -0
  30. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/README.md +0 -0
  31. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/__init__.py +0 -0
  32. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/agent.py +0 -0
  33. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/types.py +0 -0
  34. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder.py +0 -0
  35. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  36. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/agent/vision_agent_planner.py +0 -0
  37. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/__init__.py +0 -0
  38. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/http.py +0 -0
  39. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/clients/landing_public_api.py +0 -0
  40. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/fonts/__init__.py +0 -0
  41. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  42. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/lmm/types.py +0 -0
  43. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/__init__.py +0 -0
  44. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/meta_tools.py +0 -0
  45. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/prompts.py +0 -0
  46. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tool_utils.py +0 -0
  47. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/tools/tools_types.py +0 -0
  48. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/__init__.py +0 -0
  49. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/exceptions.py +0 -0
  50. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/execute.py +0 -0
  51. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/image_utils.py +0 -0
  52. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/sim.py +0 -0
  53. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/type_defs.py +0 -0
  54. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/video.py +0 -0
  55. {vision_agent-0.2.229 → vision_agent-0.2.230}/vision_agent/utils/video_tracking.py +0 -0
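The most visible addition in 0.2.230 is the new `vision_agent/configs` package (items 15-19 above), which the agents now use to build their default models instead of hard-coding `AnthropicLMM` (see the `vision_agent_coder_v2.py` hunks below). The following is only a rough usage sketch inferred from the `Config` calls visible in that diff, not documented API:

```python
# Sketch based solely on the Config usage shown in the vision_agent_coder_v2.py
# hunk below (CONFIG = Config(); CONFIG.create_coder(), etc.); treat the factory
# names as assumptions, not documented API.
from vision_agent.configs import Config

config = Config()

# VisionAgentCoderV2 falls back to these factories when no explicit
# coder / tester / debugger LMM is passed to its constructor.
coder = config.create_coder()
tester = config.create_tester()
debugger = config.create_debugger()
```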
@@ -0,0 +1,156 @@
+ Metadata-Version: 2.1
+ Name: vision-agent
+ Version: 0.2.230
+ Summary: Toolset for Vision Agent
+ Author: Landing AI
+ Author-email: dev@landing.ai
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+ Requires-Dist: av (>=11.0.0,<12.0.0)
+ Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
+ Requires-Dist: e2b-code-interpreter (==0.0.11a37)
+ Requires-Dist: flake8 (>=7.0.0,<8.0.0)
+ Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+ Requires-Dist: langsmith (>=0.1.58,<0.2.0)
+ Requires-Dist: libcst (>=1.5.0,<2.0.0)
+ Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
+ Requires-Dist: numpy (>=1.21.0,<2.0.0)
+ Requires-Dist: openai (>=1.0.0,<2.0.0)
+ Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
+ Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
+ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
+ Requires-Dist: pydantic (==2.7.4)
+ Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+ Requires-Dist: pytube (==15.0.0)
+ Requires-Dist: requests (>=2.0.0,<3.0.0)
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: scikit-learn (>=1.5.2,<2.0.0)
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+ Requires-Dist: tenacity (>=8.3.0,<9.0.0)
+ Requires-Dist: tqdm (>=4.64.0,<5.0.0)
+ Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
+ Project-URL: Homepage, https://landing.ai
+ Project-URL: documentation, https://github.com/landing-ai/vision-agent
+ Project-URL: repository, https://github.com/landing-ai/vision-agent
+ Description-Content-Type: text/markdown
+
+ <div align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
+ <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ </picture>
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ ## VisionAgent
+ VisionAgent is a library that helps you utilize agent frameworks to generate code to
+ solve your vision task. Check out our discord for updates and roadmaps! The fastest
+ way to test out VisionAgent is to use our web application which you can find [here](https://va.landing.ai/).
+
+ ## Installation
+ ```bash
+ pip install vision-agent
+ ```
+
+ ```bash
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
+ ```
+
+ ---
+ **NOTE**
+ We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
+ for VisionAgent. If you want to use a different LLM provider or only one, see
+ 'Using Other LLM Providers' below.
+ ---
+
+ ## Documentation
+
+ [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+ ## Examples
+ ### Counting cans in an image
+ You can run VisionAgent in a local Jupyter Notebook [Counting cans in an image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
+
+ ### Generating code
+ You can use VisionAgent to generate code to count the number of people in an image:
+ ```python
+ from vision_agent.agent import VisionAgentCoderV2
+ from vision_agent.agent.types import AgentMessage
+
+ agent = VisionAgentCoderV2(verbose=True)
+ code_context = agent.generate_code(
+     [
+         AgentMessage(
+             role="user",
+             content="Count the number of people in this image",
+             media=["people.png"]
+         )
+     ]
+ )
+
+ with open("generated_code.py", "w") as f:
+     f.write(code_context.code + "\n" + code_context.test)
+ ```
+
+ ### Using the tools directly
+ VisionAgent produces code that utilizes our tools. You can also use the tools directly.
+ For example if you wanted to detect people in an image and visualize the results:
+ ```python
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ image = T.load_image("people.png")
+ dets = T.countgd_object_detection("person", image)
+ # visualize the countgd bounding boxes on the image
+ viz = T.overlay_bounding_boxes(image, dets)
+
+ # save the visualization to a file
+ T.save_image(viz, "people_detected.png")
+
+ # display the visualization
+ plt.imshow(viz)
+ plt.show()
+ ```
+
+ You can also use the tools for running on video files:
+ ```python
+ import vision_agent.tools as T
+
+ frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
+ # extract the frames from the frames_and_ts list
+ frames = [f["frame"] for f in frames_and_ts]
+
+ # run the countgd tracking on the frames
+ tracks = T.countgd_sam2_video_tracking("person", frames)
+ # visualize the countgd tracking results on the frames and save the video
+ viz = T.overlay_segmentation_masks(frames, tracks)
+ T.save_video(viz, "people_detected.mp4")
+ ```
+
+ ## Using Other LLM Providers
+ You can use other LLM providers by changing `config.py` in the `vision_agent/configs`
+ directory. For example to change to Anthropic simply just run:
+ ```bash
+ cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
+ ```
+
+ **NOTE**
+ VisionAgent moves fast and we are constantly updating and changing the library. If you
+ have any questions or need help, please reach out to us on our discord channel.
+ ---
+
@@ -0,0 +1,110 @@
+ <div align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ <source media="(prefers-color-scheme: light)" srcset="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_dark.svg?raw=true">
+ <img alt="VisionAgent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo_light.svg?raw=true">
+ </picture>
+
+ [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+ ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+ [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+ ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+ </div>
+
+ ## VisionAgent
+ VisionAgent is a library that helps you utilize agent frameworks to generate code to
+ solve your vision task. Check out our discord for updates and roadmaps! The fastest
+ way to test out VisionAgent is to use our web application which you can find [here](https://va.landing.ai/).
+
+ ## Installation
+ ```bash
+ pip install vision-agent
+ ```
+
+ ```bash
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
+ ```
+
+ ---
+ **NOTE**
+ We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
+ for VisionAgent. If you want to use a different LLM provider or only one, see
+ 'Using Other LLM Providers' below.
+ ---
+
+ ## Documentation
+
+ [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
+
+ ## Examples
+ ### Counting cans in an image
+ You can run VisionAgent in a local Jupyter Notebook [Counting cans in an image](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks/counting_cans.ipynb)
+
+ ### Generating code
+ You can use VisionAgent to generate code to count the number of people in an image:
+ ```python
+ from vision_agent.agent import VisionAgentCoderV2
+ from vision_agent.agent.types import AgentMessage
+
+ agent = VisionAgentCoderV2(verbose=True)
+ code_context = agent.generate_code(
+     [
+         AgentMessage(
+             role="user",
+             content="Count the number of people in this image",
+             media=["people.png"]
+         )
+     ]
+ )
+
+ with open("generated_code.py", "w") as f:
+     f.write(code_context.code + "\n" + code_context.test)
+ ```
+
+ ### Using the tools directly
+ VisionAgent produces code that utilizes our tools. You can also use the tools directly.
+ For example if you wanted to detect people in an image and visualize the results:
+ ```python
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ image = T.load_image("people.png")
+ dets = T.countgd_object_detection("person", image)
+ # visualize the countgd bounding boxes on the image
+ viz = T.overlay_bounding_boxes(image, dets)
+
+ # save the visualization to a file
+ T.save_image(viz, "people_detected.png")
+
+ # display the visualization
+ plt.imshow(viz)
+ plt.show()
+ ```
+
+ You can also use the tools for running on video files:
+ ```python
+ import vision_agent.tools as T
+
+ frames_and_ts = T.extract_frames_and_timestamps("people.mp4")
+ # extract the frames from the frames_and_ts list
+ frames = [f["frame"] for f in frames_and_ts]
+
+ # run the countgd tracking on the frames
+ tracks = T.countgd_sam2_video_tracking("person", frames)
+ # visualize the countgd tracking results on the frames and save the video
+ viz = T.overlay_segmentation_masks(frames, tracks)
+ T.save_video(viz, "people_detected.mp4")
+ ```
+
+ ## Using Other LLM Providers
+ You can use other LLM providers by changing `config.py` in the `vision_agent/configs`
+ directory. For example to change to Anthropic simply just run:
+ ```bash
+ cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
+ ```
+
+ **NOTE**
+ VisionAgent moves fast and we are constantly updating and changing the library. If you
+ have any questions or need help, please reach out to us on our discord channel.
+ ---
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "vision-agent"
- version = "0.2.229"
+ version = "0.2.230"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
@@ -244,7 +244,8 @@ desc,doc,name
  1.0.
 
  Parameters:
- prompt (str): The prompt to ground to the image.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+     do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to ground the prompt to.
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
      fine-tuned model ID here to use it.
@@ -281,7 +282,8 @@ desc,doc,name
  is useful for tracking and counting without duplicating counts.
 
  Parameters:
- prompt (str): The prompt to ground to the video.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+     do not overlap such as 'person, car' and NOT 'person, athlete'.
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
  chunk_length (Optional[int]): The number of frames to re-run florence2 to find
      new objects.
@@ -317,14 +319,14 @@ desc,doc,name
  ]
  ",florence2_sam2_video_tracking
  "'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
- 'florence2_object_detection' is a tool that can detect multiple
- objects given a text prompt which can be object names or caption. You
- can optionally separate the object names in the text with commas. It returns a list
- of bounding boxes with normalized coordinates, label names and associated
- confidence scores of 1.0.
+ 'florence2_object_detection' is a tool that can detect multiple objects given a
+ text prompt which can be object names or caption. You can optionally separate the
+ object names in the text with commas. It returns a list of bounding boxes with
+ normalized coordinates, label names and associated confidence scores of 1.0.
 
  Parameters:
- prompt (str): The prompt to ground to the image.
+ prompt (str): The prompt to ground to the image. Use exclusive categories that
+     do not overlap such as 'person, car' and NOT 'person, athlete'.
  image (np.ndarray): The image to used to detect objects
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
      fine-tuned model ID here to use it.
@@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str:
      chat = copy.deepcopy(chat)
      prompt = ""
      for chat_i in chat:
-         if chat_i.role == "user":
-             prompt += f"USER: {chat_i.content}\n\n"
-         elif chat_i.role == "observation" or chat_i.role == "coder":
-             prompt += f"OBSERVATION: {chat_i.content}\n\n"
+         if chat_i.role == "user" or chat_i.role == "coder":
+             if "<final_code>" in chat_i.role:
+                 prompt += f"OBSERVATION: {chat_i.content}\n\n"
+             elif chat_i.role == "user":
+                 prompt += f"USER: {chat_i.content}\n\n"
          elif chat_i.role == "conversation":
              prompt += f"AGENT: {chat_i.content}\n\n"
      return prompt
@@ -332,26 +333,26 @@ def strip_function_calls( # noqa: C901
      def __init__(self, exclusions: List[str]):
          # Store exclusions to skip removing certain function calls
          self.exclusions = exclusions
-         self.in_function_or_class = False
+         self.in_function_or_class: List[bool] = []
 
      def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
-         self.in_function_or_class = True
+         self.in_function_or_class.append(True)
          return True
 
      def leave_FunctionDef(
          self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
      ) -> cst.BaseStatement:
-         self.in_function_or_class = False
+         self.in_function_or_class.pop()
          return updated_node
 
      def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
-         self.in_function_or_class = True
+         self.in_function_or_class.append(True)
          return True
 
      def leave_ClassDef(
          self, node: cst.ClassDef, updated_node: cst.ClassDef
      ) -> cst.BaseStatement:
-         self.in_function_or_class = False
+         self.in_function_or_class.pop()
          return updated_node
 
      def leave_Expr(
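The `strip_function_calls` hunk above replaces a single boolean with a list used as a stack: with a plain flag, leaving a nested `def` or `class` would mark the visitor as being outside any definition even though the enclosing one is still open. A minimal standalone sketch of the same pattern with libcst (illustrative only, not code from the package):

```python
import libcst as cst
from typing import List


class ScopeTracker(cst.CSTVisitor):
    """Toy visitor using a stack so nesting depth survives inner defs."""

    def __init__(self) -> None:
        self.in_function: List[bool] = []

    def visit_FunctionDef(self, node: cst.FunctionDef) -> bool:
        self.in_function.append(True)
        return True

    def leave_FunctionDef(self, original_node: cst.FunctionDef) -> None:
        # Only empty once the outermost def has been left.
        self.in_function.pop()


module = cst.parse_module(
    "def outer():\n    def inner():\n        pass\n    x = 1\n"
)
module.visit(ScopeTracker())
```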
@@ -291,10 +291,9 @@ class VisionAgent(Agent):
              verbosity (int): The verbosity level of the agent.
              callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                  function to send intermediate update messages.
-             code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
-                 it can be one of: None, "local" or "e2b". If None, it will read from
-                 the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
-                 object is provided it will use that.
+             code_sandbox_runtime (Optional[str]): For string values it can be one of:
+                 None, "local" or "e2b". If None, it will read from the environment
+                 variable "CODE_SANDBOX_RUNTIME".
          """
 
          self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
@@ -44,22 +44,22 @@ Can you write a program to check if each person is wearing a helmet? First detec
 
  ## Subtasks
 
- This plan uses the owl_v2_image tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
- -Use owl_v2_image with prompt 'person, helmet' to detect both people and helmets in the image
+ This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
+ -Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
  -Process the detections to match helmets with people based on bounding box proximity
  -Count people with and without helmets based on the matching results
  -Return a dictionary with the counts
 
 
  **Tool Tests and Outputs**:
- After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owl_v2_image seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
+ After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
 
  **Tool Output Thoughts**:
  ```python
  ...
  ```
  ----- stdout -----
- Plan 1 - owl_v2_image:
+ Plan 1 - owlv2_object_detection:
 
  [{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
 
@@ -67,12 +67,12 @@ Plan 1 - owl_v2_image:
 
  **Input Code Snippet**:
  ```python
- from vision_agent.tools import load_image, owl_v2_image
+ from vision_agent.tools import load_image, owlv2_object_detection
 
  def check_helmets(image_path):
      image = load_image(image_path)
      # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
-     detections = owl_v2_image("person, helmet", image, box_threshold=0.15)
+     detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
      height, width = image.shape[:2]
 
      # Separate people and helmets
@@ -26,7 +26,8 @@ from vision_agent.agent.types import (
  )
  from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
  from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
- from vision_agent.lmm import LMM, AnthropicLMM
+ from vision_agent.configs import Config
+ from vision_agent.lmm import LMM
  from vision_agent.lmm.types import Message
  from vision_agent.tools.meta_tools import get_diff
  from vision_agent.utils.execute import (
@@ -36,6 +37,7 @@ from vision_agent.utils.execute import (
  )
  from vision_agent.utils.sim import Sim, get_tool_recommender
 
+ CONFIG = Config()
  _CONSOLE = Console()
 
 
@@ -185,23 +187,17 @@ def debug_code(
      return code, test, debug_info
 
 
- def write_and_test_code(
-     coder: LMM,
+ def test_code(
      tester: LMM,
      debugger: LMM,
      chat: List[AgentMessage],
      plan: str,
+     code: str,
      tool_docs: str,
      code_interpreter: CodeInterpreter,
      media_list: List[Union[str, Path]],
      verbose: bool,
  ) -> CodeContext:
-     code = write_code(
-         coder=coder,
-         chat=chat,
-         tool_docs=tool_docs,
-         plan=plan,
-     )
      try:
          code = strip_function_calls(code)
      except Exception:
@@ -257,6 +253,36 @@ def write_and_test_code(
      )
 
 
+ def write_and_test_code(
+     coder: LMM,
+     tester: LMM,
+     debugger: LMM,
+     chat: List[AgentMessage],
+     plan: str,
+     tool_docs: str,
+     code_interpreter: CodeInterpreter,
+     media_list: List[Union[str, Path]],
+     verbose: bool,
+ ) -> CodeContext:
+     code = write_code(
+         coder=coder,
+         chat=chat,
+         tool_docs=tool_docs,
+         plan=plan,
+     )
+     return test_code(
+         tester,
+         debugger,
+         chat,
+         plan,
+         code,
+         tool_docs,
+         code_interpreter,
+         media_list,
+         verbose,
+     )
+
+
  class VisionAgentCoderV2(AgentCoder):
      """VisionAgentCoderV2 is an agent that will write vision code for you."""
 
@@ -300,21 +326,9 @@ class VisionAgentCoderV2(AgentCoder):
              )
          )
 
-         self.coder = (
-             coder
-             if coder is not None
-             else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-         )
-         self.tester = (
-             tester
-             if tester is not None
-             else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-         )
-         self.debugger = (
-             debugger
-             if debugger is not None
-             else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
-         )
+         self.coder = coder if coder is not None else CONFIG.create_coder()
+         self.tester = tester if tester is not None else CONFIG.create_tester()
+         self.debugger = debugger if debugger is not None else CONFIG.create_debugger()
          if tool_recommender is not None:
              if isinstance(tool_recommender, str):
                  self.tool_recommender = Sim.load(tool_recommender)
@@ -440,12 +454,13 @@ class VisionAgentCoderV2(AgentCoder):
          ) as code_interpreter:
              int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
              tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
-             code_context = write_and_test_code(
-                 coder=self.coder,
+
+             code_context = test_code(
                  tester=self.tester,
                  debugger=self.debugger,
                  chat=int_chat,
                  plan=format_plan_v2(plan_context),
+                 code=plan_context.code,
                  tool_docs=tool_docs,
                  code_interpreter=code_interpreter,
                  media_list=media_list,
@@ -55,27 +55,27 @@ This is the documentation for the functions you have access to. You may call any
  --- EXAMPLE1 ---
  plan1:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+ - Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
  plan2:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+ - Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
  - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 
  ```python
- from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
+ from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
  image = load_image("image.jpg")
- owl_v2_out = owl_v2_image("person", image)
+ owl_v2_out = owlv2_object_detection("person", image)
 
- f2s2_out = florence2_sam2_image("person", image)
+ f2s2_out = florence2_sam2_instance_segmentation("person", image)
  # strip out the masks from the output becuase they don't provide useful information when printed
  f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
  cgd_out = countgd_object_detection("person", image)
 
- final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
+ final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
  print(final_out)
  --- END EXAMPLE1 ---