vision-agent 0.2.232__tar.gz → 0.2.234__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {vision_agent-0.2.232 → vision_agent-0.2.234}/PKG-INFO +1 -1
  2. {vision_agent-0.2.232 → vision_agent-0.2.234}/pyproject.toml +1 -1
  3. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/agent_utils.py +1 -1
  4. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_coder_v2.py +1 -1
  5. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_planner_prompts_v2.py +127 -64
  6. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_planner_v2.py +2 -3
  7. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_v2.py +7 -7
  8. {vision_agent-0.2.232 → vision_agent-0.2.234}/LICENSE +0 -0
  9. {vision_agent-0.2.232 → vision_agent-0.2.234}/README.md +0 -0
  10. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/.sim_tools/df.csv +0 -0
  11. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/.sim_tools/embs.npy +0 -0
  12. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/README.md +0 -0
  14. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/__init__.py +0 -0
  15. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/agent.py +0 -0
  16. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/types.py +0 -0
  17. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent.py +0 -0
  18. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_coder.py +0 -0
  19. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  20. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  21. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_planner.py +0 -0
  22. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  23. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_prompts.py +0 -0
  24. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  25. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/clients/__init__.py +0 -0
  26. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/clients/http.py +0 -0
  27. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/clients/landing_public_api.py +0 -0
  28. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/configs/__init__.py +0 -0
  29. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/configs/anthropic_config.py +0 -0
  30. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/configs/anthropic_openai_config.py +0 -0
  31. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/configs/config.py +0 -0
  32. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/configs/openai_config.py +0 -0
  33. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/fonts/__init__.py +0 -0
  34. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  35. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/lmm/__init__.py +0 -0
  36. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/lmm/lmm.py +0 -0
  37. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/lmm/types.py +0 -0
  38. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/__init__.py +0 -0
  39. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/meta_tools.py +0 -0
  40. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/planner_tools.py +0 -0
  41. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/prompts.py +0 -0
  42. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/tool_utils.py +0 -0
  43. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/tools.py +0 -0
  44. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/tools/tools_types.py +0 -0
  45. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/__init__.py +0 -0
  46. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/exceptions.py +0 -0
  47. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/execute.py +0 -0
  48. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/image_utils.py +0 -0
  49. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/sim.py +0 -0
  50. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/type_defs.py +0 -0
  51. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/video.py +0 -0
  52. {vision_agent-0.2.232 → vision_agent-0.2.234}/vision_agent/utils/video_tracking.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.232
3
+ Version: 0.2.234
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.232"
7
+ version = "0.2.234"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -160,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str:
160
160
  prompt = ""
161
161
  for chat_i in chat:
162
162
  if chat_i.role == "user" or chat_i.role == "coder":
163
- if "<final_code>" in chat_i.role:
163
+ if "<final_code>" in chat_i.content:
164
164
  prompt += f"OBSERVATION: {chat_i.content}\n\n"
165
165
  elif chat_i.role == "user":
166
166
  prompt += f"USER: {chat_i.content}\n\n"
@@ -443,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder):
443
443
 
444
444
  # we don't need the user_interaction response for generating code since it's
445
445
  # already in the plan context
446
- while chat[-1].role != "user":
446
+ while len(chat) > 0 and chat[-1].role != "user":
447
447
  chat.pop()
448
448
 
449
449
  if not chat:
@@ -20,7 +20,7 @@ PLAN = """
20
20
  3. Only output <finalize_plan> when you are done planning and want to end the planning process. DO NOT output <finalize_plan> with <execute_python> tags, only after OBSERVATION's.
21
21
  4. Only load/save files from {media_list} unless you specifically saved the file previously.
22
22
  5. Ensure you always call `suggestion` initially and `get_tool_for_task` to get the right tool for the subtask.
23
- 6. Calling `plt.imshow` or `save_image` will display the image to you, use this to visually check your results.
23
+ 6. Calling `plt.imshow` or `save_image` will display the image to you so you can check your results. If you see an image after <execute_python> it's generated from your code.
24
24
  7. DO NOT hard code the answer into your code, it should be dynamic and work for any similar request.
25
25
  8. DO NOT over index on claude35_vqa, if tool output is close to claude35_vqa's output you do not need to improve the tool.
26
26
  9. You can only respond in the following format with a single <thinking>, <execute_python> or <finalize_plan> tag:
@@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c
50
50
  [suggestion 0]
51
51
  The image is very large and the items you need to detect are small.
52
52
 
53
- Step 1: You should start by splitting the image into sections and runing the detection algorithm on each section:
53
+ Step 1: You should start by splitting the image into overlapping sections and runing the detection algorithm on each section:
54
54
 
55
55
  def subdivide_image(image):
56
56
  height, width, _ = image.shape
@@ -66,41 +66,96 @@ def subdivide_image(image):
66
66
 
67
67
  get_tool_for_task('<your prompt here>', subdivide_image(image))
68
68
 
69
- Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
70
-
71
- def translate_ofset(bbox, offset_x, offset_y):
72
- return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
73
-
74
- def bounding_boxes_overlap(bbox1, bbox2):
75
- if bbox1[2] < bbox2[0] or bbox2[0] > bbox1[2]:
76
- return False
77
- if bbox1[3] < bbox2[1] or bbox2[3] > bbox1[3]:
78
- return False
79
- return True
80
-
81
- def merge_bounding_boxes(bbox1, bbox2):
82
- x_min = min(bbox1[0], bbox2[0])
83
- y_min = min(bbox1[1], bbox2[1])
84
- x_max = max(bbox1[2], bbox2[2])
85
- y_max = max(bbox1[3], bbox2[3])
86
- return (x_min, y_min, x_max, y_max)
87
-
88
- def merge_bounding_box_list(bboxes):
89
- merged_bboxes = []
90
- while bboxes:
91
- bbox = bboxes.pop()
92
- overlap_found = False
93
- for i, other_bbox in enumerate(merged_bboxes):
94
- if bounding_boxes_overlap(bbox, other_bbox):
95
- merged_bboxes[i] = merge_bounding_boxes(bbox, other_bbox)
96
- overlap_found = True
69
+ Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions, be sure to tranlate the offset back to the original image:
70
+
71
+ def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
72
+ # Calculate intersection coordinates
73
+ x1 = max(b1[0], b2[0])
74
+ y1 = max(b1[1], b2[1])
75
+ x2 = min(b1[2], b2[2])
76
+ y2 = min(b1[3], b2[3])
77
+
78
+ # Calculate intersection area
79
+ if x2 < x1 or y2 < y1:
80
+ return False # No overlap
81
+
82
+ intersection = (x2 - x1) * (y2 - y1)
83
+
84
+ # Calculate union area
85
+ area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
86
+ area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
87
+ union = area1 + area2 - intersection
88
+
89
+ # Calculate IoU
90
+ iou = intersection / union if union > 0 else 0
91
+
92
+ return iou >= iou_threshold
93
+
94
+ def merge_bounding_box_list(detections):
95
+ merged_detections = []
96
+ for detection in detections:
97
+ matching_box = None
98
+ for i, other in enumerate(merged_detections):
99
+ if bounding_box_match(detection["bbox"], other["bbox"]):
100
+ matching_box = i
97
101
  break
98
- if not overlap_found:
99
- p
100
- merged_bboxes.append(bbox)
101
- return merged_bboxes
102
102
 
103
- detection = merge_bounding_box_list(detection_from_subdivided_images)
103
+ if matching_box is not None:
104
+ # Keep the box with higher confidence score
105
+ if detection["score"] > merged_detections[matching_box]["score"]:
106
+ merged_detections[matching_box] = detection
107
+ else:
108
+ merged_detections.append(detection)
109
+
110
+ def sub_image_to_original(elt, sub_image_position, original_size):
111
+ offset_x, offset_y = sub_image_position
112
+ return {
113
+ "label": elt["label"],
114
+ "score": elt["score"],
115
+ "bbox": [
116
+ (elt["bbox"][0] + offset_x) / original_size[1],
117
+ (elt["bbox"][1] + offset_y) / original_size[0],
118
+ (elt["bbox"][2] + offset_x) / original_size[1],
119
+ (elt["bbox"][3] + offset_y) / original_size[0],
120
+ ],
121
+ }
122
+
123
+ def normalized_to_unnormalized(elt, image_size):
124
+ return {
125
+ "label": elt["label"],
126
+ "score": elt["score"],
127
+ "bbox": [
128
+ elt["bbox"][0] * image_size[1],
129
+ elt["bbox"][1] * image_size[0],
130
+ elt["bbox"][2] * image_size[1],
131
+ elt["bbox"][3] * image_size[0],
132
+ ],
133
+ }
134
+
135
+ height, width, _ = image.shape
136
+ mid_width = width // 2
137
+ mid_height = height // 2
138
+
139
+ detection_from_subdivided_images = []
140
+ for i, sub_image in enumerate(subdivided_images):
141
+ detections = <your detection function here>("pedestrian", sub_image)
142
+ unnorm_detections = [
143
+ normalized_to_unnormalized(
144
+ detection, (sub_image.shape[0], sub_image.shape[1])
145
+ )
146
+ for detection in detections
147
+ ]
148
+ offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
149
+ offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
150
+ offset_detections = [
151
+ sub_image_to_original(
152
+ unnorm_detection, (offset_x, offset_y), (height, width)
153
+ )
154
+ for unnorm_detection in unnorm_detections
155
+ ]
156
+ detection_from_subdivided_images.extend(offset_detections)
157
+
158
+ detections = merge_bounding_box_list(detection_from_subdivided_images)
104
159
  [end of suggestion 0]
105
160
  [end of suggestion]
106
161
  <count>9</count>
@@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
164
219
 
165
220
  AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
166
221
  <execute_python>
167
- def translate_ofset(bbox, offset_x, offset_y):
168
- return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
169
-
170
- def bounding_boxes_overlap(bbox1, bbox2):
171
- if bbox1[2] < bbox2[0] or bbox2[0] > bbox1[2]:
172
- return False
173
- if bbox1[3] < bbox2[1] or bbox2[3] > bbox1[3]:
174
- return False
175
- return True
176
-
177
- def merge_bounding_boxes(bbox1, bbox2):
178
- x_min = min(bbox1[0], bbox2[0])
179
- y_min = min(bbox1[1], bbox2[1])
180
- x_max = max(bbox1[2], bbox2[2])
181
- y_max = max(bbox1[3], bbox2[3])
182
- return (x_min, y_min, x_max, y_max)
183
-
184
- def merge_bounding_box_list(bboxes):
185
- merged_bboxes = []
186
- while bboxes:
187
- bbox = bboxes.pop()
188
- overlap_found = False
189
- for i, other_bbox in enumerate(merged_bboxes):
190
- if bounding_boxes_overlap(bbox, other_bbox):
191
- merged_bboxes[i] = merge_bounding_boxes(bbox, other_bbox)
192
- overlap_found = True
222
+ def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
223
+ # Calculate intersection coordinates
224
+ x1 = max(b1[0], b2[0])
225
+ y1 = max(b1[1], b2[1])
226
+ x2 = min(b1[2], b2[2])
227
+ y2 = min(b1[3], b2[3])
228
+
229
+ # Calculate intersection area
230
+ if x2 < x1 or y2 < y1:
231
+ return False # No overlap
232
+
233
+ intersection = (x2 - x1) * (y2 - y1)
234
+
235
+ # Calculate union area
236
+ area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
237
+ area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
238
+ union = area1 + area2 - intersection
239
+
240
+ # Calculate IoU
241
+ iou = intersection / union if union > 0 else 0
242
+
243
+ return iou >= iou_threshold
244
+
245
+ def merge_bounding_box_list(detections):
246
+ merged_detections = []
247
+ for detection in detections:
248
+ matching_box = None
249
+ for i, other in enumerate(merged_detections):
250
+ if bounding_box_match(detection["bbox"], other["bbox"]):
251
+ matching_box = i
193
252
  break
194
- if not overlap_found:
195
- merged_bboxes.append(bbox)
196
- return merged_bboxes
253
+
254
+ if matching_box is not None:
255
+ # Keep the box with higher confidence score
256
+ if detection["score"] > merged_detections[matching_box]["score"]:
257
+ merged_detections[matching_box] = detection
258
+ else:
259
+ merged_detections.append(detection)
197
260
 
198
261
  detections = []
199
262
  for region in subdivide_image(image):
@@ -97,8 +97,7 @@ def run_planning(
97
97
  media_list: List[Union[str, Path]],
98
98
  model: LMM,
99
99
  ) -> str:
100
- # only keep last 10 messages for planning
101
- planning = get_planning(chat[-10:])
100
+ planning = get_planning(chat)
102
101
  prompt = PLAN.format(
103
102
  tool_desc=PLANNING_TOOLS_DOCSTRING,
104
103
  examples=f"{EXAMPLE_PLAN1}\n{EXAMPLE_PLAN2}",
@@ -372,7 +371,7 @@ def replace_interaction_with_obs(chat: List[AgentMessage]) -> List[AgentMessage]
372
371
  function_name = response["function_name"]
373
372
  tool_doc = get_tool_documentation(function_name)
374
373
  if "box_threshold" in response:
375
- tool_doc = f"Use the following function with box_threshold={response['box_threshold']}\n\n{tool_doc}"
374
+ tool_doc = f"Use the following function with box_threshold={response['box_threshold']}. This tool and its parameters were chosen by the user so do not change them in your planning.\n\n{tool_doc}."
376
375
  new_chat.append(AgentMessage(role="observation", content=tool_doc))
377
376
  except (json.JSONDecodeError, KeyError):
378
377
  raise ValueError(f"Invalid JSON in interaction response: {chat_i}")
@@ -27,7 +27,7 @@ CONFIG = Config()
27
27
 
28
28
 
29
29
  def extract_conversation(
30
- chat: List[AgentMessage],
30
+ chat: List[AgentMessage], include_conv: bool = False
31
31
  ) -> Tuple[List[AgentMessage], Optional[str]]:
32
32
  chat = copy.deepcopy(chat)
33
33
 
@@ -43,6 +43,8 @@ def extract_conversation(
43
43
  elif chat_i.role == "coder":
44
44
  if "<final_code>" in chat_i.content:
45
45
  extracted_chat.append(chat_i)
46
+ elif include_conv and chat_i.role == "conversation":
47
+ extracted_chat.append(chat_i)
46
48
 
47
49
  # only keep the last <final_code> and <final_test>
48
50
  final_code = None
@@ -64,10 +66,9 @@ def extract_conversation(
64
66
 
65
67
 
66
68
  def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
67
- extracted_chat, _ = extract_conversation(chat)
68
- extracted_chat = extracted_chat[-10:]
69
+ extracted_chat, _ = extract_conversation(chat, include_conv=True)
69
70
 
70
- conv = format_conversation(chat)
71
+ conv = format_conversation(extracted_chat)
71
72
  prompt = CONVERSATION.format(
72
73
  conversation=conv,
73
74
  )
@@ -90,8 +91,6 @@ def maybe_run_action(
90
91
  code_interpreter: Optional[CodeInterpreter] = None,
91
92
  ) -> Optional[List[AgentMessage]]:
92
93
  extracted_chat, final_code = extract_conversation(chat)
93
- # only keep last 5 messages to keep context recent and not overwhelm LLM
94
- extracted_chat = extracted_chat[-5:]
95
94
  if action == "generate_or_edit_vision_code":
96
95
  # there's an issue here because coder.generate_code will send it's code_context
97
96
  # to the outside user via it's update_callback, but we don't necessarily have
@@ -124,6 +123,7 @@ def maybe_run_action(
124
123
  ],
125
124
  code="",
126
125
  )
126
+
127
127
  context = coder.generate_code_from_plan(
128
128
  extracted_chat, plan_context, code_interpreter=code_interpreter
129
129
  )
@@ -263,7 +263,7 @@ class VisionAgentV2(Agent):
263
263
  # do not append updated_chat to return_chat becuase the observation
264
264
  # from running the action will have already been added via the callbacks
265
265
  obs_response_context = run_conversation(
266
- self.agent, return_chat + updated_chat
266
+ self.agent, int_chat + return_chat + updated_chat
267
267
  )
268
268
  return_chat.append(
269
269
  AgentMessage(role="conversation", content=obs_response_context)
File without changes
File without changes