vision-agent 1.1.18__tar.gz → 1.1.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {vision_agent-1.1.18 → vision_agent-1.1.19}/.gitignore +1 -3
  2. {vision_agent-1.1.18 → vision_agent-1.1.19}/PKG-INFO +4 -2
  3. {vision_agent-1.1.18 → vision_agent-1.1.19}/README.md +3 -1
  4. {vision_agent-1.1.18 → vision_agent-1.1.19}/pyproject.toml +1 -1
  5. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_v3.py +1 -1
  6. vision_agent-1.1.19/vision_agent/agent/visual_design_patterns.py +638 -0
  7. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/configs/config.py +3 -4
  8. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/planner_tools.py +1 -1
  9. vision_agent-1.1.19/vision_agent/tools/suggestion.py +28 -0
  10. {vision_agent-1.1.18 → vision_agent-1.1.19}/LICENSE +0 -0
  11. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/.sim_tools/df.csv +0 -0
  12. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/.sim_tools/embs.npy +0 -0
  13. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/__init__.py +0 -0
  14. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/README.md +0 -0
  15. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/__init__.py +0 -0
  16. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/agent.py +0 -0
  17. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  18. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  19. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  20. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  21. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  22. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_prompts_v3.py +0 -0
  23. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/agent/vision_agent_v2.py +0 -0
  24. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/clients/__init__.py +0 -0
  25. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/clients/http.py +0 -0
  26. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/configs/__init__.py +0 -0
  27. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/configs/anthropic_config.py +0 -0
  28. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/configs/openai_config.py +0 -0
  29. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/fonts/__init__.py +0 -0
  30. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  31. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/lmm/__init__.py +0 -0
  32. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/lmm/lmm.py +0 -0
  33. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/models/__init__.py +0 -0
  34. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/models/agent_types.py +0 -0
  35. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/models/lmm_types.py +0 -0
  36. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/models/tools_types.py +0 -0
  37. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/sim/__init__.py +0 -0
  38. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/sim/sim.py +0 -0
  39. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/__init__.py +0 -0
  40. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/meta_tools.py +0 -0
  41. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/planner_v3_tools.py +0 -0
  42. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/prompts.py +0 -0
  43. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/tools/tools.py +0 -0
  44. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/__init__.py +0 -0
  45. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/agent.py +0 -0
  46. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/exceptions.py +0 -0
  47. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/execute.py +0 -0
  48. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/image_utils.py +0 -0
  49. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/tools.py +0 -0
  50. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/tools_doc.py +0 -0
  51. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/video.py +0 -0
  52. {vision_agent-1.1.18 → vision_agent-1.1.19}/vision_agent/utils/video_tracking.py +0 -0
@@ -95,6 +95,4 @@ site
  local/
 
  vision-agent-benchmark/
- vision_agent/tools/suggestion.py
- vision_agent/agent/visual_design_patterns.py
- */node_modules
+ */node_modules
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vision-agent
- Version: 1.1.18
+ Version: 1.1.19
  Summary: Toolset for Vision Agent
  Project-URL: Homepage, https://landing.ai
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -63,7 +63,9 @@ _Prompt with an image/video → Get runnable vision code → Build Visual AI App
 
  <br />
 
- **VisionAgent** is the Visual AI pilot from LandingAI. Give it a prompt and an image, and it automatically picks the right vision models and outputs ready‑to‑run code—letting you build vision‑enabled apps in minutes.
+ **VisionAgent** is the Visual AI pilot from LandingAI. Give it a prompt and an image, and it automatically picks the right vision models and outputs ready‑to‑run code—letting you build vision‑enabled apps in minutes. You can play around with VisionAgent using our local webapp in `examples/chat` and following the directions in the `README.md`:
+
+ <https://github.com/user-attachments/assets/752632b3-dda5-44f1-b27e-5cb4c97757ac>
 
 
  ## Steps to Set Up the Library
@@ -22,7 +22,9 @@ _Prompt with an image/video → Get runnable vision code → Build Visual AI App
 
  <br />
 
- **VisionAgent** is the Visual AI pilot from LandingAI. Give it a prompt and an image, and it automatically picks the right vision models and outputs ready‑to‑run code—letting you build vision‑enabled apps in minutes.
+ **VisionAgent** is the Visual AI pilot from LandingAI. Give it a prompt and an image, and it automatically picks the right vision models and outputs ready‑to‑run code—letting you build vision‑enabled apps in minutes. You can play around with VisionAgent using our local webapp in `examples/chat` and following the directions in the `README.md`:
+
+ <https://github.com/user-attachments/assets/752632b3-dda5-44f1-b27e-5cb4c97757ac>
 
 
  ## Steps to Set Up the Library
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
  [project]
  name = "vision-agent"
- version = "1.1.18"
+ version = "1.1.19"
  description = "Toolset for Vision Agent"
  authors = [{ name = "Landing AI", email = "dev@landing.ai" }]
  requires-python = ">=3.9,<4.0"
@@ -173,7 +173,7 @@ class VisionAgentV3(Agent):
      ) -> None:
          if agent is None:
              self.agent = AnthropicLMM(
-                 model_name="claude-3-7-sonnet-20250219", max_tokens=8192
+                 model_name="claude-sonnet-4-20250514", max_tokens=8192
              )
              self.kwargs = {
                  "thinking": {"type": "enabled", "budget_tokens": 4096},
@@ -0,0 +1,638 @@
+ INPAINTING = (
+     """You are trying to generate a synthetic image by inpainting a certain area, such as removing an object or the background of an image, and replacing it with a synthetic image. You can use the following code to help with this task:
+
+ # First find a segmentation model to segment the object you want to inpaint
+ masks = <a segmentation model>("object", image)
+
+ # If you have multiple masks you can combine them into a single mask
+ result = masks[0]["mask"]
+ for mask in masks[1:]:
+     result = np.logical_or(result, mask["mask"])
+
+ # Once you have the mask you must ensure you have 1's on the area you want to inpaint
+ result = 1 - result
+
+ inpaint_image = <an inpainting model>("the prompt for the inpainting", image, result)
+ """,
+     {"inpainting"},
+ )
+
+
+ DESIGN_PATTERNS = """You are trying to help a vision coding agent write code to accomplish a vision task. Your job is to help the agent by giving it a suggestion on how best to approach the problem. Below is a set of tools the vision agent has access to:
+
+ <tools>
+ - "VQA" - answering questions about an image or video, can be used for most tasks including counting, document QA, etc.
+ - "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
+ - "instance segmentation" - segmenting objects in an image given a text prompt.
+ - "classification" - classifying objects in an image given a text prompt.
+ - "segmentation" - segmenting objects in an image or video given a text prompt.
+ - "OCR" - extracting text from an image.
+ - "DocQA" - answering questions about a document or extracting information from a document.
+ - "video object tracking" - tracking objects in a video.
+ - "depth and pose estimation" - estimating the depth or pose of objects in an image.
+ - "activity recognition" - identifying the time period an event occurs in a video.
+ - "inpainting" - filling in masked parts of an image.
+ </tools>
+
+ <image_size>
+ {image_size}
+ </image_size>
+
+ And here are different plans that you can suggest to the vision coding agent depending on what category of problem it is working on. You can either suggest an existing plan from below if it fits the category, modify an existing plan from below to better fit the user request, create a completely new plan, or make no suggestion. If you modify a plan or suggest a new plan, be sure to include lots of code for the vision coding agent to follow, and use the simplest approach to solving the problem that is most likely to generalize to new images or videos. Use the image provided and the vision coding agent's request <request>{request}</request> to decide your answer. Be sure to return the entire <plan></plan> block with your answer.
+
+ <category>large_image: The user is working with a very large image (not a video) and the objects they are trying to identify are very small.</category>
+ <plan>
+ The image is very large and the items you need to detect are small.
+
+ Step 1: You should start by splitting the image into overlapping sections and running the detection algorithm on each section:
+
+ def subdivide_image(image):
+     height, width, _ = image.shape
+     mid_height = height // 2
+     mid_width = width // 2
+     overlap_height = int(mid_height * 0.1)
+     overlap_width = int(mid_width * 0.1)
+     top_left = image[:mid_height + overlap_height, :mid_width + overlap_width, :]
+     top_right = image[:mid_height + overlap_height, mid_width - overlap_width:, :]
+     bottom_left = image[mid_height - overlap_height:, :mid_width + overlap_width, :]
+     bottom_right = image[mid_height - overlap_height:, mid_width - overlap_width:, :]
+     return [top_left, top_right, bottom_left, bottom_right]
+
+ get_tool_for_task('<your prompt here>', subdivide_image(image))
+
+ Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions; be sure to translate the offset back to the original image:
+
+ def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
+     # Calculate intersection coordinates
+     x1 = max(b1[0], b2[0])
+     y1 = max(b1[1], b2[1])
+     x2 = min(b1[2], b2[2])
+     y2 = min(b1[3], b2[3])
+
+     # Calculate intersection area
+     if x2 < x1 or y2 < y1:
+         return False  # No overlap
+
+     intersection = (x2 - x1) * (y2 - y1)
+
+     # Calculate union area
+     area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
+     area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
+     union = area1 + area2 - intersection
+
+     # Calculate IoU
+     iou = intersection / union if union > 0 else 0
+
+     return iou >= iou_threshold
+
+ def merge_bounding_box_list(detections):
+     merged_detections = []
+     for detection in detections:
+         matching_box = None
+         for i, other in enumerate(merged_detections):
+             if bounding_box_match(detection["bbox"], other["bbox"]):
+                 matching_box = i
+                 break
+
+         if matching_box is not None:
+             # Keep the box with higher confidence score
+             if detection["score"] > merged_detections[matching_box]["score"]:
+                 merged_detections[matching_box] = detection
+         else:
+             merged_detections.append(detection)
+     return merged_detections
+
+ def sub_image_to_original(elt, sub_image_position, original_size):
+     offset_x, offset_y = sub_image_position
+     return {{
+         "label": elt["label"],
+         "score": elt["score"],
+         "bbox": [
+             (elt["bbox"][0] + offset_x) / original_size[1],
+             (elt["bbox"][1] + offset_y) / original_size[0],
+             (elt["bbox"][2] + offset_x) / original_size[1],
+             (elt["bbox"][3] + offset_y) / original_size[0],
+         ],
+     }}
+
+ def normalized_to_unnormalized(elt, image_size):
+     return {{
+         "label": elt["label"],
+         "score": elt["score"],
+         "bbox": [
+             elt["bbox"][0] * image_size[1],
+             elt["bbox"][1] * image_size[0],
+             elt["bbox"][2] * image_size[1],
+             elt["bbox"][3] * image_size[0],
+         ],
+     }}
+
+ height, width, _ = image.shape
+ mid_width = width // 2
+ mid_height = height // 2
+
+ detection_from_subdivided_images = []
+ for i, sub_image in enumerate(subdivided_images):
+     detections = <your detection function here>("pedestrian", sub_image)
+     unnorm_detections = [
+         normalized_to_unnormalized(
+             detection, (sub_image.shape[0], sub_image.shape[1])
+         )
+         for detection in detections
+     ]
+     offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
+     offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
+     offset_detections = [
+         sub_image_to_original(
+             unnorm_detection, (offset_x, offset_y), (height, width)
+         )
+         for unnorm_detection in unnorm_detections
+     ]
+     detection_from_subdivided_images.extend(offset_detections)
+
+ detections = merge_bounding_box_list(detection_from_subdivided_images)
+ </plan>
+
+
+ <category>small_text: The user is trying to read text that is too small to read properly. If you categorize the problem as small_text, you do not need to use the large_image category.</category>
+ <plan>
+ First try to solve the problem by using an OCR or text extraction tool such as VQA:
+
+ text = <tool to extract text>(image)
+
+ If that does not work you must chain two models together. The first model will be an object detection model to locate the text locations; be sure to clarify when asking `get_tool_for_task`, "I need an object detection model where I can find the regions on the image with text but I don't care about the text itself". Once you have the regions, extract each region out and send it to an OCR model to extract the text itself:
+
+ text_regions = <object detection tool to find text locations>("text", image)
+
+ all_text = []
+ for text_region in text_regions:
+     unnormalized_coords = [
+         text_region[0] * image.shape[1],
+         text_region[1] * image.shape[0],
+         text_region[2] * image.shape[1],
+         text_region[3] * image.shape[0],
+     ]
+     # you can widen the crop to make it easier to read the text
+     crop = image[
+         int(0.95 * unnormalized_coords[1]):int(1.05 * unnormalized_coords[3]),
+         int(0.95 * unnormalized_coords[0]):int(1.05 * unnormalized_coords[2]),
+         :
+     ]
+     text = <ocr tool to extract text>(crop)
+     all_text.append(text)
+ </plan>
+
+
+ <category>color: The user is trying to identify the color of an object in the image.</category>
+ <plan>
+ You need to find the color of objects in the image; you can use the following code to help with this task:
+
+ import numpy as np
+ import cv2
+
+ color_ranges = {{
+     "red_lower": ((0, 100, 100), (int(179 * 20 / 360), 255, 255)),
+     "orange": ((int(179 * 21 / 360), 100, 100), (int(179 * 50 / 360), 255, 255)),
+     "yellow": ((int(179 * 51 / 360), 100, 100), (int(179 * 70 / 360), 255, 255)),
+     "green": ((int(179 * 71 / 360), 100, 100), (int(179 * 150 / 360), 255, 255)),
+     "cyan": ((int(179 * 151 / 360), 100, 100), (int(179 * 180 / 360), 255, 255)),
+     "blue": ((int(179 * 181 / 360), 100, 100), (int(179 * 265 / 360), 255, 255)),
+     "purple": ((int(179 * 266 / 360), 100, 100), (int(179 * 290 / 360), 255, 255)),
+     "pink": ((int(179 * 291 / 360), 100, 100), (int(179 * 330 / 360), 255, 255)),
+     "red_upper": ((int(179 * 331 / 360), 100, 100), (179, 255, 255)),
+     "white": ((0, 0, 200), (179, 25, 255)),
+     "gray": ((0, 0, 50), (179, 50, 200)),
+     "black": ((0, 0, 0), (179, 255, 30)),
+ }}
+
+ def get_color(image, color_ranges):
+     hsv_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
+     detected_colors = {{}}
+     for color, (lower, upper) in color_ranges.items():
+         upper_range = np.array(upper, dtype=np.uint8)
+         lower_range = np.array(lower, dtype=np.uint8)
+         mask = cv2.inRange(hsv_image, lower_range, upper_range)
+         detected_pixels = cv2.countNonZero(mask)
+         detected_colors[color] = detected_pixels
+
+     if "red_lower" in detected_colors and "red_upper" in detected_colors:
+         detected_colors["red"] = detected_colors["red_lower"] + detected_colors["red_upper"]
+         del detected_colors["red_lower"]
+         del detected_colors["red_upper"]
+     return sorted(detected_colors, key=detected_colors.get, reverse=True)[0]
+ </plan>
+
+
+ <category>missing_grid_elements: The user is trying to identify missing elements that are part of a tight, perfectly square grid pattern, and the grid pattern is symmetric and not warped.</category>
+ <plan>
+ You are trying to identify missing elements that existing detectors cannot find. The non-missing instances of the item form a grid pattern that you can exploit to locate the missing item. Assuming you have detections of the non-missing instances you can utilize this code to locate the missing instances:
+
+ widths = [detection["bbox"][2] - detection["bbox"][0] for detection in detections]
+ heights = [detection["bbox"][3] - detection["bbox"][1] for detection in detections]
+
+ med_width = np.median(widths)
+ med_height = np.median(heights)
+
+ sorted_detections = sorted(detections, key=lambda x: x["bbox"][1])
+ rows = []
+ current_row = []
+ prev_y = sorted_detections[0]["bbox"][1]
+
+ for detection in sorted_detections:
+     if abs(detection["bbox"][1] - prev_y) > med_height / 2:
+         rows.append(current_row)
+         current_row = []
+     current_row.append(detection)
+     prev_y = detection["bbox"][1]
+
+ if current_row:
+     rows.append(current_row)
+ sorted_rows = [sorted(row, key=lambda x: x["bbox"][0]) for row in rows]
+ max_cols = max(len(row) for row in sorted_rows)
+ max_rows = len(sorted_rows)
+
+ column_positions = []
+ for col in range(max(len(row) for row in sorted_rows)):
+     column = [row[col] for row in sorted_rows if col < len(row)]
+     med_left = np.median([d["bbox"][0] for d in column])
+     med_right = np.median([d["bbox"][2] for d in column])
+     column_positions.append((med_left, med_right))
+
+ row_positions = []
+ for row in sorted_rows:
+     med_top = np.median([d["bbox"][1] for d in row])
+     med_bottom = np.median([d["bbox"][3] for d in row])
+     row_positions.append((med_top, med_bottom))
+
+
+ def find_element(left, right, top, bottom, elements):
+     center_x = (left + right) / 2
+     center_y = (top + bottom) / 2
+     for element in elements:
+         x_min, y_min, x_max, y_max = element["bbox"]
+         elt_center_x = (x_min + x_max) / 2
+         elt_center_y = (y_min + y_max) / 2
+         if (abs(center_x - elt_center_x) < med_width / 2) and (
+             abs(center_y - elt_center_y) < med_height / 2
+         ):
+             return element
+     return
+
+ missing_elements = []
+ for row in range(max_rows):
+     for col in range(max_cols):
+         left, right = column_positions[col]
+         top, bottom = row_positions[row]
+         match = find_element(left, right, top, bottom, sorted_rows[row])
+         if match is None:
+             missing_elements.append((left, top, right, bottom))
+ </plan>
+
+ <category>missing_horizontal_elements: The user is trying to identify missing elements that are part of a horizontal line pattern.</category>
+ <plan>
+ You are trying to identify missing elements that existing detectors cannot find. The non-missing instances of the item form a horizontal pattern that you can exploit to locate the missing item. Assuming you have detections of the non-missing instances you can utilize this code to locate the missing instances:
+
+ sorted_detections = sorted(detections, key=lambda x: x["bbox"][0] + x["bbox"][1])
+
+ horizontal_lines = []
+ while len(sorted_detections) > 0:
+     current = sorted_detections[0]
+     x_min, y_min, x_max, y_max = current["bbox"]
+     mean_y = (y_min + y_max) / 2
+     line = [
+         det for det in sorted_detections if det["bbox"][1] < mean_y < det["bbox"][3]
+     ]
+     horizontal_lines.append(line)
+
+     for det in line:
+         sorted_detections.remove(det)
+
+ gaps = []
+ for line in horizontal_lines:
+     line = sorted(line, key=lambda x: x["bbox"][0])
+     median_width = np.median(
+         [line[i]["bbox"][2] - line[i]["bbox"][0] for i in range(len(line))]
+     )
+     median_height = np.median(
+         [line[i]["bbox"][3] - line[i]["bbox"][1] for i in range(len(line))]
+     )
+     for i in range(len(line) - 1):
+         w_gap = line[i + 1]["bbox"][0] - line[i]["bbox"][2]
+         if w_gap > (0.5 * median_width):
+             count = np.round(w_gap / median_width)
+             for j in range(int(count)):
+                 gaps.append(
+                     [
+                         line[i]["bbox"][2] + j * median_width,
+                         line[i]["bbox"][1],
+                         line[i]["bbox"][2] + (j + 1) * median_width,
+                         line[i]["bbox"][1] + median_height,
+                     ]
+                 )
+ missing_elements = [{{"label": "missing_element", "score": 1.0, "bbox": gap}} for gap in gaps]
+ </plan>
+
+ <category>missing_vertical_elements: The user is trying to identify missing elements that are part of a vertical line pattern.</category>
+ <plan>
+ You are trying to identify missing elements that existing detectors cannot find. The non-missing instances of the item form a vertical pattern that you can exploit to locate the missing item. Assuming you have detections of the non-missing instances you can utilize this code to locate the missing instances:
+
+ sorted_detections = sorted(detections, key=lambda x: x["bbox"][0] + x["bbox"][1])
+
+ vertical_lines = []
+ while len(sorted_detections) > 0:
+     current = sorted_detections[0]
+     x_min, y_min, x_max, y_max = current["bbox"]
+     mean_x = (x_min + x_max) / 2
+     line = [
+         det for det in sorted_detections if det["bbox"][0] < mean_x < det["bbox"][2]
+     ]
+     vertical_lines.append(line)
+
+     for det in line:
+         sorted_detections.remove(det)
+
+ gaps = []
+ for line in vertical_lines:
+     line = sorted(line, key=lambda x: x["bbox"][1])
+     median_width = np.median(
+         [line[i]["bbox"][2] - line[i]["bbox"][0] for i in range(len(line))]
+     )
+     median_height = np.median(
+         [line[i]["bbox"][3] - line[i]["bbox"][1] for i in range(len(line))]
+     )
+     for i in range(len(line) - 1):
+         h_gap = line[i + 1]["bbox"][1] - line[i]["bbox"][3]
+         if h_gap > (0.5 * median_height):
+             count = np.round(h_gap / median_height)
+             for j in range(int(count)):
+                 gaps.append(
+                     [
+                         line[i]["bbox"][0],
+                         line[i]["bbox"][3] + j * median_height,
+                         line[i]["bbox"][0] + median_width,
+                         line[i]["bbox"][3] + (j + 1) * median_height,
+                     ]
+                 )
+
+ missing_elements = [{{"label": "missing_element", "score": 1.0, "bbox": gap}} for gap in gaps]
+ </plan>
+
+ <category>finding_features_with_video_tracking: The user is trying to track objects in a video and identify features on those objects.</category>
+ <plan>
+ First try to solve the problem using a VQA tool before using the tracking approach for a faster and easier solution:
+
+ answer = <VQA tool to answer your question>("<your prompt here>", image)
+
+ If that does not work, you can track the objects in the video and then identify features on those objects. You need to first get a tool that can track objects in a video, and then for each object find another tool to identify the features on the object. You can use the following code to help with this task:
+
+ track_predictions = <object tracking tool>("object", video_frames)
+
+
+ # Step 1: go through each frame and each prediction and extract the predicted bounding boxes as crops
+ obj_to_info = {{}}
+ for frame, frame_predictions in zip(video_frames, track_predictions):
+     for obj in frame_predictions:
+         if obj["label"] not in obj_to_info:
+             obj_to_info[obj["label"]] = []
+         height, width = frame.shape[:2]
+         # Consider adding a buffer to the crop to ensure the object is fully captured
+         crop = frame[
+             int(obj["bbox"][1] * height) : int(obj["bbox"][3] * height),
+             int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
+             :,
+         ]
+         # For each crop use an object detection tool, VQA tool or classification tool to identify if the object contains the features you want
+         output = <tool, such as VQA, to identify your feature or multiple features>("<your feature(s) here>", crop)
+         obj_to_info[obj["label"]].extend(output)
+
+ print(f"{{len(obj_to_info)}} objects tracked")
+
+ objects_with_info = set()
+ for obj_label, infos in obj_to_info.items():
+     for info in infos:
+         if info["label"] == "<your feature here>":
+             objects_with_info.add(obj_label)
+             break
+
+ print(f"{{len(objects_with_info)}} objects with features found")
+ </plan>
+
+
+ <category>comparing_sizes: The user is trying to compare objects by size or some other metric, e.g. count the smaller objects, or count the larger objects.</category>
+ <plan>
+ You are trying to order objects into comparative buckets, such as small and large, or small, medium and large. To do this you must first detect the objects, then calculate the metric of interest (such as area, circumference, etc.) and finally use a clustering algorithm to group the objects into the desired buckets. You can use the following code to help with this task:
+
+ from sklearn.cluster import KMeans
+ import numpy as np
+
+ detections = <a detection tool that also includes segmentation masks>("object", image)
+
+ def get_area(detection):
+     return np.sum(detection["mask"])
+
+
+ areas = [get_area(detection) for detection in detections]
+ X = np.array(areas)[:, None]
+
+ kmeans = KMeans(n_clusters=<number of clusters>).fit(X)
+ smallest_cluster = np.argmin(kmeans.cluster_centers_)
+ largest_cluster = np.argmax(kmeans.cluster_centers_)
+
+ clusters = kmeans.predict(X)
+ smallest_detections = [detection for detection, cluster in zip(detections, clusters) if cluster == smallest_cluster]
+ largest_detections = [detection for detection, cluster in zip(detections, clusters) if cluster == largest_cluster]
+ </plan>
+
+ <category>nested_structure: The user is trying to count or identify objects but those objects are nested inside other objects.</category>
+ <plan>
+ You are trying to count objects within objects, or a nested structure. You can solve this by first detecting the outer objects, then cropping the image to the bounding box of each outer object and detecting the inner objects. You can use the following code to help with this task:
+
+ all_dets = <an object detection tool>("object", image)
+
+ height, width = image.shape[:2]
+
+ def area(box):
+     return (box[2] - box[0]) * (box[3] - box[1])
+
+ # only check inner detections on top 25 largest outer detections
+ largest_dets = sorted(all_dets, key=lambda x: area(x["bbox"]), reverse=True)[:25]
+ for det in largest_dets:
+     x1 = int(det["bbox"][0] * width)
+     y1 = int(det["bbox"][1] * height)
+     x2 = int(det["bbox"][2] * width)
+     y2 = int(det["bbox"][3] * height)
+
+     crop = image[y1:y2, x1:x2]
+     crop_height, crop_width = crop.shape[:2]
+
+     inner_dets = <an object detection tool>("object", crop)
+     for inner_det in inner_dets:
+         x1_inner = int(inner_det["bbox"][0] * crop_width)
+         y1_inner = int(inner_det["bbox"][1] * crop_height)
+         x2_inner = int(inner_det["bbox"][2] * crop_width)
+         y2_inner = int(inner_det["bbox"][3] * crop_height)
+
+         bbox = [
+             x1 + x1_inner,
+             y1 + y1_inner,
+             x1 + x2_inner,
+             y1 + y2_inner,
+         ]
+         norm_bbox = [
+             bbox[0] / width,
+             bbox[1] / height,
+             bbox[2] / width,
+             bbox[3] / height,
+         ]
+         all_dets.append(
+             {{
+                 "label": inner_det["label"],
+                 "score": inner_det["score"],
+                 "bbox": norm_bbox,
+             }}
+         )
+ </plan>
+
+ <category>relative_position: The user is trying to locate an object relative to other 'anchor' objects such as up, down, left or right.</category>
+ <plan>
+ You are trying to locate objects relative to 'anchor' objects. The 'anchor' objects can be detected fine, but there are many of the other objects and you only want to return the ones that are located relative to the 'anchor' objects as specified by the user. You can use the following code to help with this task:
+
+ # First find a model that can detect the location of the anchor objects
+ anchor_dets = <a model that can detect the location of the anchor objects>("anchor object", image)
+ # Then find a model that can detect the location of the relative objects
+ relative_dets = <a model that can detect the location of the relative objects>("relative object", image)
+
+ # This will give you relative objects 'above' the anchor objects since it's the
+ # distance between the lower left corner of the relative object and the upper left
+ # corner of the anchor object. The remaining functions can be used to get the other
+ # relative positions.
+ def above_distance(box1, box2):
+     return (box1["bbox"][0] - box2["bbox"][0]) ** 2 + (
+         box1["bbox"][3] - box2["bbox"][1]
+     ) ** 2
+
+ def below_distance(box1, box2):
+     return (box1["bbox"][0] - box2["bbox"][0]) ** 2 + (
+         box1["bbox"][1] - box2["bbox"][3]
+     ) ** 2
+
+ def right_distance(box1, box2):
+     return (box1["bbox"][0] - box2["bbox"][2]) ** 2 + (
+         box1["bbox"][1] - box2["bbox"][1]
+     ) ** 2
+
+ def left_distance(box1, box2):
+     return (box1["bbox"][2] - box2["bbox"][0]) ** 2 + (
+         box1["bbox"][1] - box2["bbox"][1]
+     ) ** 2
+
+ closest_boxes = []
+ for anchor_det in anchor_dets:
+     # You can use any of the above functions to get the relative position
+     distances = [
+         (relative_det, above_distance(relative_det, anchor_det))
+         for relative_det in relative_dets
+     ]
+     # You must grab the nearest object for each of the anchors. This line will give
+     # you the box directly above the anchor box (or below, left, right depending on
+     # the function used)
+     closest_box = min(distances, key=lambda x: x[1])[0]
+     closest_boxes.append(closest_box)
+ </plan>
+
+ <category>depth_position: The user is trying to find the furthest or closest object based on depth in the image.</category>
+ <plan>
+ You are trying to order objects by their depth in the image. You can use a depth estimation model to estimate the depth of the objects and then sort the objects based on their mean depth. You can use the following code to help with this task:
+
+ # First find a model to estimate the depth of the image
+ depth = <a depth estimation model>(image)
+ # Then find a model to segment the objects in the image
+ masks = <a segmentation model>("object", image)
+
+ for elt in masks:
+     # Multiply the depth by the mask and keep track of the mean depth for the masked
+     # object
+     depth_mask = depth * elt["mask"]
+     elt["mean_depth"] = depth_mask.mean()
+
+ # Sort the masks by mean depth in reverse, objects that are closer will have higher
+ # mean depth values and further objects will have lower mean depth values.
+ masks = sorted(masks, key=lambda x: x["mean_depth"], reverse=True)
+ closest_mask = masks[0]
+ </plan>
+
+ <category>activity recognition: The user is trying to identify the time period an event occurs in a video.</category>
+ <plan>
+ You are trying to identify the time period an event occurs in a video. You can use an activity recognition model to identify the event and the time period it occurs in. You can use the following code to help with this task:
+
+ preds = <activity recognition model>("a description of the event you want to locate", frames)
+ event_frames = [frame for i, frame in enumerate(frames) if preds[i] == 1.0]
+ </plan>
+
+
+ <category>object_assignment: The user is trying to assign one class of objects to another class, in a many-to-one relationship, such as people sitting at tables.</category>
+ <plan>
+ You are trying to detect or track two classes of objects where multiple of one class can be assigned to one of the other class.
+
+ pred = <object detection or instance segmentation tool>("object 1, object 2", image_or_frame)
+ objects_1 = [p for p in pred if p["label"] == "object 1"]
+ objects_2 = [p for p in pred if p["label"] == "object 2"]
+
+ def box_iou(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+     # Get coordinates of intersection rectangle
+     x1 = max(bbox1[0], bbox2[0])
+     y1 = max(bbox1[1], bbox2[1])
+     x2 = min(bbox1[2], bbox2[2])
+     y2 = min(bbox1[3], bbox2[3])
+
+     # Calculate area of intersection
+     intersection = max(0, x2 - x1) * max(0, y2 - y1)
+     if intersection == 0:
+         return 0.0
+
+     # Calculate area of both boxes
+     box1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+     box2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+
+     # Calculate IoU
+     union = box1_area + box2_area - intersection
+     return intersection / union if union > 0 else 0.0
+
+ # initialize assignment counts
+ objects_2_counts = {{i: 0 for i in range(len(objects_2))}}
+ # you can set a minimum iou threshold for assignment
+ iou_threshold = 0.05
+
+ # You can expand the object 2 box by a certain percentage if needed to help with the
+ # assignment.
+ for object_2 in objects_2:
+     box = object_2["bbox"]
+     # If your camera is at an angle you need to expand the top of the box like so:
+     box = [0.9 * box[0], 0.9 * box[1], 1.1 * box[2], box[3]]
+     # If the camera is top down you should expand all sides of the box like this:
+     box = [0.9 * box[0], 0.9 * box[1], 1.1 * box[2], 1.1 * box[3]]
+
+     object_2["bbox"] = box
+
+ for object_1 in objects_1:
+     best_iou = 0
+     best_object_2_idx = None
+
+     for j, object_2 in enumerate(objects_2):
+         iou = box_iou(object_1["bbox"], object_2["bbox"])
+         if iou > best_iou and iou > iou_threshold:
+             best_iou = iou
+             best_object_2_idx = j
+
+     if best_object_2_idx is not None:
+         objects_2_counts[best_object_2_idx] += 1
+ </plan>
+
+ <category>document_qa: The user is trying to answer questions about a document or extract information from a document.</category>
+ <plan>
+ You are trying to answer questions about a document or extract information from a document. You can use a Document QA or image VQA model to extract the information from the document and answer the questions. You can use the following code to help with this task:
+
+ doc_text = <a document QA or image VQA model>("question", document)
+
+ # If you use a VQA model you can also ask it to extract information in json format:
+ doc_json = <image VQA model>("Please extract the information ... in JSON format with {{'key1': 'result1', ...}}", document)
+ </plan>
+ """
@@ -2,7 +2,7 @@ from typing import Type
 
  from pydantic import BaseModel, Field
 
- from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM, GoogleLMM
+ from vision_agent.lmm import LMM, AnthropicLMM, GoogleLMM
 
 
  class Config(BaseModel):
@@ -106,12 +106,11 @@ class Config(BaseModel):
      )
 
      # for suggestions module
-     suggester: Type[LMM] = Field(default=OpenAILMM)
+     suggester: Type[LMM] = Field(default=AnthropicLMM)
      suggester_kwargs: dict = Field(
          default_factory=lambda: {
-             "model_name": "o1",
+             "model_name": "claude-3-7-sonnet-20250219",
              "temperature": 1.0,
-             "image_detail": "high",
              "image_size": 1024,
          }
      )
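
The updated suggester config is consumed via `Config.create_suggester()` in the new `suggestion.py` further down. The factory itself is not part of this diff; below is a minimal sketch of the presumed behavior, assuming it simply instantiates the configured LMM class with `suggester_kwargs`:

```python
from vision_agent.configs import Config

# Hypothetical illustration only: create_suggester presumably does something like
# config.suggester(**config.suggester_kwargs), i.e. an AnthropicLMM running
# claude-3-7-sonnet-20250219 with the default 1.1.19 config.
config = Config()
suggester = config.create_suggester()
print(type(suggester).__name__)  # expected: "AnthropicLMM" with the defaults above
```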
@@ -528,7 +528,7 @@ def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
          medias: List[np.ndarray]: The images to use for the problem
      """
      try:
-         from .suggestion import suggestion_impl  # type: ignore
+         from .suggestion import suggestion_impl
 
          suggestion = suggestion_impl(prompt, medias)
          print(suggestion)
@@ -0,0 +1,28 @@
+ from typing import List, cast
+
+ import numpy as np
+ from vision_agent.configs import Config
+ from vision_agent.utils.image_utils import convert_to_b64
+
+ from vision_agent.agent.visual_design_patterns import DESIGN_PATTERNS
+
+ CONFIG = Config()
+
+
+ def suggestion_impl(prompt: str, medias: List[np.ndarray]) -> str:
+     suggester = CONFIG.create_suggester()
+     if isinstance(medias, np.ndarray):
+         medias = [medias]
+     all_media_b64 = [
+         "data:image/png;base64," + convert_to_b64(media) for media in medias
+     ]
+     image_sizes = [media.shape[:2] for media in medias]
+     resized = suggester.image_size if hasattr(suggester, "image_size") else 768
+     image_size = f"The original image sizes were: {str(image_sizes)} and have been resized to {resized}x{resized}"
+
+     prompt = DESIGN_PATTERNS.format(request=prompt, image_size=image_size)
+
+     response = cast(
+         str, suggester.generate(prompt, media=all_media_b64, temperature=1.0)
+     )
+     return response
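
For context, a hedged usage sketch of the new helper: it is normally reached through the `suggestion` planner tool shown above, but it can also be called directly. The image path is hypothetical, and an Anthropic API key (typically `ANTHROPIC_API_KEY`) is assumed to be configured for the default suggester:

```python
import cv2

from vision_agent.tools.suggestion import suggestion_impl

# Hypothetical input image; any RGB numpy array works.
image = cv2.cvtColor(cv2.imread("factory_floor.jpg"), cv2.COLOR_BGR2RGB)

# Returns the model's <plan> suggestion as a string.
plan = suggestion_impl("Count the missing bottles in each crate", [image])
print(plan)
```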