vision-agent 0.2.231__py3-none-any.whl → 0.2.233__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -514,7 +514,7 @@ desc,doc,name
514
514
  >>> vit_nsfw_classification(image)
515
515
  {""label"": ""normal"", ""scores"": 0.68},
516
516
  ",vit_nsfw_classification
517
- 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
517
+ 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
518
518
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
519
519
  value selected for the video. It can detect multiple objects independently per
520
520
  chunk_length_frames given a text prompt such as a referring expression
@@ -527,7 +527,7 @@ desc,doc,name
527
527
  frames (List[np.ndarray]): The reference frames used for the question
528
528
  model (str): The model to use for the inference. Valid values are
529
529
  'qwen2vl', 'gpt4o'.
530
- chunk_length_frames (Optional[int]): length of each chunk in frames
530
+ chunk_length_frames (int): length of each chunk in frames
531
531
 
532
532
  Returns:
533
533
  List[float]: A list of floats with a value of 1.0 if the objects to be found
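The hunk above tightens chunk_length_frames from Optional[int] to a plain int (default 2). A minimal call sketch, assuming video_temporal_localization is importable from vision_agent.tools as the module layout in this diff suggests; the frames are synthetic stand-ins, and a real call requires the hosted inference endpoint and credentials:

    import numpy as np
    from vision_agent.tools import video_temporal_localization  # import path assumed

    # Eight blank frames standing in for frames extracted from a real video.
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]

    # chunk_length_frames is now a plain int; the tool scores each 2-frame chunk
    # and expands the result back to one float (0.0 or 1.0) per frame.
    scores = video_temporal_localization(
        "a person waves at the camera",
        frames,
        model="qwen2vl",
        chunk_length_frames=2,
    )
    assert len(scores) == len(frames)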
@@ -540,16 +540,18 @@ desc,doc,name
540
540
  ",video_temporal_localization
541
541
  "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
542
542
  'flux_image_inpainting' performs image inpainting to fill the masked regions,
543
- given by mask, in the image, given image based on the text prompt and surrounding image context.
544
- It can be used to edit regions of an image according to the prompt given.
543
+ given by mask, in the image, given image based on the text prompt and surrounding
544
+ image context. It can be used to edit regions of an image according to the prompt
545
+ given.
545
546
 
546
547
  Parameters:
547
548
  prompt (str): A detailed text description guiding what should be generated
548
- in the masked area. More detailed and specific prompts typically yield better results.
549
- image (np.ndarray): The source image to be inpainted.
550
- The image will serve as the base context for the inpainting process.
551
- mask (np.ndarray): A binary mask image with 0's and 1's,
552
- where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
549
+ in the masked area. More detailed and specific prompts typically yield
550
+ better results.
551
+ image (np.ndarray): The source image to be inpainted. The image will serve as
552
+ the base context for the inpainting process.
553
+ mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
554
+ areas to be inpainted and 0 indicates areas to be preserved.
553
555
 
554
556
  Returns:
555
557
  np.ndarray: The generated image(s) as a numpy array in RGB format with values
@@ -658,7 +660,7 @@ desc,doc,name
658
660
  -------
659
661
  >>> save_image(image)
660
662
  ",save_image
661
- 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 1) -> str:
663
+ 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
662
664
  'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
663
665
 
664
666
  Parameters:
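Note the default fps for save_video changes from 1 to 5 in this hunk. A short sketch, assuming the import path from this package's tools module; callers that depended on the old default can pin it explicitly:

    import numpy as np
    from vision_agent.tools import save_video  # import path assumed

    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(10)]

    # Passing fps explicitly keeps the output identical across 0.2.231 and
    # 0.2.233; omitting it now produces a 5 fps clip instead of a 1 fps one.
    video_path = save_video(frames, output_video_path="clip.mp4", fps=1)
    print(video_path)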
@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
148
148
  plan_str += "Instructions:\n"
149
149
  for v in plan.instructions:
150
150
  plan_str += f" - {v}\n"
151
- plan_str += "Code:\n"
152
- plan_str += plan.code
151
+
152
+ if plan.code:
153
+ plan_str += "Code:\n"
154
+ plan_str += plan.code
153
155
  return plan_str
154
156
 
155
157
 
@@ -158,7 +160,7 @@ def format_conversation(chat: List[AgentMessage]) -> str:
158
160
  prompt = ""
159
161
  for chat_i in chat:
160
162
  if chat_i.role == "user" or chat_i.role == "coder":
161
- if "<final_code>" in chat_i.role:
163
+ if "<final_code>" in chat_i.content:
162
164
  prompt += f"OBSERVATION: {chat_i.content}\n\n"
163
165
  elif chat_i.role == "user":
164
166
  prompt += f"USER: {chat_i.content}\n\n"
@@ -6,7 +6,7 @@ FEEDBACK = """
6
6
 
7
7
 
8
8
  CODE = """
9
- **Role**: You are an expoert software programmer.
9
+ **Role**: You are an expert software programmer.
10
10
 
11
11
  **Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.
12
12
 
@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
425
425
  chat (List[AgentMessage]): The input to the agent. This should be a list of
426
426
  AgentMessage objects.
427
427
  plan_context (PlanContext): The plan context that was previously generated.
428
+ If plan_context.code is not provided, then the code will be generated
429
+ from the chat messages.
428
430
  code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
429
431
 
430
432
  Returns:
@@ -441,7 +443,7 @@ class VisionAgentCoderV2(AgentCoder):
441
443
 
442
444
  # we don't need the user_interaction response for generating code since it's
443
445
  # already in the plan context
444
- while chat[-1].role != "user":
446
+ while len(chat) > 0 and chat[-1].role != "user":
445
447
  chat.pop()
446
448
 
447
449
  if not chat:
@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
455
457
  int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
456
458
  tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
457
459
 
460
+ # If code is not provided from the plan_context then generate it, else use
461
+ # the provided code and start with testing
462
+ if not plan_context.code.strip():
463
+ code = write_code(
464
+ coder=self.coder,
465
+ chat=int_chat,
466
+ tool_docs=tool_docs,
467
+ plan=format_plan_v2(plan_context),
468
+ )
469
+ else:
470
+ code = plan_context.code
471
+
458
472
  code_context = test_code(
459
473
  tester=self.tester,
460
474
  debugger=self.debugger,
461
475
  chat=int_chat,
462
476
  plan=format_plan_v2(plan_context),
463
- code=plan_context.code,
477
+ code=code,
464
478
  tool_docs=tool_docs,
465
479
  code_interpreter=code_interpreter,
466
480
  media_list=media_list,
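With the change above, generate_code_from_plan writes fresh code when plan_context.code is empty and only reuses code that the planner actually produced. A hedged sketch; the import paths and constructor arguments are inferred from other hunks in this diff, and running it needs the configured LLM and tool credentials:

    from vision_agent.agent import VisionAgentCoderV2               # import path assumed
    from vision_agent.agent.types import AgentMessage, PlanContext  # import path assumed

    agent = VisionAgentCoderV2()

    # An empty code field now triggers write_code from the plan and instructions;
    # a non-empty field skips generation and goes straight to testing/debugging.
    plan_context = PlanContext(
        plan="Count pedestrians by tiling the image and merging detections.",
        instructions=[
            "Run an object detection tool on each tile",
            "Merge overlapping boxes and report the count",
        ],
        code="",
    )
    chat = [AgentMessage(role="user", content="Count the pedestrians in the image")]
    code_context = agent.generate_code_from_plan(chat, plan_context)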
@@ -50,7 +50,7 @@ From this aerial view of a busy urban street, it's difficult to clearly see or c
50
50
  [suggestion 0]
51
51
  The image is very large and the items you need to detect are small.
52
52
 
53
- Step 1: You should start by splitting the image into sections and runing the detection algorithm on each section:
53
+ Step 1: You should start by splitting the image into overlapping sections and running the detection algorithm on each section:
54
54
 
55
55
  def subdivide_image(image):
56
56
  height, width, _ = image.shape
@@ -66,41 +66,96 @@ def subdivide_image(image):
66
66
 
67
67
  get_tool_for_task('<your prompt here>', subdivide_image(image))
68
68
 
69
- Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions:
70
-
71
- def translate_ofset(bbox, offset_x, offset_y):
72
- return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
73
-
74
- def bounding_boxes_overlap(bbox1, bbox2):
75
- if bbox1[2] < bbox2[0] or bbox2[0] > bbox1[2]:
76
- return False
77
- if bbox1[3] < bbox2[1] or bbox2[3] > bbox1[3]:
78
- return False
79
- return True
80
-
81
- def merge_bounding_boxes(bbox1, bbox2):
82
- x_min = min(bbox1[0], bbox2[0])
83
- y_min = min(bbox1[1], bbox2[1])
84
- x_max = max(bbox1[2], bbox2[2])
85
- y_max = max(bbox1[3], bbox2[3])
86
- return (x_min, y_min, x_max, y_max)
87
-
88
- def merge_bounding_box_list(bboxes):
89
- merged_bboxes = []
90
- while bboxes:
91
- bbox = bboxes.pop()
92
- overlap_found = False
93
- for i, other_bbox in enumerate(merged_bboxes):
94
- if bounding_boxes_overlap(bbox, other_bbox):
95
- merged_bboxes[i] = merge_bounding_boxes(bbox, other_bbox)
96
- overlap_found = True
69
+ Step 2: Once you have the detections from each subdivided image, you will need to merge them back together to remove overlapping predictions; be sure to translate the offsets back to the original image:
70
+
71
+ def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
72
+ # Calculate intersection coordinates
73
+ x1 = max(b1[0], b2[0])
74
+ y1 = max(b1[1], b2[1])
75
+ x2 = min(b1[2], b2[2])
76
+ y2 = min(b1[3], b2[3])
77
+
78
+ # Calculate intersection area
79
+ if x2 < x1 or y2 < y1:
80
+ return False # No overlap
81
+
82
+ intersection = (x2 - x1) * (y2 - y1)
83
+
84
+ # Calculate union area
85
+ area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
86
+ area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
87
+ union = area1 + area2 - intersection
88
+
89
+ # Calculate IoU
90
+ iou = intersection / union if union > 0 else 0
91
+
92
+ return iou >= iou_threshold
93
+
94
+ def merge_bounding_box_list(detections):
95
+ merged_detections = []
96
+ for detection in detections:
97
+ matching_box = None
98
+ for i, other in enumerate(merged_detections):
99
+ if bounding_box_match(detection["bbox"], other["bbox"]):
100
+ matching_box = i
97
101
  break
98
- if not overlap_found:
99
- p
100
- merged_bboxes.append(bbox)
101
- return merged_bboxes
102
102
 
103
- detection = merge_bounding_box_list(detection_from_subdivided_images)
103
+ if matching_box is not None:
104
+ # Keep the box with higher confidence score
105
+ if detection["score"] > merged_detections[matching_box]["score"]:
106
+ merged_detections[matching_box] = detection
107
+ else:
108
+ merged_detections.append(detection)
109
+
110
+ def sub_image_to_original(elt, sub_image_position, original_size):
111
+ offset_x, offset_y = sub_image_position
112
+ return {
113
+ "label": elt["label"],
114
+ "score": elt["score"],
115
+ "bbox": [
116
+ (elt["bbox"][0] + offset_x) / original_size[1],
117
+ (elt["bbox"][1] + offset_y) / original_size[0],
118
+ (elt["bbox"][2] + offset_x) / original_size[1],
119
+ (elt["bbox"][3] + offset_y) / original_size[0],
120
+ ],
121
+ }
122
+
123
+ def normalized_to_unnormalized(elt, image_size):
124
+ return {
125
+ "label": elt["label"],
126
+ "score": elt["score"],
127
+ "bbox": [
128
+ elt["bbox"][0] * image_size[1],
129
+ elt["bbox"][1] * image_size[0],
130
+ elt["bbox"][2] * image_size[1],
131
+ elt["bbox"][3] * image_size[0],
132
+ ],
133
+ }
134
+
135
+ height, width, _ = image.shape
136
+ mid_width = width // 2
137
+ mid_height = height // 2
138
+
139
+ detection_from_subdivided_images = []
140
+ for i, sub_image in enumerate(subdivided_images):
141
+ detections = <your detection function here>("pedestrian", sub_image)
142
+ unnorm_detections = [
143
+ normalized_to_unnormalized(
144
+ detection, (sub_image.shape[0], sub_image.shape[1])
145
+ )
146
+ for detection in detections
147
+ ]
148
+ offset_x = i % 2 * (mid_width - int(mid_width * 0.1))
149
+ offset_y = i // 2 * (mid_height - int(mid_height * 0.1))
150
+ offset_detections = [
151
+ sub_image_to_original(
152
+ unnorm_detection, (offset_x, offset_y), (height, width)
153
+ )
154
+ for unnorm_detection in unnorm_detections
155
+ ]
156
+ detection_from_subdivided_images.extend(offset_detections)
157
+
158
+ detections = merge_bounding_box_list(detection_from_subdivided_images)
104
159
  [end of suggestion 0]
105
160
  [end of suggestion]
106
161
  <count>9</count>
@@ -164,36 +219,44 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
164
219
 
165
220
  AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
166
221
  <execute_python>
167
- def translate_ofset(bbox, offset_x, offset_y):
168
- return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
169
-
170
- def bounding_boxes_overlap(bbox1, bbox2):
171
- if bbox1[2] < bbox2[0] or bbox2[0] > bbox1[2]:
172
- return False
173
- if bbox1[3] < bbox2[1] or bbox2[3] > bbox1[3]:
174
- return False
175
- return True
176
-
177
- def merge_bounding_boxes(bbox1, bbox2):
178
- x_min = min(bbox1[0], bbox2[0])
179
- y_min = min(bbox1[1], bbox2[1])
180
- x_max = max(bbox1[2], bbox2[2])
181
- y_max = max(bbox1[3], bbox2[3])
182
- return (x_min, y_min, x_max, y_max)
183
-
184
- def merge_bounding_box_list(bboxes):
185
- merged_bboxes = []
186
- while bboxes:
187
- bbox = bboxes.pop()
188
- overlap_found = False
189
- for i, other_bbox in enumerate(merged_bboxes):
190
- if bounding_boxes_overlap(bbox, other_bbox):
191
- merged_bboxes[i] = merge_bounding_boxes(bbox, other_bbox)
192
- overlap_found = True
222
+ def bounding_box_match(b1: List[float], b2: List[float], iou_threshold: float = 0.1) -> bool:
223
+ # Calculate intersection coordinates
224
+ x1 = max(b1[0], b2[0])
225
+ y1 = max(b1[1], b2[1])
226
+ x2 = min(b1[2], b2[2])
227
+ y2 = min(b1[3], b2[3])
228
+
229
+ # Calculate intersection area
230
+ if x2 < x1 or y2 < y1:
231
+ return False # No overlap
232
+
233
+ intersection = (x2 - x1) * (y2 - y1)
234
+
235
+ # Calculate union area
236
+ area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
237
+ area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
238
+ union = area1 + area2 - intersection
239
+
240
+ # Calculate IoU
241
+ iou = intersection / union if union > 0 else 0
242
+
243
+ return iou >= iou_threshold
244
+
245
+ def merge_bounding_box_list(detections):
246
+ merged_detections = []
247
+ for detection in detections:
248
+ matching_box = None
249
+ for i, other in enumerate(merged_detections):
250
+ if bounding_box_match(detection["bbox"], other["bbox"]):
251
+ matching_box = i
193
252
  break
194
- if not overlap_found:
195
- merged_bboxes.append(bbox)
196
- return merged_bboxes
253
+
254
+ if matching_box is not None:
255
+ # Keep the box with higher confidence score
256
+ if detection["score"] > merged_detections[matching_box]["score"]:
257
+ merged_detections[matching_box] = detection
258
+ else:
259
+ merged_detections.append(detection)
197
260
 
198
261
  detections = []
199
262
  for region in subdivide_image(image):
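A quick numeric check of the bounding_box_match rule used in both snippets above; the helper is re-declared here in condensed form so the check runs on its own, with two box pairs that land just above and just below the 0.1 IoU threshold:

    def bounding_box_match(b1, b2, iou_threshold=0.1):
        # Condensed re-declaration of the helper shown in the prompt above.
        x1, y1 = max(b1[0], b2[0]), max(b1[1], b2[1])
        x2, y2 = min(b1[2], b2[2]), min(b1[3], b2[3])
        if x2 < x1 or y2 < y1:
            return False
        inter = (x2 - x1) * (y2 - y1)
        union = (b1[2] - b1[0]) * (b1[3] - b1[1]) + (b2[2] - b2[0]) * (b2[3] - b2[1]) - inter
        return (inter / union if union > 0 else 0) >= iou_threshold

    print(bounding_box_match([0, 0, 10, 10], [8, 0, 18, 10]))  # True:  IoU = 20/180 ~= 0.11
    print(bounding_box_match([0, 0, 10, 10], [9, 0, 19, 10]))  # False: IoU = 10/190 ~= 0.05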
@@ -458,6 +521,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
458
521
  - "DocQA" - answering questions about a document or extracting information from a document.
459
522
  - "video object tracking" - tracking objects in a video.
460
523
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.
524
+ - "temporal localization" - localizing the time period an event occurs in a video.
525
+ - "inpainting" - filling in masked parts of an image.
461
526
 
462
527
  Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
463
528
  """
@@ -651,22 +716,24 @@ PICK_TOOL = """
651
716
  """
652
717
 
653
718
  FINALIZE_PLAN = """
654
- **Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
719
+ **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent can write a program to accomplish the user request.
655
720
 
656
- **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agnet to write a program to accomplish the user request.
721
+ **Documentation**: You can use these tools to help you visualize or save the output:
722
+ {tool_desc}
657
723
 
658
724
  **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
659
725
  {planning}
660
726
 
661
727
  **Instructions**:
662
728
  1. Summarize the plan that the planning agent found.
663
- 2. Write a single function that solves the problem based on what the planner found.
664
- 3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
729
+ 2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
730
+ 3. Only use tools obtained from calling `get_tool_for_task`.
665
731
  4. Do not include {excluded_tools} tools in your instructions.
666
- 5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
667
- 6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
668
- 7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
669
- 8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
732
+ 5. Ensure the function is well documented and easy to understand.
733
+ 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
734
+ 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
735
+ 8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
736
+ 9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
670
737
 
671
738
  <json>
672
739
  {{
@@ -326,6 +326,7 @@ def create_finalize_plan(
326
326
  return [], PlanContext(plan="", instructions=[], code="")
327
327
 
328
328
  prompt = FINALIZE_PLAN.format(
329
+ tool_desc=UTIL_DOCSTRING,
329
330
  planning=get_planning(chat),
330
331
  excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
331
332
  )
@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
42
42
  - Understanding documents
43
43
  - Pose estimation
44
44
  - Visual question answering for both images and videos
45
+ - Action recognition in videos
46
+ - Image inpainting
45
47
 
46
48
  How can I help you?</response>
47
49
  --- END EXAMPLE2 ---
@@ -54,7 +56,8 @@ Here is the current conversation so far:
54
56
 
55
57
  **Instructions**:
56
58
  1. Only respond with a single <response> tag and a single <action> tag.
57
- 2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
59
+ 2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf unless they have directly asked you to.
60
+ 3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
58
61
 
59
62
  <response>Your response to the user's message</response>
60
63
  <action>The action you want to take from **Actions**</action>
@@ -27,7 +27,7 @@ CONFIG = Config()
27
27
 
28
28
 
29
29
  def extract_conversation(
30
- chat: List[AgentMessage],
30
+ chat: List[AgentMessage], include_conv: bool = False
31
31
  ) -> Tuple[List[AgentMessage], Optional[str]]:
32
32
  chat = copy.deepcopy(chat)
33
33
 
@@ -43,6 +43,8 @@ def extract_conversation(
43
43
  elif chat_i.role == "coder":
44
44
  if "<final_code>" in chat_i.content:
45
45
  extracted_chat.append(chat_i)
46
+ elif include_conv and chat_i.role == "conversation":
47
+ extracted_chat.append(chat_i)
46
48
 
47
49
  # only keep the last <final_code> and <final_test>
48
50
  final_code = None
@@ -64,10 +66,9 @@ def extract_conversation(
64
66
 
65
67
 
66
68
  def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
67
- extracted_chat, _ = extract_conversation(chat)
68
- extracted_chat = extracted_chat[-10:]
69
+ extracted_chat, _ = extract_conversation(chat, include_conv=True)
69
70
 
70
- conv = format_conversation(chat)
71
+ conv = format_conversation(extracted_chat)
71
72
  prompt = CONVERSATION.format(
72
73
  conversation=conv,
73
74
  )
@@ -112,14 +113,17 @@ def maybe_run_action(
112
113
  )
113
114
  ]
114
115
  elif action == "edit_code":
116
+ # We don't want to pass code in plan_context.code so the coder will generate
117
+ # new code from plan_context.plan
115
118
  plan_context = PlanContext(
116
- plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
119
+ plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
120
+ + ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
117
121
  instructions=[
118
122
  chat_i.content
119
123
  for chat_i in extracted_chat
120
124
  if chat_i.role == "user" and "<final_code>" not in chat_i.content
121
125
  ],
122
- code=final_code if final_code is not None else "",
126
+ code="",
123
127
  )
124
128
  context = coder.generate_code_from_plan(
125
129
  extracted_chat, plan_context, code_interpreter=code_interpreter
@@ -260,7 +264,7 @@ class VisionAgentV2(Agent):
260
264
  # do not append updated_chat to return_chat because the observation
261
265
  # from running the action will have already been added via the callbacks
262
266
  obs_response_context = run_conversation(
263
- self.agent, return_chat + updated_chat
267
+ self.agent, int_chat + return_chat + updated_chat
264
268
  )
265
269
  return_chat.append(
266
270
  AgentMessage(role="conversation", content=obs_response_context)
@@ -2,7 +2,7 @@ import inspect
2
2
  import logging
3
3
  import tempfile
4
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
- from typing import Any, Callable, Dict, List, Optional, Tuple, cast
5
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
6
6
 
7
7
  import libcst as cst
8
8
  import numpy as np
@@ -235,7 +235,9 @@ def run_tool_testing(
235
235
 
236
236
 
237
237
  def get_tool_for_task(
238
- task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
238
+ task: str,
239
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
240
+ exclude_tools: Optional[List[str]] = None,
239
241
  ) -> None:
240
242
  """Given a task and one or more images this function will find a tool to accomplish
241
243
  the jobs. It prints the tool documentation and thoughts on why it chose the tool.
@@ -248,6 +250,8 @@ def get_tool_for_task(
248
250
  - VQA
249
251
  - Depth and pose estimation
250
252
  - Video object tracking
253
+ - Video temporal localization (action recognition)
254
+ - Image inpainting
251
255
 
252
256
  Only ask for one type of task at a time, for example a task needing to identify
253
257
  text is one OCR task while needing to identify non-text objects is an OD task. Wait
@@ -256,7 +260,8 @@ def get_tool_for_task(
256
260
 
257
261
  Parameters:
258
262
  task: str: The task to accomplish.
259
- images: List[np.ndarray]: The images to use for the task.
263
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
264
+ for the task. If a key is provided, it is used as the file name.
260
265
  exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
261
266
  recommendations. This is helpful if you are calling get_tool_for_task twice
262
267
  and do not want the same tool recommended.
@@ -266,20 +271,29 @@ def get_tool_for_task(
266
271
 
267
272
  Examples
268
273
  --------
269
- >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
274
+ >>> get_tool_for_task(
275
+ >>> "Give me an OCR model that can find 'hot chocolate' in the image",
276
+ >>> {"image": [image]})
277
+ >>> get_tool_for_taks(
278
+ >>> "I need a tool that can paint a background for this image and maks",
279
+ >>> {"image": [image], "mask": [mask]})
270
280
  """
271
281
  tool_tester = CONFIG.create_tool_tester()
272
282
  tool_chooser = CONFIG.create_tool_chooser()
273
283
 
284
+ if isinstance(images, list):
285
+ images = {"image": images}
286
+
274
287
  with (
275
288
  tempfile.TemporaryDirectory() as tmpdirname,
276
289
  CodeInterpreterFactory.new_instance() as code_interpreter,
277
290
  ):
278
291
  image_paths = []
279
- for i, image in enumerate(images[:3]):
280
- image_path = f"{tmpdirname}/image_{i}.png"
281
- Image.fromarray(image).save(image_path)
282
- image_paths.append(image_path)
292
+ for k in images.keys():
293
+ for i, image in enumerate(images[k]):
294
+ image_path = f"{tmpdirname}/{k}_{i}.png"
295
+ Image.fromarray(image).save(image_path)
296
+ image_paths.append(image_path)
283
297
 
284
298
  code, tool_docs_str, tool_output = run_tool_testing(
285
299
  task, image_paths, tool_tester, exclude_tools, code_interpreter
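With the change above, get_tool_for_task also accepts a mapping from names to image lists, and each key becomes the saved file name prefix (for example mask_0.png). A hedged usage sketch; the import path is assumed from the planner_tools module shown in this diff, the arrays are synthetic placeholders, and a real call requires the planner's runtime and API credentials:

    import numpy as np
    from vision_agent.tools.planner_tools import get_tool_for_task  # import path assumed

    image = np.zeros((512, 512, 3), dtype=np.uint8)
    mask = np.zeros((512, 512, 3), dtype=np.uint8)  # 1s mark the region of interest
    mask[128:256, 128:256] = 1

    # A plain list still works (it is wrapped into {"image": [...]} internally);
    # the dict form lets each array keep a meaningful file name.
    get_tool_for_task(
        "I need a tool that can paint a new background into the masked region",
        {"image": [image], "mask": [mask]},
    )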
@@ -300,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:
300
314
 
301
315
 
302
316
  def get_tool_for_task_human_reviewer(
303
- task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
317
+ task: str,
318
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
319
+ exclude_tools: Optional[List[str]] = None,
304
320
  ) -> None:
305
321
  # NOTE: this will have the same documentation as get_tool_for_task
306
322
  tool_tester = CONFIG.create_tool_tester()
307
323
 
324
+ if isinstance(images, list):
325
+ images = {"image": images}
326
+
308
327
  with (
309
328
  tempfile.TemporaryDirectory() as tmpdirname,
310
329
  CodeInterpreterFactory.new_instance() as code_interpreter,
311
330
  ):
312
331
  image_paths = []
313
- for i, image in enumerate(images[:3]):
314
- image_path = f"{tmpdirname}/image_{i}.png"
315
- Image.fromarray(image).save(image_path)
316
- image_paths.append(image_path)
332
+ for k in images.keys():
333
+ for i, image in enumerate(images[k]):
334
+ image_path = f"{tmpdirname}/{k}_{i}.png"
335
+ Image.fromarray(image).save(image_path)
336
+ image_paths.append(image_path)
317
337
 
318
338
  tools = [
319
339
  t.__name__
@@ -1727,22 +1727,46 @@ def video_temporal_localization(
1727
1727
  }
1728
1728
  payload["chunk_length_frames"] = chunk_length_frames
1729
1729
 
1730
- data = send_inference_request(
1731
- payload, "video-temporal-localization", files=files, v2=True
1732
- )
1730
+ segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
1731
+
1732
+ def _apply_temporal_localization(
1733
+ segment: List[np.ndarray],
1734
+ ) -> List[float]:
1735
+ segment_buffer_bytes = [("video", frames_to_bytes(segment))]
1736
+ data = send_inference_request(
1737
+ payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
1738
+ )
1739
+ chunked_data = [cast(float, value) for value in data]
1740
+
1741
+ full_data = []
1742
+ for value in chunked_data:
1743
+ full_data.extend([value] * chunk_length_frames)
1744
+
1745
+ return full_data[: len(segment)]
1746
+
1747
+ with ThreadPoolExecutor() as executor:
1748
+ futures = {
1749
+ executor.submit(_apply_temporal_localization, segment): segment_index
1750
+ for segment_index, segment in enumerate(segments)
1751
+ }
1752
+
1753
+ localization_per_segment = []
1754
+ for future in as_completed(futures):
1755
+ segment_index = futures[future]
1756
+ localization_per_segment.append((segment_index, future.result()))
1757
+
1758
+ localization_per_segment = [
1759
+ x[1] for x in sorted(localization_per_segment, key=lambda x: x[0]) # type: ignore
1760
+ ]
1761
+ localizations = cast(List[float], [e for o in localization_per_segment for e in o])
1762
+
1733
1763
  _display_tool_trace(
1734
1764
  video_temporal_localization.__name__,
1735
1765
  payload,
1736
- data,
1766
+ localization_per_segment,
1737
1767
  files,
1738
1768
  )
1739
- chunked_data = [cast(float, value) for value in data]
1740
-
1741
- full_data = []
1742
- for value in chunked_data:
1743
- full_data.extend([value] * chunk_length_frames)
1744
-
1745
- return full_data[: len(frames)]
1769
+ return localizations
1746
1770
 
1747
1771
 
1748
1772
  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
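The reworked implementation above splits long videos into 50-frame segments, sends the segments to the endpoint in parallel, and stitches the per-segment results back together in segment order. A standalone sketch of that fan-out/fan-in pattern with a mocked per-segment call, so the ordering logic can be checked without the inference service:

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import List

    def split_into_segments(frames: List[int], segment_size: int = 50) -> List[List[int]]:
        # Stand-in for split_frames_into_segments with no overlap.
        return [frames[i : i + segment_size] for i in range(0, len(frames), segment_size)]

    def mock_localization(segment: List[int]) -> List[float]:
        # Stand-in for the remote call: one value per frame in the segment.
        return [1.0 if f % 7 == 0 else 0.0 for f in segment]

    frames = list(range(120))  # "frames" are just indices here
    segments = split_into_segments(frames)

    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(mock_localization, seg): i for i, seg in enumerate(segments)}
        results = [(futures[f], f.result()) for f in as_completed(futures)]

    # as_completed yields in completion order, so sort by segment index before
    # flattening to keep frame order intact.
    ordered = [scores for _, scores in sorted(results)]
    localizations = [s for seg_scores in ordered for s in seg_scores]
    assert len(localizations) == len(frames)
    print(localizations[:10])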
@@ -2028,16 +2052,18 @@ def flux_image_inpainting(
2028
2052
  mask: np.ndarray,
2029
2053
  ) -> np.ndarray:
2030
2054
  """'flux_image_inpainting' performs image inpainting to fill the masked regions,
2031
- given by mask, in the image, given image based on the text prompt and surrounding image context.
2032
- It can be used to edit regions of an image according to the prompt given.
2055
+ given by mask, in the image, given image based on the text prompt and surrounding
2056
+ image context. It can be used to edit regions of an image according to the prompt
2057
+ given.
2033
2058
 
2034
2059
  Parameters:
2035
2060
  prompt (str): A detailed text description guiding what should be generated
2036
- in the masked area. More detailed and specific prompts typically yield better results.
2037
- image (np.ndarray): The source image to be inpainted.
2038
- The image will serve as the base context for the inpainting process.
2039
- mask (np.ndarray): A binary mask image with 0's and 1's,
2040
- where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
2061
+ in the masked area. More detailed and specific prompts typically yield
2062
+ better results.
2063
+ image (np.ndarray): The source image to be inpainted. The image will serve as
2064
+ the base context for the inpainting process.
2065
+ mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
2066
+ areas to be inpainted and 0 indicates areas to be preserved.
2041
2067
 
2042
2068
  Returns:
2043
2069
  np.ndarray: The generated image(s) as a numpy array in RGB format with values
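A minimal usage sketch for the docstring above, with a binary mask marking a rectangle to repaint. The import path is assumed from this package's tools module, the image is a synthetic placeholder, and a real call hits the hosted inpainting endpoint, so it needs network access and credentials:

    import numpy as np
    from vision_agent.tools import flux_image_inpainting  # import path assumed

    # Synthetic 512x512 RGB image; in practice this would come from load_image.
    image = np.zeros((512, 512, 3), dtype=np.uint8)

    # Binary mask: 1 marks the rectangle to repaint, 0 keeps the original pixels.
    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[128:256, 128:256] = 1

    result = flux_image_inpainting(
        "a bright red balloon floating in front of a plain wall",
        image,
        mask,
    )
    print(result.shape, result.dtype)  # RGB array per the docstring above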
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.231
3
+ Version: 0.2.233
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -1,23 +1,23 @@
1
- vision_agent/.sim_tools/df.csv,sha256=XdcgkjC7CjF_CoJnXmFkYOPUBwHemiwsauh62b1eh1M,42472
1
+ vision_agent/.sim_tools/df.csv,sha256=oVUuyoVTCnayorbGUAvWed8l1YA_-rF9rSF78fMtvuU,42468
2
2
  vision_agent/.sim_tools/embs.npy,sha256=YJe8EcKVNmeX_75CS2T1sbY-sUS_1HQAMT-34zc18a0,254080
3
3
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
4
4
  vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
5
5
  vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
6
6
  vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1565
7
- vision_agent/agent/agent_utils.py,sha256=IXxN9XruaeNTreUrdztb3kWJhimpsdH6hjv6xT4jg1Q,14062
7
+ vision_agent/agent/agent_utils.py,sha256=4RgG8SUEGuMFHkIt0jCFkRQF6G1PZp3Ub4LuVYKF7Ic,14092
8
8
  vision_agent/agent/types.py,sha256=dIdxATH_PP76pD5Wfo0oofWt6iPQh0vpf48QbEQSzhs,2472
9
9
  vision_agent/agent/vision_agent.py,sha256=fH9NOLk7twL1fPr9vLSqkaYhah-gfDWfTOVF2FfMyzI,23461
10
10
  vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
11
11
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=_kkPLezUVnBXieNPlxMQab_6J6P7F-aa6ItF5NhZZsM,12281
12
- vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=idmSMfxebPULqqvllz3gqRzGDchEvS5dkGngvBs4PGo,4872
13
- vision_agent/agent/vision_agent_coder_v2.py,sha256=ZR2PQoMqNM6yK3vn_0rrCJf_EplRKye7t7bVjyl51ls,16476
12
+ vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=NUMWq-Lxq5JmmyWs3C5O_1Hm-zCbf9I_yPK5UtWGspE,4871
13
+ vision_agent/agent/vision_agent_coder_v2.py,sha256=yQYcO0s4BI9pWaAQQAVtkwWa3UF5w0iLKvwpeJ6iegM,17077
14
14
  vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
15
15
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=rYRdJthc-sQN57VgCBKrF09Sd73BSxcBdjNe6C4WNZ8,6837
16
- vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=5xTx93lNpoyT4eAD9jicwDyDAkuW7eQqicr17zCjrQw,33337
17
- vision_agent/agent/vision_agent_planner_v2.py,sha256=7hBQdg9y4oCLDiQ54Kh12_uIMywedKKNPWiKPRA01cQ,20568
16
+ vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=U88z1Y7CifFs7t53aUrl8qjWtBYs0f_F5vyg_0VYJko,35528
17
+ vision_agent/agent/vision_agent_planner_v2.py,sha256=NUyi57zxCmOO004_cJcCCDa4UgcKSWB1WCGuyOhhXQE,20602
18
18
  vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
19
- vision_agent/agent/vision_agent_prompts_v2.py,sha256=AW_bW1boGiCLyLFd3h4GQenfDACttQagDHwpBkSW4Xo,2518
20
- vision_agent/agent/vision_agent_v2.py,sha256=335VT0hk0jkB14y4W3cJo5ueEu1wY_jjN-R_m2xaQ30,10752
19
+ vision_agent/agent/vision_agent_prompts_v2.py,sha256=Wyxa15NOe75PefAfw3_RRwvgjg8YVqCrU7WvvWoYJpk,2733
20
+ vision_agent/agent/vision_agent_v2.py,sha256=86_pPdkkMBk08TTFZ7zu9QG37Iz9uI8Nmt79wwm_EIA,11053
21
21
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
23
23
  vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -33,10 +33,10 @@ vision_agent/lmm/lmm.py,sha256=arwfYPWme_RxCxSpEQ0ZkpHO22GFPCwVeoSvXqLPOAk,19288
33
33
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
34
34
  vision_agent/tools/__init__.py,sha256=zopUrANPx7p0NGy6BxmEaYhDrj8DX8w7BLfgmCbz-mU,2897
35
35
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
36
- vision_agent/tools/planner_tools.py,sha256=Mk3N-I-Qs4ezeyv8EL9BxdxmJG5oWiH5bFkvgwJKB0s,14660
36
+ vision_agent/tools/planner_tools.py,sha256=8pJZCGGOGIqGiV2or52BjyRP6eDlporuQ2hXCIHfLTQ,15382
37
37
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
38
38
  vision_agent/tools/tool_utils.py,sha256=xJRWF96Ge9RvhhVHrOtifjUYoc4HIJ2y7c2VOQ2Lp8s,10152
39
- vision_agent/tools/tools.py,sha256=3B3xWFVA3qfAO6ySSQ2yUPUAiTrgJomL48hLO_VP6RQ,106015
39
+ vision_agent/tools/tools.py,sha256=Eb2paiXjik0HyGeZzXctTpJCLG0V3NnNL9awtaB8HN4,107011
40
40
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
41
41
  vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
42
42
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -46,7 +46,7 @@ vision_agent/utils/sim.py,sha256=DYya76dYVtifFyXilMLxBzGgyfyeqhEwU4RJ4894lCI,979
46
46
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
47
47
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
48
48
  vision_agent/utils/video_tracking.py,sha256=wK5dOutqV2t2aeaxedstCBa7xy-NNQE0-QZqKu1QUds,9498
49
- vision_agent-0.2.231.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
50
- vision_agent-0.2.231.dist-info/METADATA,sha256=N8t9F4hZ4bgyZeDhrVepMZzO5dtRmzRB8VI6fq1fFAA,5760
51
- vision_agent-0.2.231.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
52
- vision_agent-0.2.231.dist-info/RECORD,,
49
+ vision_agent-0.2.233.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
50
+ vision_agent-0.2.233.dist-info/METADATA,sha256=EoNuerRth0lHRC7TK2Xh7w6V__YtUJraKk9yN8AMx2U,5760
51
+ vision_agent-0.2.233.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
52
+ vision_agent-0.2.233.dist-info/RECORD,,