vision-agent 0.2.225__tar.gz → 0.2.227__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. {vision_agent-0.2.225 → vision_agent-0.2.227}/PKG-INFO +1 -1
  2. {vision_agent-0.2.225 → vision_agent-0.2.227}/pyproject.toml +1 -1
  3. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/.sim_tools/df.csv +49 -91
  4. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/.sim_tools/embs.npy +0 -0
  5. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/agent_utils.py +13 -0
  6. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
  7. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_coder_v2.py +6 -1
  8. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_planner_prompts_v2.py +42 -33
  9. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_v2.py +30 -22
  10. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/__init__.py +1 -0
  11. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/planner_tools.py +4 -2
  12. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/tools.py +186 -37
  13. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/sim.py +6 -0
  14. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/video_tracking.py +1 -0
  15. {vision_agent-0.2.225 → vision_agent-0.2.227}/LICENSE +0 -0
  16. {vision_agent-0.2.225 → vision_agent-0.2.227}/README.md +0 -0
  17. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/__init__.py +0 -0
  18. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/README.md +0 -0
  19. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/__init__.py +0 -0
  20. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/agent.py +0 -0
  21. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/types.py +0 -0
  22. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent.py +0 -0
  23. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_coder.py +0 -0
  24. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  25. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_planner.py +0 -0
  26. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  27. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  28. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_prompts.py +0 -0
  29. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  30. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/clients/__init__.py +0 -0
  31. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/clients/http.py +0 -0
  32. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/clients/landing_public_api.py +0 -0
  33. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/fonts/__init__.py +0 -0
  34. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  35. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/lmm/__init__.py +0 -0
  36. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/lmm/lmm.py +0 -0
  37. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/lmm/types.py +0 -0
  38. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/meta_tools.py +0 -0
  39. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/prompts.py +0 -0
  40. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/tool_utils.py +0 -0
  41. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/tools/tools_types.py +0 -0
  42. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/__init__.py +0 -0
  43. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/exceptions.py +0 -0
  44. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/execute.py +0 -0
  45. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/image_utils.py +0 -0
  46. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/type_defs.py +0 -0
  47. {vision_agent-0.2.225 → vision_agent-0.2.227}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.225
3
+ Version: 0.2.227
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.225"
7
+ version = "0.2.227"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -65,25 +65,30 @@ desc,doc,name
65
65
  },
66
66
  ]
67
67
  ",owlv2_sam2_instance_segmentation
68
- "'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
69
- 'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
70
- prompt such as category names or referring expressions. The categories in the text
71
- prompt are separated by commas. It returns a list of bounding boxes, label names,
72
- mask file names and associated probability scores.
68
+ "'owlv2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","owlv2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
69
+ 'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
70
+ objects in a video given a text prompt such as category names or referring
71
+ expressions. The categories in the text prompt are separated by commas. It returns
72
+ a list of bounding boxes, label names, masks and associated probability scores and
73
+ is useful for tracking and counting without duplicating counts.
73
74
 
74
75
  Parameters:
75
76
  prompt (str): The prompt to ground to the video.
76
- image (np.ndarray): The image to ground the prompt to.
77
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
78
+ chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
79
+ new objects.
77
80
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
78
81
  fine-tuned model ID here to use it.
79
82
 
80
83
  Returns:
81
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
82
- bounding box, and mask of the detected objects with normalized coordinates
83
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
84
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
85
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
86
- the background.
84
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
85
+ label, segmentation mask and bounding boxes. The outer list represents each
86
+ frame and the inner list is the entities per frame. The detected objects
87
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
88
+ and ymin are the coordinates of the top-left and xmax and ymax are the
89
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
90
+ numpy array where 1 indicates the object and 0 indicates the background.
91
+ The label names are prefixed with their ID, which represents the total count.
87
92
 
88
93
  Example
89
94
  -------
@@ -170,25 +175,28 @@ desc,doc,name
170
175
  },
171
176
  ]
172
177
  ",countgd_sam2_instance_segmentation
173
- "'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
174
- 'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
175
- prompt such as category names or referring expressions. The categories in the text
176
- prompt are separated by commas. It returns a list of bounding boxes, label names,
177
- mask file names and associated probability scores.
178
+ "'countgd_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","countgd_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
179
+ 'countgd_sam2_video_tracking' is a tool that can track and segment multiple
180
+ objects in a video given a text prompt such as category names or referring
181
+ expressions. The categories in the text prompt are separated by commas. It returns
182
+ a list of bounding boxes, label names, masks and associated probability scores and
183
+ is useful for tracking and counting without duplicating counts.
178
184
 
179
185
  Parameters:
180
186
  prompt (str): The prompt to ground to the video.
181
- image (np.ndarray): The image to ground the prompt to.
182
- chunk_length (Optional[int]): The number of frames to re-run florence2 to find
187
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
188
+ chunk_length (Optional[int]): The number of frames to re-run countgd to find
183
189
  new objects.
184
190
 
185
191
  Returns:
186
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
187
- bounding box, and mask of the detected objects with normalized coordinates
188
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
189
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
190
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
191
- the background.
192
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
193
+ label, segmentation mask and bounding boxes. The outer list represents each
194
+ frame and the inner list is the entities per frame. The detected objects
195
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
196
+ and ymin are the coordinates of the top-left and xmax and ymax are the
197
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
198
+ numpy array where 1 indicates the object and 0 indicates the background.
199
+ The label names are prefixed with their ID, which represents the total count.
192
200
 
193
201
  Example
194
202
  -------
@@ -265,12 +273,12 @@ desc,doc,name
265
273
  },
266
274
  ]
267
275
  ",florence2_sam2_instance_segmentation
268
- 'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It can find new objects every 'chunk_length' frames and is useful for tracking and counting without duplicating counts and always outputs scores of 1.0.,"florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
269
- 'florence2_sam2_video_tracking' is a tool that can segment and track multiple
270
- entities in a video given a text prompt such as category names or referring
271
- expressions. You can optionally separate the categories in the text with commas. It
272
- can find new objects every 'chunk_length' frames and is useful for tracking and
273
- counting without duplicating counts and always outputs scores of 1.0.
276
+ "'florence2_sam2_video_tracking' is a tool that can track and segment multiple objects in a video given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, masks and associated probability scores and is useful for tracking and counting without duplicating counts.","florence2_sam2_video_tracking(prompt: str, frames: List[numpy.ndarray], chunk_length: Optional[int] = 10, fine_tune_id: Optional[str] = None) -> List[List[Dict[str, Any]]]:
277
+ 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
278
+ objects in a video given a text prompt such as category names or referring
279
+ expressions. The categories in the text prompt are separated by commas. It returns
280
+ a list of bounding boxes, label names, masks and associated probability scores and
281
+ is useful for tracking and counting without duplicating counts.
274
282
 
275
283
  Parameters:
276
284
  prompt (str): The prompt to ground to the video.
@@ -282,10 +290,13 @@ desc,doc,name
282
290
 
283
291
  Returns:
284
292
  List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
285
- label, segment mask and bounding boxes. The outer list represents each frame
286
- and the inner list is the entities per frame. The label contains the object ID
287
- followed by the label name. The objects are only identified in the first framed
288
- and tracked throughout the video.
293
+ label, segmentation mask and bounding boxes. The outer list represents each
294
+ frame and the inner list is the entities per frame. The detected objects
295
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
296
+ and ymin are the coordinates of the top-left and xmax and ymax are the
297
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
298
+ numpy array where 1 indicates the object and 0 indicates the background.
299
+ The label names are prefixed with their ID, which represents the total count.
289
300
 
290
301
  Example
291
302
  -------
@@ -445,43 +456,6 @@ desc,doc,name
445
456
  >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
446
457
  'Lionel Messi'
447
458
  ",qwen2_vl_video_vqa
448
- "'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects as labels, their regions as masks and their scores.","detr_segmentation(image: numpy.ndarray) -> List[Dict[str, Any]]:
449
- 'detr_segmentation' is a tool that can segment common objects in an
450
- image without any text prompt. It returns a list of detected objects
451
- as labels, their regions as masks and their scores.
452
-
453
- Parameters:
454
- image (np.ndarray): The image used to segment things and objects
455
-
456
- Returns:
457
- List[Dict[str, Any]]: A list of dictionaries containing the score, label
458
- and mask of the detected objects. The mask is binary 2D numpy array where 1
459
- indicates the object and 0 indicates the background.
460
-
461
- Example
462
- -------
463
- >>> detr_segmentation(image)
464
- [
465
- {
466
- 'score': 0.45,
467
- 'label': 'window',
468
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
469
- [0, 0, 0, ..., 0, 0, 0],
470
- ...,
471
- [0, 0, 0, ..., 0, 0, 0],
472
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
473
- },
474
- {
475
- 'score': 0.70,
476
- 'label': 'bird',
477
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
478
- [0, 0, 0, ..., 0, 0, 0],
479
- ...,
480
- [0, 0, 0, ..., 0, 0, 0],
481
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
482
- },
483
- ]
484
- ",detr_segmentation
485
459
  'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
486
460
  'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
487
461
  depth image from a given RGB image. The returned depth image is monochrome and
@@ -522,22 +496,6 @@ desc,doc,name
522
496
  [10, 11, 15, ..., 202, 202, 205],
523
497
  [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
524
498
  ",generate_pose_image
525
- 'vit_image_classification' is a tool that can classify an image. It returns a list of classes and their probability scores based on image content.,"vit_image_classification(image: numpy.ndarray) -> Dict[str, Any]:
526
- 'vit_image_classification' is a tool that can classify an image. It returns a
527
- list of classes and their probability scores based on image content.
528
-
529
- Parameters:
530
- image (np.ndarray): The image to classify or tag
531
-
532
- Returns:
533
- Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
534
- contains a list of labels and other a list of scores.
535
-
536
- Example
537
- -------
538
- >>> vit_image_classification(image)
539
- {""labels"": [""leopard"", ""lemur, otter"", ""bird""], ""scores"": [0.68, 0.30, 0.02]},
540
- ",vit_image_classification
541
499
  'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'. It returns the predicted label and their probability scores based on image content.,"vit_nsfw_classification(image: numpy.ndarray) -> Dict[str, Any]:
542
500
  'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
543
501
  It returns the predicted label and their probability scores based on image content.
@@ -566,7 +524,7 @@ desc,doc,name
566
524
  prompt (str): The question about the video
567
525
  frames (List[np.ndarray]): The reference frames used for the question
568
526
  model (str): The model to use for the inference. Valid values are
569
- 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
527
+ 'qwen2vl', 'gpt4o'.
570
528
  chunk_length_frames (Optional[int]): length of each chunk in frames
571
529
 
572
530
  Returns:
@@ -641,7 +599,7 @@ desc,doc,name
641
599
  >>> closest_distance(det1, det2, image_size)
642
600
  141.42
643
601
  ",minimum_distance
644
- "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 1) -> List[Dict[str, Union[numpy.ndarray, float]]]:
602
+ "'extract_frames_and_timestamps' extracts frames and timestamps from a video which can be a file path, url or youtube link, returns a list of dictionaries with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is the relative time in seconds where the frame was captured. The frame is a numpy array.","extract_frames_and_timestamps(video_uri: Union[str, pathlib.Path], fps: float = 5) -> List[Dict[str, Union[numpy.ndarray, float]]]:
645
603
  'extract_frames_and_timestamps' extracts frames and timestamps from a video
646
604
  which can be a file path, url or youtube link, returns a list of dictionaries
647
605
  with keys ""frame"" and ""timestamp"" where ""frame"" is a numpy array and ""timestamp"" is
@@ -651,7 +609,7 @@ desc,doc,name
651
609
  Parameters:
652
610
  video_uri (Union[str, Path]): The path to the video file, url or youtube link
653
611
  fps (float, optional): The frame rate per second to extract the frames. Defaults
654
- to 1.
612
+ to 5.
655
613
 
656
614
  Returns:
657
615
  List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
@@ -153,6 +153,19 @@ def format_plan_v2(plan: PlanContext) -> str:
153
153
  return plan_str
154
154
 
155
155
 
156
+ def format_conversation(chat: List[AgentMessage]) -> str:
157
+ chat = copy.deepcopy(chat)
158
+ prompt = ""
159
+ for chat_i in chat:
160
+ if chat_i.role == "user":
161
+ prompt += f"USER: {chat_i.content}\n\n"
162
+ elif chat_i.role == "observation" or chat_i.role == "coder":
163
+ prompt += f"OBSERVATION: {chat_i.content}\n\n"
164
+ elif chat_i.role == "conversation":
165
+ prompt += f"AGENT: {chat_i.content}\n\n"
166
+ return prompt
167
+
168
+
156
169
  def format_plans(plans: Dict[str, Any]) -> str:
157
170
  plan_str = ""
158
171
  for k, v in plans.items():
@@ -65,7 +65,7 @@ This is the documentation for the functions you have access to. You may call any
65
65
  7. DO NOT assert the output value, run the code and assert only the output format or data structure.
66
66
  8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect.
67
67
  9. DO NOT import the testing function as it will be available in the testing environment.
68
- 10. Print the output of the function that is being tested.
68
+ 10. Print the output of the function that is being tested and ensure it is not empty.
69
69
  11. Use the output of the function that is being tested as the return value of the testing function.
70
70
  12. Run the testing function in the end and don't assign a variable to its output.
71
71
  13. Output your test code using <code> tags:
@@ -202,7 +202,12 @@ def write_and_test_code(
202
202
  tool_docs=tool_docs,
203
203
  plan=plan,
204
204
  )
205
- code = strip_function_calls(code)
205
+ try:
206
+ code = strip_function_calls(code)
207
+ except Exception:
208
+ # the code may be malformatted, this will fail in the exec call and the agent
209
+ # will attempt to debug it
210
+ pass
206
211
  test = write_test(
207
212
  tester=tester,
208
213
  chat=chat,
@@ -136,8 +136,9 @@ Tool Documentation:
136
136
  countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
137
137
  'countgd_object_detection' is a tool that can detect multiple instances of an
138
138
  object given a text prompt. It is particularly useful when trying to detect and
139
- count a large number of objects. It returns a list of bounding boxes with
140
- normalized coordinates, label names and associated confidence scores.
139
+ count a large number of objects. You can optionally separate object names in the
140
+ prompt with commas. It returns a list of bounding boxes with normalized
141
+ coordinates, label names and associated confidence scores.
141
142
 
142
143
  Parameters:
143
144
  prompt (str): The object that needs to be counted.
@@ -272,40 +273,47 @@ OBSERVATION:
272
273
  [get_tool_for_task output]
273
274
  For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
274
275
 
275
- 'florence2_sam2_video_tracking' is a tool that can segment and track multiple
276
- entities in a video given a text prompt such as category names or referring
277
- expressions. You can optionally separate the categories in the text with commas. It
278
- can find new objects every 'chunk_length' frames and is useful for tracking and
279
- counting without duplicating counts and always outputs scores of 1.0.
276
+ Tool Documentation:
277
+ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
278
+ 'florence2_sam2_video_tracking' is a tool that can track and segment multiple
279
+ objects in a video given a text prompt such as category names or referring
280
+ expressions. The categories in the text prompt are separated by commas. It returns
281
+ a list of bounding boxes, label names, masks and associated probability scores and
282
+ is useful for tracking and counting without duplicating counts.
280
283
 
281
- Parameters:
282
- prompt (str): The prompt to ground to the video.
283
- frames (List[np.ndarray]): The list of frames to ground the prompt to.
284
- chunk_length (Optional[int]): The number of frames to re-run florence2 to find
285
- new objects.
284
+ Parameters:
285
+ prompt (str): The prompt to ground to the video.
286
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
287
+ chunk_length (Optional[int]): The number of frames to re-run florence2 to find
288
+ new objects.
289
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
290
+ fine-tuned model ID here to use it.
286
291
 
287
- Returns:
288
- List[List[Dict[str, Any]]]: A list of list of dictionaries containing the
289
- label,segment mask and bounding boxes. The outer list represents each frame and
290
- the inner list is the entities per frame. The label contains the object ID
291
- followed by the label name. The objects are only identified in the first framed
292
- and tracked throughout the video.
292
+ Returns:
293
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
294
+ label, segmentation mask and bounding boxes. The outer list represents each
295
+ frame and the inner list is the entities per frame. The detected objects
296
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
297
+ and ymin are the coordinates of the top-left and xmax and ymax are the
298
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
299
+ numpy array where 1 indicates the object and 0 indicates the background.
300
+ The label names are prefixed with their ID, which represents the total count.
293
301
 
294
- Example
295
- -------
296
- >>> florence2_sam2_video("car, dinosaur", frames)
297
- [
302
+ Example
303
+ -------
304
+ >>> florence2_sam2_video_tracking("car, dinosaur", frames)
298
305
  [
299
- {
300
- 'label': '0: dinosaur',
301
- 'bbox': [0.1, 0.11, 0.35, 0.4],
302
- 'mask': array([[0, 0, 0, ..., 0, 0, 0],
303
- ...,
304
- [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
305
- },
306
- ],
307
- ...
308
- ]
306
+ [
307
+ {
308
+ 'label': '0: dinosaur',
309
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
310
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
311
+ ...,
312
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
313
+ },
314
+ ],
315
+ ...
316
+ ]
309
317
  [end of get_tool_for_task output]
310
318
  <count>8</count>
311
319
 
@@ -691,7 +699,8 @@ FINALIZE_PLAN = """
691
699
  4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
692
700
  5. Do not include {excluded_tools} tools in your instructions.
693
701
  6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
694
- 6. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
702
+ 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
703
+ 8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
695
704
 
696
705
  <json>
697
706
  {{
@@ -1,13 +1,14 @@
1
1
  import copy
2
2
  import json
3
3
  from pathlib import Path
4
- from typing import Any, Callable, Dict, List, Optional, Union, cast
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
5
5
 
6
6
  from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
7
7
  from vision_agent.agent.agent_utils import (
8
8
  add_media_to_chat,
9
9
  convert_message_to_agentmessage,
10
10
  extract_tag,
11
+ format_conversation,
11
12
  )
12
13
  from vision_agent.agent.types import (
13
14
  AgentMessage,
@@ -22,19 +23,6 @@ from vision_agent.lmm.types import Message
22
23
  from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory
23
24
 
24
25
 
25
- def format_conversation(chat: List[AgentMessage]) -> str:
26
- chat = copy.deepcopy(chat)
27
- prompt = ""
28
- for chat_i in chat:
29
- if chat_i.role == "user":
30
- prompt += f"USER: {chat_i.content}\n\n"
31
- elif chat_i.role == "observation" or chat_i.role == "coder":
32
- prompt += f"OBSERVATION: {chat_i.content}\n\n"
33
- elif chat_i.role == "conversation":
34
- prompt += f"AGENT: {chat_i.content}\n\n"
35
- return prompt
36
-
37
-
38
26
  def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
39
27
  # only keep last 10 messages
40
28
  conv = format_conversation(chat[-10:])
@@ -55,23 +43,39 @@ def check_for_interaction(chat: List[AgentMessage]) -> bool:
55
43
 
56
44
  def extract_conversation_for_generate_code(
57
45
  chat: List[AgentMessage],
58
- ) -> List[AgentMessage]:
46
+ ) -> Tuple[List[AgentMessage], Optional[str]]:
59
47
  chat = copy.deepcopy(chat)
60
48
 
61
49
  # if we are in the middle of an interaction, return all the intermediate planning
62
50
  # steps
63
51
  if check_for_interaction(chat):
64
- return chat
52
+ return chat, None
65
53
 
66
54
  extracted_chat = []
67
55
  for chat_i in chat:
68
56
  if chat_i.role == "user":
69
57
  extracted_chat.append(chat_i)
70
58
  elif chat_i.role == "coder":
71
- if "<final_code>" in chat_i.content and "<final_test>" in chat_i.content:
59
+ if "<final_code>" in chat_i.content:
72
60
  extracted_chat.append(chat_i)
73
61
 
74
- return extracted_chat
62
+ # only keep the last <final_code> and <final_test>
63
+ final_code = None
64
+ extracted_chat_strip_code: List[AgentMessage] = []
65
+ for chat_i in reversed(extracted_chat):
66
+ if "<final_code>" in chat_i.content and final_code is None:
67
+ extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
68
+ final_code = extract_tag(chat_i.content, "final_code")
69
+ if final_code is not None:
70
+ test_code = extract_tag(chat_i.content, "final_test")
71
+ final_code += "\n" + test_code if test_code is not None else ""
72
+
73
+ if "<final_code>" in chat_i.content and final_code is not None:
74
+ continue
75
+
76
+ extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code
77
+
78
+ return extracted_chat_strip_code[-5:], final_code
75
79
 
76
80
 
77
81
  def maybe_run_action(
@@ -81,7 +85,7 @@ def maybe_run_action(
81
85
  code_interpreter: Optional[CodeInterpreter] = None,
82
86
  ) -> Optional[List[AgentMessage]]:
83
87
  if action == "generate_or_edit_vision_code":
84
- extracted_chat = extract_conversation_for_generate_code(chat)
88
+ extracted_chat, _ = extract_conversation_for_generate_code(chat)
85
89
  # there's an issue here because coder.generate_code will send its code_context
86
90
  # to the outside user via its update_callback, but we don't necessarily have
87
91
  # access to that update_callback here, so we re-create the message using
@@ -101,11 +105,15 @@ def maybe_run_action(
101
105
  )
102
106
  ]
103
107
  elif action == "edit_code":
104
- extracted_chat = extract_conversation_for_generate_code(chat)
108
+ extracted_chat, final_code = extract_conversation_for_generate_code(chat)
105
109
  plan_context = PlanContext(
106
110
  plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
107
- instructions=[],
108
- code="",
111
+ instructions=[
112
+ chat_i.content
113
+ for chat_i in extracted_chat
114
+ if chat_i.role == "user" and "<final_code>" not in chat_i.content
115
+ ],
116
+ code=final_code if final_code is not None else "",
109
117
  )
110
118
  context = coder.generate_code_from_plan(
111
119
  extracted_chat, plan_context, code_interpreter=code_interpreter
@@ -63,6 +63,7 @@ from .tools import (
63
63
  video_temporal_localization,
64
64
  vit_image_classification,
65
65
  vit_nsfw_classification,
66
+ custom_object_detection,
66
67
  )
67
68
 
68
69
  __new_tools__ = [
@@ -193,8 +193,10 @@ def get_tool_for_task(
193
193
  - Depth and pose estimation
194
194
  - Video object tracking
195
195
 
196
- Wait until the documentation is printed to use the function so you know what the
197
- input and output signatures are.
196
+ Only ask for one type of task at a time, for example a task needing to identify
197
+ text is one OCR task while needing to identify non-text objects is an OD task. Wait
198
+ until the documentation is printed to use the function so you know what the input
199
+ and output signatures are.
198
200
 
199
201
  Parameters:
200
202
  task: str: The task to accomplish.
@@ -290,6 +290,13 @@ def od_sam2_video_tracking(
290
290
  )
291
291
  function_name = "florence2_object_detection"
292
292
 
293
+ elif od_model == ODModels.CUSTOM:
294
+ segment_results = custom_object_detection(
295
+ deployment_id=fine_tune_id,
296
+ image=segment_frames[frame_number],
297
+ )
298
+ function_name = "custom_object_detection"
299
+
293
300
  else:
294
301
  raise NotImplementedError(
295
302
  f"Object detection model '{od_model}' is not implemented."
@@ -515,24 +522,29 @@ def owlv2_sam2_video_tracking(
515
522
  chunk_length: Optional[int] = 10,
516
523
  fine_tune_id: Optional[str] = None,
517
524
  ) -> List[List[Dict[str, Any]]]:
518
- """'owlv2_sam2_video_tracking' is a tool that can segment multiple objects given a text
519
- prompt such as category names or referring expressions. The categories in the text
520
- prompt are separated by commas. It returns a list of bounding boxes, label names,
521
- mask file names and associated probability scores.
525
+ """'owlv2_sam2_video_tracking' is a tool that can track and segment multiple
526
+ objects in a video given a text prompt such as category names or referring
527
+ expressions. The categories in the text prompt are separated by commas. It returns
528
+ a list of bounding boxes, label names, masks and associated probability scores and
529
+ is useful for tracking and counting without duplicating counts.
522
530
 
523
531
  Parameters:
524
532
  prompt (str): The prompt to ground to the image.
525
- image (np.ndarray): The image to ground the prompt to.
533
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
534
+ chunk_length (Optional[int]): The number of frames to re-run owlv2 to find
535
+ new objects.
526
536
  fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
527
537
  fine-tuned model ID here to use it.
528
538
 
529
539
  Returns:
530
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
531
- bounding box, and mask of the detected objects with normalized coordinates
532
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
533
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
534
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
535
- the background.
540
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
541
+ label, segmentation mask and bounding boxes. The outer list represents each
542
+ frame and the inner list is the entities per frame. The detected objects
543
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
544
+ and ymin are the coordinates of the top-left and xmax and ymax are the
545
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
546
+ numpy array where 1 indicates the object and 0 indicates the background.
547
+ The label names are prefixed with their ID represent the total count.
536
548
 
537
549
  Example
538
550
  -------
@@ -742,11 +754,11 @@ def florence2_sam2_video_tracking(
742
754
  chunk_length: Optional[int] = 10,
743
755
  fine_tune_id: Optional[str] = None,
744
756
  ) -> List[List[Dict[str, Any]]]:
745
- """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
746
- entities in a video given a text prompt such as category names or referring
747
- expressions. You can optionally separate the categories in the text with commas. It
748
- can find new objects every 'chunk_length' frames and is useful for tracking and
749
- counting without duplicating counts and always outputs scores of 1.0.
757
+ """'florence2_sam2_video_tracking' is a tool that can track and segment multiple
758
+ objects in a video given a text prompt such as category names or referring
759
+ expressions. The categories in the text prompt are separated by commas. It returns
760
+ a list of bounding boxes, label names, masks and associated probability scores and
761
+ is useful for tracking and counting without duplicating counts.
750
762
 
751
763
  Parameters:
752
764
  prompt (str): The prompt to ground to the video.
@@ -758,10 +770,13 @@ def florence2_sam2_video_tracking(
758
770
 
759
771
  Returns:
760
772
  List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
761
- label, segment mask and bounding boxes. The outer list represents each frame
762
- and the inner list is the entities per frame. The label contains the object ID
763
- followed by the label name. The objects are only identified in the first framed
764
- and tracked throughout the video.
773
+ label, segmentation mask and bounding boxes. The outer list represents each
774
+ frame and the inner list is the entities per frame. The detected objects
775
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
776
+ and ymin are the coordinates of the top-left and xmax and ymax are the
777
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
778
+ numpy array where 1 indicates the object and 0 indicates the background.
779
+ The label names are prefixed with their ID to represent the total count.
765
780
 
766
781
  Example
767
782
  -------
@@ -1076,24 +1091,27 @@ def countgd_sam2_video_tracking(
1076
1091
  frames: List[np.ndarray],
1077
1092
  chunk_length: Optional[int] = 10,
1078
1093
  ) -> List[List[Dict[str, Any]]]:
1079
- """'countgd_sam2_video_tracking' is a tool that can segment multiple objects given a text
1080
- prompt such as category names or referring expressions. The categories in the text
1081
- prompt are separated by commas. It returns a list of bounding boxes, label names,
1082
- mask file names and associated probability scores.
1094
+ """'countgd_sam2_video_tracking' is a tool that can track and segment multiple
1095
+ objects in a video given a text prompt such as category names or referring
1096
+ expressions. The categories in the text prompt are separated by commas. It returns
1097
+ a list of bounding boxes, label names, masks and associated probability scores and
1098
+ is useful for tracking and counting without duplicating counts.
1083
1099
 
1084
1100
  Parameters:
1085
1101
  prompt (str): The prompt to ground to the image.
1086
- image (np.ndarray): The image to ground the prompt to.
1087
- chunk_length (Optional[int]): The number of frames to re-run florence2 to find
1102
+ frames (List[np.ndarray]): The list of frames to ground the prompt to.
1103
+ chunk_length (Optional[int]): The number of frames to re-run countgd to find
1088
1104
  new objects.
1089
1105
 
1090
1106
  Returns:
1091
- List[Dict[str, Any]]: A list of dictionaries containing the score, label,
1092
- bounding box, and mask of the detected objects with normalized coordinates
1093
- (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
1094
- and xmax and ymax are the coordinates of the bottom-right of the bounding box.
1095
- The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
1096
- the background.
1107
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
1108
+ label, segmentation mask and bounding boxes. The outer list represents each
1109
+ frame and the inner list is the entities per frame. The detected objects
1110
+ have normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin
1111
+ and ymin are the coordinates of the top-left and xmax and ymax are the
1112
+ coordinates of the bottom-right of the bounding box. The mask is binary 2D
1113
+ numpy array where 1 indicates the object and 0 indicates the background.
1114
+ The label names are prefixed with their ID to represent the total count.
1097
1115
 
1098
1116
  Example
1099
1117
  -------
@@ -1206,6 +1224,139 @@ def countgd_visual_prompt_object_detection(
1206
1224
  return bboxes_formatted
1207
1225
 
1208
1226
 
1227
+ def custom_object_detection(
1228
+ deployment_id: str,
1229
+ image: np.ndarray,
1230
+ box_threshold: float = 0.1,
1231
+ ) -> List[Dict[str, Any]]:
1232
+ """'custom_object_detection' is a tool that can detect instances of an
1233
+ object given a deployment_id of a previously finetuned object detection model.
1234
+ It is particularly useful when trying to detect objects that are not well detected by generalist models.
1235
+ It returns a list of bounding boxes with normalized
1236
+ coordinates, label names and associated confidence scores.
1237
+
1238
+ Parameters:
1239
+ deployment_id (str): The id of the finetuned model.
1240
+ image (np.ndarray): The image that contains instances of the object.
1241
+ box_threshold (float, optional): The threshold for detection. Defaults
1242
+ to 0.1.
1243
+
1244
+ Returns:
1245
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
1246
+ bounding box of the detected objects with normalized coordinates between 0
1247
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
1248
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
1249
+ bounding box.
1250
+
1251
+ Example
1252
+ -------
1253
+ >>> custom_object_detection("abcd1234-5678efg", image)
1254
+ [
1255
+ {'score': 0.49, 'label': 'flower', 'bbox': [0.1, 0.11, 0.35, 0.4]},
1256
+ {'score': 0.68, 'label': 'flower', 'bbox': [0.2, 0.21, 0.45, 0.5]},
1257
+ {'score': 0.78, 'label': 'flower', 'bbox': [0.3, 0.35, 0.48, 0.52]},
1258
+ {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
1259
+ ]
1260
+ """
1261
+ image_size = image.shape[:2]
1262
+ if image_size[0] < 1 or image_size[1] < 1:
1263
+ return []
1264
+
1265
+ files = [("image", numpy_to_bytes(image))]
1266
+ payload = {
1267
+ "deployment_id": deployment_id,
1268
+ "confidence": box_threshold,
1269
+ }
1270
+ detections: List[List[Dict[str, Any]]] = send_inference_request(
1271
+ payload, "custom-object-detection", files=files, v2=True
1272
+ )
1273
+
1274
+ bboxes = detections[0]
1275
+ bboxes_formatted = [
1276
+ {
1277
+ "label": bbox["label"],
1278
+ "bbox": normalize_bbox(bbox["bounding_box"], image_size),
1279
+ "score": bbox["score"],
1280
+ }
1281
+ for bbox in bboxes
1282
+ ]
1283
+ display_data = [
1284
+ {
1285
+ "label": bbox["label"],
1286
+ "bbox": bbox["bounding_box"],
1287
+ "score": bbox["score"],
1288
+ }
1289
+ for bbox in bboxes
1290
+ ]
1291
+
1292
+ _display_tool_trace(
1293
+ custom_object_detection.__name__,
1294
+ payload,
1295
+ display_data,
1296
+ files,
1297
+ )
1298
+ return bboxes_formatted
1299
+
1300
+
1301
+ def custom_od_sam2_video_tracking(
1302
+ deployment_id: str,
1303
+ frames: List[np.ndarray],
1304
+ chunk_length: Optional[int] = 10,
1305
+ ) -> List[List[Dict[str, Any]]]:
1306
+ """'custom_od_sam2_video_tracking' is a tool that can segment multiple objects given a
1307
+ custom model with predefined category names.
1308
+ It returns a list of bounding boxes, label names,
1309
+ mask file names and associated probability scores.
1310
+
1311
+ Parameters:
1312
+ deployment_id (str): The id of the deployed custom model.
1313
+ frames (List[np.ndarray]): The list of frames to run the custom model on.
1314
+ chunk_length (Optional[int]): The number of frames to re-run the custom model to find
1315
+ new objects.
1316
+
1317
+ Returns:
1318
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score, label,
1319
+ bounding box, and mask of the detected objects with normalized coordinates
1320
+ (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
1321
+ and xmax and ymax are the coordinates of the bottom-right of the bounding box.
1322
+ The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
1323
+ the background.
1324
+
1325
+ Example
1326
+ -------
1327
+ >>> custom_od_sam2_video_tracking("abcd1234-5678efg", frames)
1328
+ [
1329
+ [
1330
+ {
1331
+ 'label': '0: dinosaur',
1332
+ 'bbox': [0.1, 0.11, 0.35, 0.4],
1333
+ 'mask': array([[0, 0, 0, ..., 0, 0, 0],
1334
+ [0, 0, 0, ..., 0, 0, 0],
1335
+ ...,
1336
+ [0, 0, 0, ..., 0, 0, 0],
1337
+ [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
1338
+ },
1339
+ ],
1340
+ ...
1341
+ ]
1342
+ """
1343
+
1344
+ ret = od_sam2_video_tracking(
1345
+ ODModels.CUSTOM,
1346
+ prompt="",
1347
+ frames=frames,
1348
+ chunk_length=chunk_length,
1349
+ fine_tune_id=deployment_id,
1350
+ )
1351
+ _display_tool_trace(
1352
+ custom_od_sam2_video_tracking.__name__,
1353
+ {},
1354
+ ret["display_data"],
1355
+ ret["files"],
1356
+ )
1357
+ return ret["return_data"] # type: ignore
1358
+
1359
+
1209
1360
  def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
1210
1361
  """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary
1211
1362
  images including regular images or images of documents or presentations. It can be
@@ -1533,7 +1684,7 @@ def video_temporal_localization(
1533
1684
  prompt (str): The question about the video
1534
1685
  frames (List[np.ndarray]): The reference frames used for the question
1535
1686
  model (str): The model to use for the inference. Valid values are
1536
- 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
1687
+ 'qwen2vl', 'gpt4o'.
1537
1688
  chunk_length_frames (Optional[int]): length of each chunk in frames
1538
1689
 
1539
1690
  Returns:
@@ -2102,7 +2253,7 @@ def closest_box_distance(
2102
2253
 
2103
2254
 
2104
2255
  def extract_frames_and_timestamps(
2105
- video_uri: Union[str, Path], fps: float = 1
2256
+ video_uri: Union[str, Path], fps: float = 5
2106
2257
  ) -> List[Dict[str, Union[np.ndarray, float]]]:
2107
2258
  """'extract_frames_and_timestamps' extracts frames and timestamps from a video
2108
2259
  which can be a file path, url or youtube link, returns a list of dictionaries
@@ -2113,7 +2264,7 @@ def extract_frames_and_timestamps(
2113
2264
  Parameters:
2114
2265
  video_uri (Union[str, Path]): The path to the video file, url or youtube link
2115
2266
  fps (float, optional): The frame rate per second to extract the frames. Defaults
2116
- to 1.
2267
+ to 5.
2117
2268
 
2118
2269
  Returns:
2119
2270
  List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
@@ -2636,10 +2787,8 @@ FUNCTION_TOOLS = [
2636
2787
  ocr,
2637
2788
  qwen2_vl_images_vqa,
2638
2789
  qwen2_vl_video_vqa,
2639
- detr_segmentation,
2640
2790
  depth_anything_v2,
2641
2791
  generate_pose_image,
2642
- vit_image_classification,
2643
2792
  vit_nsfw_classification,
2644
2793
  video_temporal_localization,
2645
2794
  flux_image_inpainting,
@@ -133,6 +133,12 @@ class Sim:
133
133
  df: pd.DataFrame,
134
134
  ) -> bool:
135
135
  load_dir = Path(load_dir)
136
+ if (
137
+ not Path(load_dir / "df.csv").exists()
138
+ or not Path(load_dir / "embs.npy").exists()
139
+ ):
140
+ return False
141
+
136
142
  df_load = pd.read_csv(load_dir / "df.csv")
137
143
  if platform.system() == "Windows":
138
144
  df_load["doc"] = df_load["doc"].apply(lambda x: x.replace("\r", ""))
@@ -17,6 +17,7 @@ class ODModels(str, Enum):
17
17
  COUNTGD = "countgd"
18
18
  FLORENCE2 = "florence2"
19
19
  OWLV2 = "owlv2"
20
+ CUSTOM = "custom"
20
21
 
21
22
 
22
23
  def split_frames_into_segments(
File without changes
File without changes