vision-agent 0.2.220__py3-none-any.whl → 0.2.222__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -330,11 +330,11 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
330
330
 
331
331
  OBSERVATION:
332
332
  [get_tool_for_task output]
333
- owl_v2_image performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
333
+ owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
334
334
 
335
- 'owl_v2_image' is a tool that can detect and count multiple objects given a text
336
- prompt such as category names or referring expressions on images. The categories in
337
- text prompt are separated by commas. It returns a list of bounding boxes with
335
+ 'owlv2_object_detection' is a tool that can detect and count multiple objects given a
336
+ text prompt such as category names or referring expressions on images. The categories
337
+ in text prompt are separated by commas. It returns a list of bounding boxes with
338
338
  normalized coordinates, label names and associated probability scores.
339
339
 
340
340
  Parameters:
@@ -354,7 +354,7 @@ Returns:
354
354
 
355
355
  Example
356
356
  -------
357
- >>> owl_v2_image("car, dinosaur", image)
357
+ >>> owlv2_object_detection("car, dinosaur", image)
358
358
  [
359
359
  {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
360
360
  {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
@@ -375,7 +375,7 @@ for frame, frame_predictions in zip(frames, track_predictions):
375
375
  int(obj["bbox"][0] * width) : int(obj["bbox"][2] * width),
376
376
  :,
377
377
  ]
378
- detections = owl_v2_image("tape", crop)
378
+ detections = owlv2_object_detection("tape", crop)
379
379
  obj_to_info[obj["label"]].extend(detections)
380
380
 
381
381
 
@@ -441,7 +441,8 @@ PICK_PLAN = """
441
441
 
442
442
  CATEGORIZE_TOOL_REQUEST = """
443
443
  You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
444
- - "object detection and counting" - detecting objects or counting objects from a text prompt in an image or video.
444
+ - "object detection and counting" - detecting objects or counting objects from a text prompt in an image.
445
+ - "instance segmentation" - segmenting objects in an image given a text prompt.
445
446
  - "classification" - classifying objects in an image given a text prompt.
446
447
  - "segmentation" - segmenting objects in an image or video given a text prompt.
447
448
  - "OCR" - extracting text from an image.
@@ -477,8 +478,9 @@ TEST_TOOLS = """
477
478
  1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print their output along with other relevant information.
478
479
  2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
479
480
  3. Your test case MUST run only on the given images which are {media}
480
- 4. Print this final dictionary.
481
- 5. Output your code in the following format wrapped in <code> tags:
481
+ 4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
482
+ 5. Print this final dictionary.
483
+ 6. Output your code in the following format wrapped in <code> tags:
482
484
  <code>
483
485
  # Your code here
484
486
  </code>
@@ -494,17 +496,17 @@ Count the number of pedestrians across all the images.
494
496
 
495
497
  <code>
496
498
  from concurrent.futures import ThreadPoolExecutor, as_completed
497
- from vision_agent.tools import load_image, owl_v2_image, florence2_phrase_grounding, countgd_object_detection
499
+ from vision_agent.tools import load_image, owlv2_object_detection, florence2_object_detection, countgd_object_detection
498
500
 
499
501
  # process functions in a try catch so that if it fails it doesn't cause `as_completed` to hang
500
- def process_owl_v2(image_paths):
502
+ def process_owlv2(image_paths):
501
503
  try:
502
504
  results = []
503
505
  for image_path in image_paths:
504
506
  image = load_image(image_path)
505
- results.extend(owl_v2_image("person", image))
507
+ results.extend(owlv2_object_detection("person", image))
506
508
  except Exception as e:
507
- results = f"Encountered error when executing process_owl_v2: {str(e)}"
509
+ results = f"Encountered error when executing process_owlv2: {str(e)}"
508
510
  return results
509
511
 
510
512
  def process_florence2(image_paths):
@@ -512,7 +514,7 @@ def process_florence2(image_paths):
512
514
  results = []
513
515
  for image_path in image_paths:
514
516
  image = load_image(image_path)
515
- results.extend(florence2_phrase_grounding("person", image))
517
+ results.extend(florence2_object_detection("person", image))
516
518
  except Exception as e:
517
519
  results = f"Encountered error when executing process_florence2: {str(e)}"
518
520
  return results
@@ -531,7 +533,7 @@ image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
531
533
 
532
534
  with ThreadPoolExecutor() as executor:
533
535
  futures = {{
534
- executor.submit(process_owl_v2, image_paths): "owl_v2_image",
536
+ executor.submit(process_owlv2, image_paths): "owlv2_object_detection",
535
537
  executor.submit(process_florence2, image_paths): "florence2_object_detection",
536
538
  executor.submit(process_countgd, image_paths): "countgd_object_detection",
537
539
  }}
@@ -557,7 +559,7 @@ Count the number of people in the video.
557
559
  <code>
558
560
  import numpy as np
559
561
  from concurrent.futures import ThreadPoolExecutor, as_completed
560
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_sam2_video_tracking
562
+ from vision_agent.tools import extract_frames_and_timestamps, owlv2_sam2_video_tracking, florence2_sam2_video_tracking
561
563
 
562
564
  # sample at 1 FPS and use the first 10 frames to reduce processing time
563
565
  frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -574,16 +576,18 @@ def remove_arrays(o):
574
576
  else:
575
577
  return o
576
578
 
577
- def process_owl_v2_video(frames):
579
+ def process_owlv2_sam2_video_tracking(frames):
578
580
  try:
579
- results = owl_v2_video("person", frames)
581
+ # run with chunk_length=1 to ensure best results
582
+ results = owlv2_sam2_video_tracking("person", frames, chunk_length=1)
580
583
  except Exception as e:
581
- results = f"Encountered error when executing process_owl_v2_video: {str(e)}"
584
+ results = f"Encountered error when executing process_owlv2_sam2_video_tracking: {str(e)}"
582
585
  return results
583
586
 
584
- def process_florence2_sam2(frames):
587
+ def process_florence2_sam2_video_tracking(frames):
585
588
  try:
586
- results = florence2_sam2_video_tracking("person", frames)
589
+ # run with chunk_length=1 to ensure best results
590
+ results = florence2_sam2_video_tracking("person", frames, chunk_length=1)
587
591
  except Exception as e:
588
592
  results = f"Encountered error when executing process_florence2_sam2_video_tracking: {str(e)}"
589
593
  return results
@@ -591,8 +595,8 @@ def process_florence2_sam2(frames):
591
595
 
592
596
  with ThreadPoolExecutor() as executor:
593
597
  futures = {{
594
- executor.submit(process_owl_v2_video, frames): "owl_v2_video",
595
- executor.submit(process_florence2_sam2, frames): "florence2_sam2_video_tracking",
598
+ executor.submit(process_owlv2_sam2_video_tracking, frames): "owlv2_sam2_video_tracking",
599
+ executor.submit(process_florence2_sam2_video_tracking, frames): "florence2_sam2_video_tracking",
596
600
  }}
597
601
  final_results = {{}}
598
602
  for future in as_completed(futures):
@@ -686,6 +690,7 @@ FINALIZE_PLAN = """
686
690
  3. Include ALL relevant python code in your plan to accomplish the user request.
687
691
  4. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
688
692
  5. Do not include {excluded_tools} tools in your instructions.
693
+ 6. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_file` or `save_video`.
689
694
  7. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
690
695
 
691
696
  <json>
@@ -26,25 +26,22 @@ from .tools import (
26
26
  claude35_text_extraction,
27
27
  closest_box_distance,
28
28
  closest_mask_distance,
29
- countgd_example_based_counting,
30
29
  countgd_object_detection,
31
- countgd_sam2_object_detection,
30
+ countgd_sam2_instance_segmentation,
32
31
  countgd_sam2_video_tracking,
32
+ countgd_visual_prompt_object_detection,
33
33
  depth_anything_v2,
34
34
  detr_segmentation,
35
35
  document_extraction,
36
36
  document_qa,
37
37
  extract_frames_and_timestamps,
38
+ florence2_object_detection,
38
39
  florence2_ocr,
39
- florence2_phrase_grounding,
40
- florence2_phrase_grounding_video,
41
- florence2_sam2_image,
40
+ florence2_sam2_instance_segmentation,
42
41
  florence2_sam2_video_tracking,
43
42
  flux_image_inpainting,
44
43
  generate_pose_image,
45
44
  get_tool_documentation,
46
- gpt4o_image_vqa,
47
- gpt4o_video_vqa,
48
45
  load_image,
49
46
  minimum_distance,
50
47
  ocr,
@@ -52,8 +49,8 @@ from .tools import (
52
49
  overlay_bounding_boxes,
53
50
  overlay_heat_map,
54
51
  overlay_segmentation_masks,
55
- owl_v2_image,
56
- owl_v2_video,
52
+ owlv2_object_detection,
53
+ owlv2_sam2_instance_segmentation,
57
54
  owlv2_sam2_video_tracking,
58
55
  qwen2_vl_images_vqa,
59
56
  qwen2_vl_video_vqa,
@@ -62,7 +59,6 @@ from .tools import (
62
59
  save_json,
63
60
  save_video,
64
61
  siglip_classification,
65
- stella_embeddings,
66
62
  template_match,
67
63
  video_temporal_localization,
68
64
  vit_image_classification,