vision-agent 0.2.153__tar.gz → 0.2.154__tar.gz
- {vision_agent-0.2.153 → vision_agent-0.2.154}/PKG-INFO +1 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/pyproject.toml +1 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py +7 -7
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py +2 -1
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py +2 -2
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py +6 -7
- {vision_agent-0.2.153 → vision_agent-0.2.154}/LICENSE +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/README.md +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -101,7 +101,7 @@ plan1:
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
 
 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
 
 # plan2
-florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
 
 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding_image": florence2_out,
+    "florence2_phrase_grounding": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}
 
 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding_image": florence2_counts,
+    "florence2_phrase_grounding": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}
 
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/agent/vision_agent_prompts.py
RENAMED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
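The artifact hunk stops at line 6, so the pairing and visualization logic that follows is not shown. Below is a plausible completion of the example under the imports listed on line 0 — the loop structure, the 0.01 distance threshold, and the exact `closest_box_distance`/`overlay_bounding_boxes`/`save_image` call shapes are assumptions for illustration, not the package's verbatim prompt text:

```python
from vision_agent.tools import (
    load_image,
    florence2_phrase_grounding,
    closest_box_distance,
    overlay_bounding_boxes,
    save_image,
)

def count_workers_with_helmets(image_path: str, output_path: str) -> int:
    image = load_image(image_path)
    detections = florence2_phrase_grounding("worker, helmet", image)
    workers = [d for d in detections if d["label"] == "worker"]
    helmets = [d for d in detections if d["label"] == "helmet"]
    count = 0
    for worker in workers:
        # Treat a worker as helmeted when some helmet box sits within a small
        # distance of the worker box (threshold and image-size argument are
        # assumptions for this sketch).
        if any(
            closest_box_distance(worker["bbox"], helmet["bbox"], image.shape[:2]) < 0.01
            for helmet in helmets
        ):
            count += 1
    viz = overlay_bounding_boxes(image, detections)  # annotate all detections
    save_image(viz, output_path)
    return count
```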
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
+from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
     image = load_image(image_path)
--    detections = florence2_phrase_grounding_image("worker, helmet", image)
-+    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
     workers = [d for d in detections if d['label'] == 'worker']
     helmets = [d for d in detections if d['label'] == 'helmet']
     count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/__init__.py
RENAMED
@@ -24,7 +24,8 @@ from .tools import (
     extract_frames_and_timestamps,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding_image,
+    florence2_phrase_grounding,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
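Net effect of the export change, as a quick sanity check — this sketch assumes 0.2.154 is installed, and that `florence2_phrase_grounding_image` is gone in this release (its import line is dropped here and FUNCTION_TOOLS loses it further down):

```python
# Both phrase-grounding entry points now import from the package root:
from vision_agent.tools import (
    florence2_phrase_grounding,        # image variant, renamed back from *_image
    florence2_phrase_grounding_video,  # video variant, newly re-exported here
)
```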
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/meta_tools.py
RENAMED
@@ -668,8 +668,8 @@ def use_object_detection_fine_tuning(
 
     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
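The pattern/replacement pair above rewrites source text rather than live objects: it finds a `florence2_phrase_grounding(prompt, image[, id])` call in the artifact and re-emits it with the fine-tune id as the third argument. A standalone sketch of that substitution — the regex and lambda are copied from the hunk, while the sample input line is hypothetical:

```python
import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"

# Pattern and replacement as they appear in use_object_detection_fine_tuning:
pattern = (
    r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,'
    r'\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
)
replace = lambda m: (
    f'florence2_phrase_grounding("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")'
)

code = 'detections = florence2_phrase_grounding("worker, helmet", image)'
print(re.sub(pattern, replace, code))
# detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```

Because the optional third capture group also consumes an existing quoted id, re-running the rewrite replaces the id in place instead of appending a second one.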
{vision_agent-0.2.153 → vision_agent-0.2.154}/vision_agent/tools/tools.py
RENAMED
@@ -1143,10 +1143,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding_image(
+def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    """'florence2_phrase_grounding' will run florence2 on a image. It can
     detect multiple objects given a text prompt which can be object names or caption.
     You can optionally separate the object names in the text with commas. It returns
     a list of bounding boxes with normalized coordinates, label names and associated
@@ -1167,7 +1167,7 @@ def florence2_phrase_grounding_image(
 
     Example
     -------
-        >>> florence2_phrase_grounding_image('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
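For comparison with the docstring example, here is a minimal call sketch under the renamed signature from the @@ -1143 hunk, where `fine_tune_id` defaults to None. The image file and the reuse of the fine-tune id from the earlier hunks are illustrative, and a configured vision-agent API credential is assumed:

```python
from vision_agent.tools import florence2_phrase_grounding, load_image

image = load_image("workers.png")  # hypothetical local image

# Hosted base Florence-2 model:
detections = florence2_phrase_grounding("worker, helmet", image)

# Same call routed to a fine-tuned checkpoint via the optional third argument:
detections_ft = florence2_phrase_grounding(
    "worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf"
)
```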
@@ -1196,7 +1196,7 @@ def florence2_phrase_grounding_image(
         "florence2-ft",
         v2=True,
         is_form=True,
-        metadata_payload={"function_name": "florence2_phrase_grounding_image"},
+        metadata_payload={"function_name": "florence2_phrase_grounding"},
     )
     # get the first frame
     detection = detections[0]
@@ -1205,7 +1205,7 @@ def florence2_phrase_grounding_image(
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_phrase_grounding_image",
+        "function_name": "florence2_phrase_grounding",
     }
     detections = send_inference_request(data, "florence2", v2=True)
     detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -2164,8 +2164,7 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,