vision-agent 0.2.230__tar.gz → 0.2.231__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {vision_agent-0.2.230 → vision_agent-0.2.231}/PKG-INFO +3 -11
  2. {vision_agent-0.2.230 → vision_agent-0.2.231}/README.md +2 -10
  3. {vision_agent-0.2.230 → vision_agent-0.2.231}/pyproject.toml +1 -1
  4. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/types.py +1 -0
  5. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_planner_v2.py +1 -0
  6. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/configs/anthropic_config.py +2 -2
  7. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/configs/openai_config.py +2 -2
  8. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/planner_tools.py +14 -8
  9. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/tool_utils.py +3 -0
  10. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/tools.py +39 -23
  11. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/sim.py +33 -12
  12. {vision_agent-0.2.230 → vision_agent-0.2.231}/LICENSE +0 -0
  13. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/.sim_tools/df.csv +0 -0
  14. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/.sim_tools/embs.npy +0 -0
  15. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/README.md +0 -0
  17. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/__init__.py +0 -0
  18. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/agent.py +0 -0
  19. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/agent_utils.py +0 -0
  20. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent.py +0 -0
  21. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_coder.py +0 -0
  22. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  23. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  24. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  25. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_planner.py +0 -0
  26. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  27. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  28. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_prompts.py +0 -0
  29. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  30. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/agent/vision_agent_v2.py +0 -0
  31. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/clients/__init__.py +0 -0
  32. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/clients/http.py +0 -0
  33. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/clients/landing_public_api.py +0 -0
  34. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/configs/__init__.py +0 -0
  35. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/configs/anthropic_openai_config.py +0 -0
  36. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/configs/config.py +0 -0
  37. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/fonts/__init__.py +0 -0
  38. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  39. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/lmm/__init__.py +0 -0
  40. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/lmm/lmm.py +0 -0
  41. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/lmm/types.py +0 -0
  42. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/__init__.py +4 -4
  43. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/meta_tools.py +0 -0
  44. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/prompts.py +0 -0
  45. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/tools/tools_types.py +0 -0
  46. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/__init__.py +0 -0
  47. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/exceptions.py +0 -0
  48. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/execute.py +0 -0
  49. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/image_utils.py +0 -0
  50. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/type_defs.py +0 -0
  51. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/video.py +0 -0
  52. {vision_agent-0.2.230 → vision_agent-0.2.231}/vision_agent/utils/video_tracking.py +0 -0
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.230
+Version: 0.2.231
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -71,12 +71,7 @@ export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
----
-**NOTE**
-We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
-for VisionAgent. If you want to use a different LLM provider or only one, see
-'Using Other LLM Providers' below.
----
+> **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 ## Documentation
 
@@ -149,8 +144,5 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
-**NOTE**
-VisionAgent moves fast and we are constantly updating and changing the library. If you
-have any questions or need help, please reach out to us on our discord channel.
----
+> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
 
--- a/README.md
+++ b/README.md
@@ -26,12 +26,7 @@ export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
----
-**NOTE**
-We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
-for VisionAgent. If you want to use a different LLM provider or only one, see
-'Using Other LLM Providers' below.
----
+> **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 ## Documentation
 
@@ -104,7 +99,4 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
-**NOTE**
-VisionAgent moves fast and we are constantly updating and changing the library. If you
-have any questions or need help, please reach out to us on our discord channel.
----
+> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.230"
+version = "0.2.231"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
--- a/vision_agent/agent/types.py
+++ b/vision_agent/agent/types.py
@@ -33,6 +33,7 @@ class AgentMessage(BaseModel):
         Literal["interaction_response"],
         Literal["conversation"],
         Literal["planner"],
+        Literal["planner_update"],
         Literal["coder"],
     ]
     content: str
--- a/vision_agent/agent/vision_agent_planner_v2.py
+++ b/vision_agent/agent/vision_agent_planner_v2.py
@@ -513,6 +513,7 @@ class VisionAgentPlannerV2(AgentPlanner):
         code = extract_tag(response, "execute_python")
         finalize_plan = extract_tag(response, "finalize_plan")
         finished = finalize_plan is not None
+        self.update_callback({"role": "planner_update", "content": response})
 
         if self.verbose:
             _CONSOLE.print(
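The two hunks above add a `planner_update` message role and emit it on every planner iteration, so callers can observe intermediate planner output instead of waiting for the final plan. A minimal sketch of a consumer, assuming `VisionAgentPlannerV2` accepts an `update_callback` argument that it stores as the `self.update_callback` invoked in the diff (the handler below is illustrative):

```python
# Hypothetical consumer of the new "planner_update" role; not part of the diff.
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2

def on_update(message: dict) -> None:
    if message["role"] == "planner_update":
        # message["content"] is the planner's raw LMM response for this step.
        print("planner step:", message["content"][:120])

planner = VisionAgentPlannerV2(update_callback=on_update)
# Running the planner would now trigger on_update once per iteration.
```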
--- a/vision_agent/configs/anthropic_config.py
+++ b/vision_agent/configs/anthropic_config.py
@@ -81,7 +81,7 @@ class Config(BaseModel):
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 1.0,
+            "temperature": 0.0,
             "image_size": 768,
         }
     )
@@ -111,7 +111,7 @@ class Config(BaseModel):
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 1.0,
+            "temperature": 0.0,
             "image_size": 768,
         }
     )
--- a/vision_agent/configs/openai_config.py
+++ b/vision_agent/configs/openai_config.py
@@ -98,7 +98,7 @@ class Config(BaseModel):
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "gpt-4o-2024-08-06",
-            "temperature": 0.0,
+            "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
         }
@@ -109,7 +109,7 @@ class Config(BaseModel):
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "gpt-4o-2024-08-06",
-            "temperature": 0.0,
+            "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
         }
--- a/vision_agent/tools/planner_tools.py
+++ b/vision_agent/tools/planner_tools.py
@@ -10,12 +10,7 @@ from IPython.display import display
 from PIL import Image
 
 import vision_agent.tools as T
-from vision_agent.agent.agent_utils import (
-    DefaultImports,
-    extract_code,
-    extract_json,
-    extract_tag,
-)
+from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     CATEGORIZE_TOOL_REQUEST,
     FINALIZE_PLAN,
@@ -36,6 +31,9 @@ from vision_agent.utils.image_utils import convert_to_b64
 from vision_agent.utils.sim import get_tool_recommender
 
 TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
+LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
+    [T.load_image, T.extract_frames_and_timestamps]
+)
 
 CONFIG = Config()
 _LOGGER = logging.getLogger(__name__)
@@ -179,6 +177,7 @@ def run_tool_testing(
         cleaned_tool_docs.append(tool_doc)
     tool_docs = cleaned_tool_docs
     tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
+    tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING
 
     prompt = TEST_TOOLS.format(
         tool_docs=tool_docs_str,
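The new `LOAD_TOOLS_DOCSTRING` constant bakes documentation for the two media loaders into every tool-testing prompt, so generated test code can always read its input images or videos even when those utilities were not among the retrieved tools. A small sketch of what it evaluates to (output indicative only):

```python
# Sketch: the module-level constant is just the rendered documentation of the
# two loader tools, appended to tool_docs_str before prompting the LMM.
import vision_agent.tools as T

LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
    [T.load_image, T.extract_frames_and_timestamps]
)
print(LOAD_TOOLS_DOCSTRING)  # docstrings for load_image and
                             # extract_frames_and_timestamps
```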
@@ -217,8 +216,15 @@ def run_tool_testing(
         examples=EXAMPLES,
         media=str(image_paths),
     )
-    code = extract_code(lmm.generate(prompt, media=image_paths))  # type: ignore
-    code = process_code(code)
+    response = cast(str, lmm.generate(prompt, media=image_paths))
+    code = extract_tag(response, "code")
+    if code is None:
+        code = response
+
+    try:
+        code = process_code(code)
+    except Exception as e:
+        _LOGGER.error(f"Error processing code: {e}")
     tool_output = code_interpreter.exec_isolation(
         DefaultImports.prepend_imports(code)
     )
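The hunk above switches tool testing from `extract_code` to tag-based extraction with two fallbacks: use the `<code>` tag if present, otherwise the raw response, and treat `process_code` as best-effort rather than fatal. A self-contained sketch of the pattern, where `extract_tag` below is a simplified stand-in for `vision_agent.agent.agent_utils.extract_tag`:

```python
import re
from typing import Optional

def extract_tag(text: str, tag: str) -> Optional[str]:
    # Return the content between <tag>...</tag>, or None if absent.
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return match.group(1).strip() if match else None

response = "Here you go:\n<code>\nprint('hello')\n</code>"
code = extract_tag(response, "code")
if code is None:
    code = response  # fall back to the raw LMM output

print(code)  # print('hello')
```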
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -318,6 +318,9 @@ def single_nms(
 def nms(
     all_preds: List[List[Dict[str, Any]]], iou_threshold: float
 ) -> List[List[Dict[str, Any]]]:
+    if not isinstance(all_preds[0], List):
+        all_preds = [all_preds]
+
     return_preds = []
     for frame_preds in all_preds:
         frame_preds = single_nms(frame_preds, iou_threshold)
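The added guard makes `nms` tolerant of a flat, single-frame input: if the first element is not itself a list, the whole input is wrapped as one frame. Illustrative call shapes; the detection dict fields mirror the label/bbox/score shape used in the tools.py hunks below:

```python
# Illustrative only: nms lives in vision_agent/tools/tool_utils.py per the
# file list above.
from vision_agent.tools.tool_utils import nms

frame_preds = [
    {"label": "car", "bbox": [0.10, 0.10, 0.40, 0.40], "score": 0.9},
    {"label": "car", "bbox": [0.12, 0.11, 0.41, 0.42], "score": 0.5},
]

per_frame = nms([frame_preds], iou_threshold=0.5)  # list-of-frames input
flat = nms(frame_preds, iou_threshold=0.5)         # now auto-wrapped as one frame
```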
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -222,7 +222,7 @@ def sam2(
     ret = _sam2(image, detections, image_size)
     _display_tool_trace(
         sam2.__name__,
-        {},
+        {"detections": detections},
         ret["display_data"],
         ret["files"],
     )
@@ -314,18 +314,29 @@ def od_sam2_video_tracking(
 
     # Process each segment and collect detections
     detections_per_segment: List[Any] = []
-    for segment_index, segment in enumerate(segments):
-        segment_detections = process_segment(
-            segment_frames=segment,
-            od_model=od_model,
-            prompt=prompt,
-            fine_tune_id=fine_tune_id,
-            chunk_length=chunk_length,
-            image_size=image_size,
-            segment_index=segment_index,
-            object_detection_tool=_apply_object_detection,
-        )
-        detections_per_segment.append(segment_detections)
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(
+                process_segment,
+                segment_frames=segment,
+                od_model=od_model,
+                prompt=prompt,
+                fine_tune_id=fine_tune_id,
+                chunk_length=chunk_length,
+                image_size=image_size,
+                segment_index=segment_index,
+                object_detection_tool=_apply_object_detection,
+            ): segment_index
+            for segment_index, segment in enumerate(segments)
+        }
+
+        for future in as_completed(futures):
+            segment_index = futures[future]
+            detections_per_segment.append((segment_index, future.result()))
+
+    detections_per_segment = [
+        x[1] for x in sorted(detections_per_segment, key=lambda x: x[0])
+    ]
 
     merged_detections = merge_segments(detections_per_segment)
     post_processed = post_process(merged_detections, image_size)
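The hunk above parallelizes per-segment detection. Because `as_completed` yields futures in completion order, each result is tagged with its segment index and re-sorted afterwards so downstream merging still sees segments in their original order. A self-contained sketch of that fan-out pattern (the toy `process_segment` stands in for the real per-segment detection):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_segment(segment: str) -> str:
    return segment.upper()  # stand-in for the real OD + tracking work

segments = ["seg-a", "seg-b", "seg-c"]
results: list[tuple[int, str]] = []

with ThreadPoolExecutor() as executor:
    # Key each future by its segment index so order can be restored later.
    futures = {
        executor.submit(process_segment, segment): i
        for i, segment in enumerate(segments)
    }
    for future in as_completed(futures):
        results.append((futures[future], future.result()))

ordered = [r for _, r in sorted(results, key=lambda x: x[0])]
print(ordered)  # ['SEG-A', 'SEG-B', 'SEG-C'] regardless of completion order
```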
@@ -390,7 +401,7 @@ def _owlv2_object_detection(
         {
             "label": bbox["label"],
             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-            "score": bbox["score"],
+            "score": round(bbox["score"], 2),
         }
         for bbox in bboxes
     ]
@@ -398,7 +409,7 @@ def _owlv2_object_detection(
         {
             "label": bbox["label"],
             "bbox": bbox["bounding_box"],
-            "score": bbox["score"],
+            "score": round(bbox["score"], 2),
         }
         for bbox in bboxes
     ]
@@ -582,7 +593,7 @@ def owlv2_sam2_video_tracking(
     )
     _display_tool_trace(
         owlv2_sam2_video_tracking.__name__,
-        {},
+        {"prompt": prompt, "chunk_length": chunk_length},
         ret["display_data"],
         ret["files"],
     )
@@ -1681,7 +1692,7 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length_frames: Optional[int] = 2,
+    chunk_length_frames: int = 2,
 ) -> List[float]:
     """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -1695,7 +1706,7 @@ def video_temporal_localization(
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
             'qwen2vl', 'gpt4o'.
-        chunk_length_frames (Optional[int]): length of each chunk in frames
+        chunk_length_frames (int): length of each chunk in frames
 
     Returns:
         List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -1714,8 +1725,7 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length_frames is not None:
-        payload["chunk_length_frames"] = chunk_length_frames
+    payload["chunk_length_frames"] = chunk_length_frames
 
     data = send_inference_request(
         payload, "video-temporal-localization", files=files, v2=True
@@ -1726,7 +1736,13 @@ def video_temporal_localization(
         data,
         files,
     )
-    return [cast(float, value) for value in data]
+    chunked_data = [cast(float, value) for value in data]
+
+    full_data = []
+    for value in chunked_data:
+        full_data.extend([value] * chunk_length_frames)
+
+    return full_data[: len(frames)]
 
 
 def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
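With this change `video_temporal_localization` returns one score per input frame instead of one per chunk: each chunk score is repeated `chunk_length_frames` times and the result is trimmed to `len(frames)`. A worked example of the expansion logic from the hunk above:

```python
# Worked example: with chunk_length_frames=2 and 5 frames, per-chunk scores
# [1.0, 0.0, 1.0] expand to one score per frame.
chunk_length_frames = 2
num_frames = 5
chunked_data = [1.0, 0.0, 1.0]  # one score per chunk, as returned by the API

full_data: list[float] = []
for value in chunked_data:
    full_data.extend([value] * chunk_length_frames)

print(full_data[:num_frames])  # [1.0, 1.0, 0.0, 0.0, 1.0]
```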
@@ -2150,7 +2166,7 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response
 
 
-# agentic od tools
+# Agentic OD Tools
 
 
 def _agentic_object_detection(
@@ -2646,7 +2662,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
 
 
 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
--- a/vision_agent/utils/sim.py
+++ b/vision_agent/utils/sim.py
@@ -98,10 +98,12 @@ class Sim:
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df["embs"] = self.df[sim_key].apply(
-                lambda x: get_embedding(
-                    self.emb_call,
-                    x,
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
                 )
             )
 
@@ -141,7 +143,9 @@ class Sim:
 
         df_load = pd.read_csv(load_dir / "df.csv")
         if platform.system() == "Windows":
-            df_load["doc"] = df_load["doc"].apply(lambda x: x.replace("\r", ""))
+            df_load = df_load.assign(
+                doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
+            )
         return df.equals(df_load)  # type: ignore
 
     @lru_cache(maxsize=256)
@@ -166,7 +170,9 @@ class Sim:
             self.emb_call,
             query,
         )
-        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        self.df = self.df.assign(
+            sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        )
         res = self.df.sort_values("sim", ascending=False).head(k)
         if thresh is not None:
             res = res[res.sim > thresh]
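The sim.py hunks above and below all replace in-place column writes (`df["embs"] = ...`) with `DataFrame.assign`, which returns a new frame with the computed column rather than mutating the existing one; this avoids pandas' SettingWithCopyWarning when the frame is a view of another frame. A minimal illustration of the pattern change, with a stand-in for the real embedding call:

```python
import pandas as pd

df = pd.DataFrame({"doc": ["detect cars", "segment people"]})

def fake_embedding(text: str) -> list[float]:
    return [float(len(text)), 0.0]  # stand-in for get_embedding(...)

# Old style: in-place column mutation.
# df["embs"] = df["doc"].apply(fake_embedding)

# New style: assign returns a fresh frame that is rebound to df.
df = df.assign(embs=df["doc"].apply(fake_embedding))
print(df)
```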
@@ -214,8 +220,13 @@ class AzureSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df["embs"] = self.df[sim_key].apply(
-                lambda x: get_embedding(self.emb_call, x)
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
 
@@ -245,8 +256,13 @@ class OllamaSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df["embs"] = self.df[sim_key].apply(
-                lambda x: get_embedding(emb_call, x)
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
 
@@ -267,8 +283,13 @@ class StellaSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df["embs"] = self.df[sim_key].apply(
-                lambda x: get_embedding(emb_call, x)
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
     @staticmethod
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -23,6 +23,9 @@ from .tools import (
     TOOLS_INFO,
     UTIL_TOOLS,
     UTILITIES_DOCSTRING,
+    agentic_object_detection,
+    agentic_sam2_instance_segmentation,
+    agentic_sam2_video_tracking,
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
@@ -30,6 +33,7 @@ from .tools import (
     countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
     countgd_visual_prompt_object_detection,
+    custom_object_detection,
     depth_anything_v2,
     detr_segmentation,
     document_extraction,
@@ -63,10 +67,6 @@ from .tools import (
     video_temporal_localization,
     vit_image_classification,
     vit_nsfw_classification,
-    custom_object_detection,
-    agentic_object_detection,
-    agentic_sam2_instance_segmentation,
-    agentic_sam2_video_tracking,
 )
 
 __new_tools__ = [