vision-agent 0.2.113__tar.gz → 0.2.115__tar.gz

Files changed (34)
  1. {vision_agent-0.2.113 → vision_agent-0.2.115}/PKG-INFO +49 -16
  2. {vision_agent-0.2.113 → vision_agent-0.2.115}/README.md +48 -15
  3. {vision_agent-0.2.113 → vision_agent-0.2.115}/pyproject.toml +1 -1
  4. vision_agent-0.2.115/vision_agent/agent/__init__.py +7 -0
  5. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/agent_utils.py +25 -2
  6. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/vision_agent_coder.py +69 -7
  7. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/lmm/lmm.py +40 -21
  8. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/__init__.py +1 -1
  9. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/execute.py +1 -1
  10. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/sim.py +49 -9
  11. vision_agent-0.2.113/vision_agent/agent/__init__.py +0 -3
  12. {vision_agent-0.2.113 → vision_agent-0.2.115}/LICENSE +0 -0
  13. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/__init__.py +0 -0
  14. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/vision_agent.py +0 -0
  16. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  17. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/agent/vision_agent_prompts.py +0 -0
  18. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/clients/__init__.py +0 -0
  19. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/clients/http.py +0 -0
  20. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/clients/landing_public_api.py +0 -0
  21. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/fonts/__init__.py +0 -0
  22. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  23. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/lmm/__init__.py +0 -0
  24. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/__init__.py +0 -0
  26. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/meta_tools.py +0 -0
  27. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/prompts.py +0 -0
  28. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/tool_utils.py +0 -0
  29. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/tools.py +0 -0
  30. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/tools/tools_types.py +0 -0
  31. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/exceptions.py +0 -0
  32. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/type_defs.py +0 -0
  34. {vision_agent-0.2.113 → vision_agent-0.2.115}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.113
+ Version: 0.2.115
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -208,20 +208,18 @@ result = agent.chat_with_workflow(conv)

  ### Tools
  There are a variety of tools for the model or the user to use. Some are executed locally
- while others are hosted for you. You can also ask an LMM directly to build a tool for
- you. For example:
+ while others are hosted for you. You can easily access them yourself, for example if
+ you want to run `owl_v2` and visualize the output you can run:

  ```python
- >>> import vision_agent as va
- >>> lmm = va.lmm.OpenAILMM()
- >>> detector = lmm.generate_detector("Can you build a jar detector for me?")
- >>> detector(va.tools.load_image("jar.jpg"))
- [{"labels": ["jar",],
- "scores": [0.99],
- "bboxes": [
- [0.58, 0.2, 0.72, 0.45],
- ]
- }]
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ image = T.load_image("dogs.jpg")
+ dets = T.owl_v2("dogs", image)
+ viz = T.overlay_bounding_boxes(image, dets)
+ plt.imshow(viz)
+ plt.show()
  ```

  You can also add custom tools to the agent:
@@ -254,6 +252,41 @@ function. Make sure the documentation is in the same format above with descripti
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.

+ ## Additional LLMs
+ ### Ollama
+ We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
+ a few models:
+
+ ```bash
+ ollama pull llama3.1
+ ollama pull mxbai-embed-large
+ ```
+
+ `llama3.1` is used for the `OllamaLMM` in `OllamaVisionAgentCoder`. Normally we would
+ use an actual LMM such as `llava`, but `llava` cannot handle the long context lengths
+ required by the agent. Since `llama3.1` cannot handle images you may see some
+ performance degradation. `mxbai-embed-large` is the embedding model used to look up
+ tools. You can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.OllamaVisionAgentCoder()
+ >>> agent("Count the apples in the image", media="apples.jpg")
+ ```
+ > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
+
+ ### Azure OpenAI
+ We also provide an `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
+ follow the Azure Setup section below. You can use it just like you would use
+ `VisionAgentCoder`:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.AzureVisionAgentCoder()
+ >>> agent("Count the apples in the image", media="apples.jpg")
+ ```
+
+
  ### Azure Setup
  If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:

@@ -292,7 +325,7 @@ agent = va.agent.AzureVisionAgentCoder()
  2. Follow the instructions to purchase and manage your API credits.
  3. Ensure your API key is correctly configured in your project settings.

- Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
-
- For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
+ Failure to have sufficient API credits may result in limited or no functionality for
+ the features that rely on the OpenAI API. For more details on managing your API usage
+ and credits, please refer to the OpenAI API documentation.

README.md
@@ -168,20 +168,18 @@ result = agent.chat_with_workflow(conv)

  ### Tools
  There are a variety of tools for the model or the user to use. Some are executed locally
- while others are hosted for you. You can also ask an LMM directly to build a tool for
- you. For example:
+ while others are hosted for you. You can easily access them yourself, for example if
+ you want to run `owl_v2` and visualize the output you can run:

  ```python
- >>> import vision_agent as va
- >>> lmm = va.lmm.OpenAILMM()
- >>> detector = lmm.generate_detector("Can you build a jar detector for me?")
- >>> detector(va.tools.load_image("jar.jpg"))
- [{"labels": ["jar",],
- "scores": [0.99],
- "bboxes": [
- [0.58, 0.2, 0.72, 0.45],
- ]
- }]
+ import vision_agent.tools as T
+ import matplotlib.pyplot as plt
+
+ image = T.load_image("dogs.jpg")
+ dets = T.owl_v2("dogs", image)
+ viz = T.overlay_bounding_boxes(image, dets)
+ plt.imshow(viz)
+ plt.show()
  ```

  You can also add custom tools to the agent:
@@ -214,6 +212,41 @@ function. Make sure the documentation is in the same format above with descripti
  `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
  [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.

+ ## Additional LLMs
+ ### Ollama
+ We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
+ a few models:
+
+ ```bash
+ ollama pull llama3.1
+ ollama pull mxbai-embed-large
+ ```
+
+ `llama3.1` is used for the `OllamaLMM` in `OllamaVisionAgentCoder`. Normally we would
+ use an actual LMM such as `llava`, but `llava` cannot handle the long context lengths
+ required by the agent. Since `llama3.1` cannot handle images you may see some
+ performance degradation. `mxbai-embed-large` is the embedding model used to look up
+ tools. You can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.OllamaVisionAgentCoder()
+ >>> agent("Count the apples in the image", media="apples.jpg")
+ ```
+ > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
+
+ ### Azure OpenAI
+ We also provide an `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
+ follow the Azure Setup section below. You can use it just like you would use
+ `VisionAgentCoder`:
+
+ ```python
+ >>> import vision_agent as va
+ >>> agent = va.agent.AzureVisionAgentCoder()
+ >>> agent("Count the apples in the image", media="apples.jpg")
+ ```
+
+
  ### Azure Setup
  If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:

@@ -252,6 +285,6 @@
  2. Follow the instructions to purchase and manage your API credits.
  3. Ensure your API key is correctly configured in your project settings.

- Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
-
- For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
+ Failure to have sufficient API credits may result in limited or no functionality for
+ the features that rely on the OpenAI API. For more details on managing your API usage
+ and credits, please refer to the OpenAI API documentation.
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.113"
+ version = "0.2.115"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
vision_agent/agent/__init__.py (new file)
@@ -0,0 +1,7 @@
+ from .agent import Agent
+ from .vision_agent import VisionAgent
+ from .vision_agent_coder import (
+     AzureVisionAgentCoder,
+     OllamaVisionAgentCoder,
+     VisionAgentCoder,
+ )
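
With the rewritten `__init__.py`, all three coders are exported from `vision_agent.agent`. A minimal sketch of what that enables, assuming vision-agent 0.2.115 is installed (the selection dict is illustrative, not part of the package):

```python
# The new module exports make every coder importable from one place.
from vision_agent.agent import (
    AzureVisionAgentCoder,
    OllamaVisionAgentCoder,
    VisionAgentCoder,
)

# Hypothetical convenience mapping for picking a backend at runtime.
CODERS = {
    "openai": VisionAgentCoder,
    "azure": AzureVisionAgentCoder,
    "ollama": OllamaVisionAgentCoder,
}
agent = CODERS["openai"]()  # every constructor argument is optional
```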
vision_agent/agent/agent_utils.py
@@ -1,9 +1,24 @@
  import json
  import logging
+ import re
  import sys
- from typing import Any, Dict
+ from typing import Any, Dict, Optional

  logging.basicConfig(stream=sys.stdout)
+ _LOGGER = logging.getLogger(__name__)
+
+
+ def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
+     json_pattern = r"\{.*\}"
+     match = re.search(json_pattern, json_str, re.DOTALL)
+     if match:
+         json_str = match.group()
+         try:
+             json_dict = json.loads(json_str)
+             return json_dict  # type: ignore
+         except json.JSONDecodeError:
+             return None
+     return None


  def extract_json(json_str: str) -> Dict[str, Any]:
@@ -18,8 +33,16 @@ def extract_json(json_str: str) -> Dict[str, Any]:
          json_str = json_str[json_str.find("```") + len("```") :]
          # get the last ``` not one from an intermediate string
          json_str = json_str[: json_str.find("}```")]
+     try:
+         json_dict = json.loads(json_str)
+     except json.JSONDecodeError as e:
+         json_dict = _extract_sub_json(json_str)
+         if json_dict is not None:
+             return json_dict  # type: ignore
+         error_msg = f"Could not extract JSON from the given str: {json_str}"
+         _LOGGER.exception(error_msg)
+         raise ValueError(error_msg) from e

-     json_dict = json.loads(json_str)
      return json_dict  # type: ignore


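
For context on the `extract_json` change above: when the cleaned string still fails to parse, the new `_extract_sub_json` helper pulls the outermost `{...}` span out of the surrounding text before giving up. A standalone mirror of that helper, reproduced here only to illustrate the fallback behavior:

```python
import json
import re
from typing import Any, Dict, Optional


def extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
    # Grab the widest {...} span (DOTALL lets it cross newlines) and try to parse it.
    match = re.search(r"\{.*\}", json_str, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            return None
    return None


# An LLM reply that wraps the JSON in prose no longer defeats extraction.
print(extract_sub_json('Sure! Here is the plan: {"best_plan": "plan1"} Hope that helps.'))
# -> {'best_plan': 'plan1'}
```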
vision_agent/agent/vision_agent_coder.py
@@ -28,11 +28,11 @@ from vision_agent.agent.vision_agent_coder_prompts import (
      TEST_PLANS,
      USER_REQ,
  )
- from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
+ from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
  from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
- from vision_agent.utils.sim import AzureSim, Sim
+ from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
  from vision_agent.utils.video import play_video

  logging.basicConfig(stream=sys.stdout)
@@ -267,7 +267,11 @@ def pick_plan(
              pass
          count += 1

-     if best_plan is None:
+     if (
+         best_plan is None
+         or "best_plan" not in best_plan
+         or ("best_plan" in best_plan and best_plan["best_plan"] not in plans)
+     ):
          best_plan = {"best_plan": list(plans.keys())[0]}

      if verbosity >= 1:
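
To make the intent of the widened `pick_plan` guard concrete, here is a small illustrative sketch (not the package's function) applying the same fallback to a few hypothetical model responses:

```python
from typing import Any, Dict, Optional

plans = {"plan1": "...", "plan2": "...", "plan3": "..."}


def validate_best_plan(best_plan: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    # Same guard as the diff: fall back to the first plan when the model's pick
    # is missing, malformed, or names a plan that was never proposed.
    if (
        best_plan is None
        or "best_plan" not in best_plan
        or best_plan["best_plan"] not in plans
    ):
        best_plan = {"best_plan": list(plans.keys())[0]}
    return best_plan


print(validate_best_plan({"best_plan": "plan2"}))  # kept as-is
print(validate_best_plan({"plan": "plan2"}))       # missing key -> plan1
print(validate_best_plan({"best_plan": "plan9"}))  # unknown plan -> plan1
```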
@@ -589,8 +593,8 @@ class VisionAgentCoder(Agent):

      Example
      -------
-         >>> from vision_agent.agent import VisionAgentCoder
-         >>> agent = VisionAgentCoder()
+         >>> import vision_agent as va
+         >>> agent = va.agent.VisionAgentCoder()
          >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
      """

@@ -857,6 +861,64 @@ class VisionAgentCoder(Agent):
              self.report_progress_callback(data)


+ class OllamaVisionAgentCoder(VisionAgentCoder):
+     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
+
+     Pre-requisites:
+     1. Run ollama pull llama3.1 for the LLM
+     2. Run ollama pull mxbai-embed-large for the embedding similarity model
+
+     Technically you should use a VLM such as llava but llava is not able to handle the
+     context length and crashes.
+
+     Example
+     -------
+         >>> import vision_agent as va
+         >>> agent = va.agent.OllamaVisionAgentCoder()
+         >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+     """
+
+     def __init__(
+         self,
+         planner: Optional[LMM] = None,
+         coder: Optional[LMM] = None,
+         tester: Optional[LMM] = None,
+         debugger: Optional[LMM] = None,
+         tool_recommender: Optional[Sim] = None,
+         verbosity: int = 0,
+         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+     ) -> None:
+         super().__init__(
+             planner=(
+                 OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                 if planner is None
+                 else planner
+             ),
+             coder=(
+                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 if coder is None
+                 else coder
+             ),
+             tester=(
+                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 if tester is None
+                 else tester
+             ),
+             debugger=(
+                 OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                 if debugger is None
+                 else debugger
+             ),
+             tool_recommender=(
+                 OllamaSim(T.TOOLS_DF, sim_key="desc")
+                 if tool_recommender is None
+                 else tool_recommender
+             ),
+             verbosity=verbosity,
+             report_progress_callback=report_progress_callback,
+         )
+
+
  class AzureVisionAgentCoder(VisionAgentCoder):
  """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing.

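
Because each component above defaults to an `OllamaLMM` only when the corresponding argument is `None`, individual pieces can be swapped out. A minimal usage sketch, assuming a local Ollama server with the models from the README already pulled (the `codellama` override is purely illustrative):

```python
import vision_agent as va
from vision_agent.lmm import OllamaLMM

# Default setup: llama3.1 for planning/coding/testing/debugging,
# mxbai-embed-large (via OllamaSim) for tool lookup.
agent = va.agent.OllamaVisionAgentCoder(verbosity=1)

# Swap in a different locally pulled model for code generation only.
agent = va.agent.OllamaVisionAgentCoder(
    coder=OllamaLMM(model_name="codellama", temperature=0.0),
    verbosity=1,
)
code = agent("Count the apples in the image", media="apples.jpg")
```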
@@ -866,8 +928,8 @@ class AzureVisionAgentCoder(VisionAgentCoder):

      Example
      -------
-         >>> from vision_agent import AzureVisionAgentCoder
-         >>> agent = AzureVisionAgentCoder()
+         >>> import vision_agent as va
+         >>> agent = va.agent.AzureVisionAgentCoder()
          >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
      """

vision_agent/lmm/lmm.py
@@ -330,12 +330,28 @@ class OllamaLMM(LMM):
          model_name: str = "llava",
          base_url: Optional[str] = "http://localhost:11434/api",
          json_mode: bool = False,
+         num_ctx: int = 128_000,
          **kwargs: Any,
      ):
+         """Initializes the Ollama LMM. kwargs are passed as 'options' to the model.
+         More information on options can be found here
+         https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
+
+         Parameters:
+             model_name (str): The ollama name of the model.
+             base_url (str): The base URL of the Ollama API.
+             json_mode (bool): Whether to use JSON mode.
+             num_ctx (int): The context length for the model.
+             kwargs (Any): Additional options to pass to the model.
+         """
+
          self.url = base_url
          self.model_name = model_name
-         self.json_mode = json_mode
-         self.kwargs = kwargs
+         self.kwargs = {"options": kwargs}
+
+         if json_mode:
+             self.kwargs["format"] = "json"  # type: ignore
+         self.kwargs["options"]["num_cxt"] = num_ctx

      def __call__(
          self,
@@ -369,13 +385,14 @@ class OllamaLMM(LMM):
          url = f"{self.url}/chat"
          model = self.model_name
          messages = fixed_chat
-         data = {"model": model, "messages": messages}
+         data: Dict[str, Any] = {"model": model, "messages": messages}

          tmp_kwargs = self.kwargs | kwargs
          data.update(tmp_kwargs)
-         json_data = json.dumps(data)
          if "stream" in tmp_kwargs and tmp_kwargs["stream"]:

+             json_data = json.dumps(data)
+
              def f() -> Iterator[Optional[str]]:
                  with requests.post(url, data=json_data, stream=True) as stream:
                      if stream.status_code != 200:
@@ -392,13 +409,14 @@ class OllamaLMM(LMM):

              return f()
          else:
-             stream = requests.post(url, data=json_data)
-             if stream.status_code != 200:
-                 raise ValueError(
-                     f"Request failed with status code {stream.status_code}"
-                 )
-             stream = stream.json()
-             return stream["message"]["content"]  # type: ignore
+             data["stream"] = False
+             json_data = json.dumps(data)
+             resp = requests.post(url, data=json_data)
+
+             if resp.status_code != 200:
+                 raise ValueError(f"Request failed with status code {resp.status_code}")
+             resp = resp.json()
+             return resp["message"]["content"]  # type: ignore

      def generate(
          self,
@@ -408,7 +426,7 @@ class OllamaLMM(LMM):
      ) -> Union[str, Iterator[Optional[str]]]:

          url = f"{self.url}/generate"
-         data = {
+         data: Dict[str, Any] = {
              "model": self.model_name,
              "prompt": prompt,
              "images": [],
@@ -416,13 +434,14 @@ class OllamaLMM(LMM):

          if media and len(media) > 0:
              for m in media:
-                 data["images"].append(encode_media(m))  # type: ignore
+                 data["images"].append(encode_media(m))

          tmp_kwargs = self.kwargs | kwargs
          data.update(tmp_kwargs)
-         json_data = json.dumps(data)
          if "stream" in tmp_kwargs and tmp_kwargs["stream"]:

+             json_data = json.dumps(data)
+
              def f() -> Iterator[Optional[str]]:
                  with requests.post(url, data=json_data, stream=True) as stream:
                      if stream.status_code != 200:
@@ -439,15 +458,15 @@ class OllamaLMM(LMM):

              return f()
          else:
-             stream = requests.post(url, data=json_data)
+             data["stream"] = False
+             json_data = json.dumps(data)
+             resp = requests.post(url, data=json_data)

-             if stream.status_code != 200:
-                 raise ValueError(
-                     f"Request failed with status code {stream.status_code}"
-                 )
+             if resp.status_code != 200:
+                 raise ValueError(f"Request failed with status code {resp.status_code}")

-             stream = stream.json()
-             return stream["response"]  # type: ignore
+             resp = resp.json()
+             return resp["response"]  # type: ignore


  class ClaudeSonnetLMM(LMM):
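
With the constructor change above, extra keyword arguments to `OllamaLMM` are forwarded to Ollama under `options`, JSON mode maps to `format="json"`, and non-streaming requests now send `"stream": false` explicitly. A brief usage sketch, assuming an Ollama server on the default port with `llava` pulled (`top_k` is just one example of a valid Ollama option):

```python
from vision_agent.lmm import OllamaLMM

# kwargs such as top_k end up in the request's "options" field alongside num_ctx.
lmm = OllamaLMM(model_name="llava", json_mode=False, num_ctx=32_000, top_k=20)

# Non-streaming generate call against /api/generate; pass stream=True for an iterator.
print(lmm.generate("Describe this image", media=["dogs.jpg"]))
```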
vision_agent/utils/__init__.py
@@ -6,5 +6,5 @@ from .execute import (
      Logs,
      Result,
  )
- from .sim import AzureSim, Sim, load_sim, merge_sim
+ from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim
  from .video import extract_frames_from_video
vision_agent/utils/execute.py
@@ -532,7 +532,7 @@ print(f"Vision Agent version: {va_version}")"""

      @staticmethod
      def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl:  # type: ignore
-         template_name = os.environ.get("E2B_TEMPLATE_NAME", "nx3fagq7sgdliww9cvm3")
+         template_name = os.environ.get("E2B_TEMPLATE_NAME", "va-sandbox")
          _LOGGER.info(
              f"Creating a new E2BCodeInterpreter using template: {template_name}"
          )
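
The default E2B sandbox template above changes from a raw template id to the named `va-sandbox` template; it can still be overridden before the interpreter is created. A small sketch of the override (both values are placeholders for your own E2B configuration, and `CodeInterpreterFactory.new_instance` is assumed as the entry point that eventually reaches this code path):

```python
import os

# Must be set before the E2B-backed interpreter is constructed.
os.environ["E2B_TEMPLATE_NAME"] = "my-custom-template"  # placeholder template name
os.environ["E2B_API_KEY"] = "<your-e2b-api-key>"        # placeholder credential

from vision_agent.utils import CodeInterpreterFactory

interpreter = CodeInterpreterFactory.new_instance()
```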
vision_agent/utils/sim.py
@@ -1,20 +1,21 @@
  import os
  from functools import lru_cache
  from pathlib import Path
- from typing import Dict, List, Optional, Sequence, Union
+ from typing import Callable, Dict, List, Optional, Sequence, Union

  import numpy as np
  import pandas as pd
- from openai import AzureOpenAI, Client, OpenAI
+ import requests
+ from openai import AzureOpenAI, OpenAI
  from scipy.spatial.distance import cosine  # type: ignore


  @lru_cache(maxsize=512)
  def get_embedding(
-     client: Client, text: str, model: str = "text-embedding-3-small"
+     emb_call: Callable[[List[str]], List[float]], text: str
  ) -> List[float]:
      text = text.replace("\n", " ")
-     return client.embeddings.create(input=[text], model=model).data[0].embedding
+     return emb_call([text])


  class Sim:
@@ -35,14 +36,19 @@ class Sim:
              model: str: The model to use for embeddings.
          """
          self.df = df
-         self.client = OpenAI(api_key=api_key)
+         client = OpenAI(api_key=api_key)
+         self.emb_call = (
+             lambda text: client.embeddings.create(input=text, model=model)
+             .data[0]
+             .embedding
+         )
          self.model = model
          if "embs" not in df.columns and sim_key is None:
              raise ValueError("key is required if no column 'embs' is present.")

          if sim_key is not None:
              self.df["embs"] = self.df[sim_key].apply(
-                 lambda x: get_embedding(self.client, x, model=self.model)
+                 lambda x: get_embedding(self.emb_call, x)
              )

      def save(self, sim_file: Union[str, Path]) -> None:
@@ -70,7 +76,7 @@ class Sim:
              Sequence[Dict]: The top k most similar items.
          """

-         embedding = get_embedding(self.client, query, model=self.model)
+         embedding = get_embedding(self.emb_call, query)
          self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
          res = self.df.sort_values("sim", ascending=False).head(k)
          if thresh is not None:
@@ -105,17 +111,51 @@ class AzureSim(Sim):
              )

          self.df = df
-         self.client = AzureOpenAI(
+         client = AzureOpenAI(
              api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
          )
+         self.emb_call = (
+             lambda text: client.embeddings.create(input=text, model=model)
+             .data[0]
+             .embedding
+         )

          self.model = model
+         if "embs" not in df.columns and sim_key is None:
+             raise ValueError("key is required if no column 'embs' is present.")
+
+         if sim_key is not None:
+             self.df["embs"] = self.df[sim_key].apply(lambda x: get_embedding(client, x))
+
+
+ class OllamaSim(Sim):
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         sim_key: Optional[str] = None,
+         model_name: Optional[str] = None,
+         base_url: Optional[str] = None,
+     ) -> None:
+         self.df = df
+         if base_url is None:
+             base_url = "http://localhost:11434/api/embeddings"
+         if model_name is None:
+             model_name = "mxbai-embed-large"
+
+         def emb_call(text: List[str]) -> List[float]:
+             resp = requests.post(
+                 base_url, json={"prompt": text[0], "model": model_name}
+             )
+             return resp.json()["embedding"]  # type: ignore
+
+         self.emb_call = emb_call
+
          if "embs" not in df.columns and sim_key is None:
              raise ValueError("key is required if no column 'embs' is present.")

          if sim_key is not None:
              self.df["embs"] = self.df[sim_key].apply(
-                 lambda x: get_embedding(self.client, x, model=self.model)
+                 lambda x: get_embedding(emb_call, x)
              )


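
`OllamaSim` offers the same tool-lookup interface as `Sim`/`AzureSim` but computes embeddings through the local Ollama embeddings endpoint; `OllamaVisionAgentCoder` wires it up with `T.TOOLS_DF` and `sim_key="desc"`. A minimal sketch with a toy DataFrame, assuming `mxbai-embed-large` has been pulled and that the query method is `top_k` as in the base class:

```python
import pandas as pd
from vision_agent.utils.sim import OllamaSim

# Toy stand-in for the package's tool table (T.TOOLS_DF).
df = pd.DataFrame(
    {
        "name": ["owl_v2", "ocr", "depth_estimation"],
        "desc": [
            "detect objects in an image given a text prompt",
            "read text from an image",
            "estimate per-pixel depth from a single image",
        ],
    }
)

sim = OllamaSim(df, sim_key="desc")  # embeds each description via the Ollama API
print(sim.top_k("find all the dogs in the photo", k=2))
```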
vision_agent-0.2.113/vision_agent/agent/__init__.py (removed)
@@ -1,3 +0,0 @@
- from .agent import Agent
- from .vision_agent import VisionAgent
- from .vision_agent_coder import AzureVisionAgentCoder, VisionAgentCoder