vision-agent 0.2.218__py3-none-any.whl → 0.2.219__py3-none-any.whl

vision_agent/agent/vision_agent_coder.py CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.

  Pre-requisites:
- 1. Run ollama pull llama3.1 for the LLM
+ 1. Run ollama pull llama3.2-vision for the LMM
  2. Run ollama pull mxbai-embed-large for the embedding similarity model

- Technically you should use a VLM such as llava but llava is not able to handle the
- context length and crashes.
-
  Example
  -------
  >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  else planner
  ),
  coder=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if coder is None
  else coder
  ),
  tester=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if tester is None
  else tester
  ),
  debugger=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if debugger is None
  else debugger
  ),
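The two hunks above only swap the default model name; the constructor keywords stay as before. For illustration, here is a minimal sketch of overriding those defaults with explicit `OllamaLMM` instances. The `vision_agent.lmm` import path is an assumption; the `coder`/`tester`/`debugger` keywords and the call with `media=` mirror what the diff and README show.

```python
# Minimal sketch: supplying the Ollama model explicitly instead of relying on the
# new llama3.2-vision defaults shown in the diff above.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM  # import path assumed, not shown in this diff

lmm = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
agent = OllamaVisionAgentCoder(coder=lmm, tester=lmm, debugger=lmm, verbosity=2)
code = agent("Count the number of people in this image", media="people.jpg")
```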
vision_agent/agent/vision_agent_planner.py CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
  ) -> None:
  super().__init__(
  planner=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if planner is None
  else planner
  ),
vision_agent/agent/vision_agent_planner_prompts.py CHANGED
@@ -62,10 +62,10 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
  image = load_image("image.jpg")
  owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
  # strip out the masks from the output because they don't provide useful information when printed
  f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

- cgd_out = countgd_counting(image)
+ cgd_out = countgd_object_detection("person", image)

- final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_object_detection": cgd_out}}
  print(final_out)
  --- END EXAMPLE1 ---
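plan2 above says to count the detections labeled 'person', but the example code stops at printing the raw tool outputs. A hypothetical follow-up step; the detection schema (a list of dicts carrying a `label` key) is an assumption, not something this diff shows.

```python
# Hypothetical continuation of the plans above: reduce detector output to a count.
# The {"label": ..., "score": ...} shape of each detection is assumed here.
cgd_out = [{"label": "person", "score": 0.91}, {"label": "person", "score": 0.88}]
person_count = len([d for d in cgd_out if d["label"] == "person"])
print(f"people detected: {person_count}")  # people detected: 2
```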
vision_agent/utils/sim.py CHANGED
@@ -58,6 +58,11 @@ class Sim:
  """
  self.df = df
  self.client = OpenAI(api_key=api_key)
+ self.emb_call = (
+ lambda x: self.client.embeddings.create(input=x, model=model)
+ .data[0]
+ .embedding
+ )
  self.model = model
  if "embs" not in df.columns and sim_key is None:
  raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
  if sim_key is not None:
  self.df["embs"] = self.df[sim_key].apply(
  lambda x: get_embedding(
- lambda text: self.client.embeddings.create(
- input=text, model=self.model
- )
- .data[0]
- .embedding,
+ self.emb_call,
  x,
  )
  )
@@ -126,9 +127,7 @@ class Sim:
  """

  embedding = get_embedding(
- lambda text: self.client.embeddings.create(input=text, model=self.model)
- .data[0]
- .embedding,
+ self.emb_call,
  query,
  )
  self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
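The three sim.py hunks above converge on a single `self.emb_call` that is reused both when the dataframe's `embs` column is built and when a query is scored with `1 - cosine(...)`. A minimal standalone sketch of that pattern; only the names that appear in the diff are taken as given, everything else (including the toy embedding function) is illustrative.

```python
# Sketch of the pattern the sim.py diff settles on: one shared embedding callable
# reused for indexing and querying, scored with 1 - cosine(...) as in the hunk above.
from typing import Callable, List
from scipy.spatial.distance import cosine


def rank_by_similarity(
    emb_call: Callable[[str], List[float]],  # stands in for the diff's self.emb_call
    corpus_embs: List[List[float]],          # precomputed embeddings ("embs" column)
    query: str,
) -> List[float]:
    query_emb = emb_call(query)
    return [1 - cosine(emb, query_emb) for emb in corpus_embs]


def fake_emb(text: str) -> List[float]:
    # Toy embedding standing in for the OpenAI embeddings call.
    return [float(len(text)), 1.0]


print(rank_by_similarity(fake_emb, [[5.0, 1.0], [1.0, 5.0]], "hello"))
```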
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.218
+ Version: 0.2.219
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -81,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
  Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


- ### Installation
+ ### Get Started
  To get started with the python library, you can install it using pip:

+ #### Installation and Setup
  ```bash
  pip install vision-agent
  ```
@@ -92,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
  variables (if you are using Azure OpenAI please see the Azure setup section):

  ```bash
- export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
- export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
  ```

- ### Basic Usage
+ ---
+ **NOTE**
+ You must have both Anthropic and OpenAI API keys set in your environment variables to
+ use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ ---
+
+ #### Chatting with VisionAgent
  To get started you can just import the `VisionAgent` and start chatting with it:
  ```python
  >>> from vision_agent.agent import VisionAgent
@@ -112,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
  in addition to those you can add `media` which is a list of media files that can either
  be images or video files.

+ #### Getting Code from VisionAgent
+ You can also use `VisionAgentCoder` to generate code for you:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ #### Don't have Anthropic/OpenAI API keys?
+ You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+ pull the models:
+
+ ```bash
+ ollama pull llama3.2-vision
+ ollama pull mxbai-embed-large
+ ```
+
+ Then you can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> from vision_agent.agent import OllamaVisionAgentCoder
+ >>> agent = OllamaVisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ ---
+ **NOTE**
+ Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+ will encounter many coding errors because they generate incorrect code or JSON decoding
+ errors because they generate incorrect JSON. We recommend using larger models or
+ Anthropic/OpenAI models.
+ ---
+
  ## Documentation

  [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
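The hunk above describes the chat message format only in prose (OpenAI-style `role`/`content` entries plus an optional `media` list), and the README code block that demonstrates it is cut off by the hunk boundary. A minimal sketch of what such a message list might look like; invoking the agent directly on the list is an assumption here, not something this diff confirms.

```python
# Hypothetical illustration of the message format described above: OpenAI-style
# role/content entries with an optional "media" list of image or video paths.
# Passing the message list straight to the agent call is assumed, not shown in the diff.
from vision_agent.agent import VisionAgent

agent = VisionAgent(verbosity=2)
conversation = [
    {
        "role": "user",
        "content": "Can you count the number of people in this image?",
        "media": ["people.jpg"],
    }
]
resp = agent(conversation)
```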
@@ -445,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
  `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

  ```bash
- ollama pull llama3.1
+ ollama pull llama3.2-vision
  ollama pull mxbai-embed-large
  ```

- `llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
- use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
- required by the agent. Since `llama3.1` cannot handle images you may see some
- performance degredation. `mxbai-embed-large` is the embedding model used to look up
- tools. You can use it just like you would use `VisionAgentCoder`:
+ `llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Because
+ `llama3.2-vision` is a smaller model you **WILL see performance degradation** compared to
+ using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+ look up tools. You can use it just like you would use `VisionAgentCoder`:

  ```python
  >>> import vision_agent as va
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
  vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
  vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
  vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
- vision_agent/agent/vision_agent_coder.py,sha256=ANwUuCO4JpTYJs4s6ynSRFcdjZFUVuSoSfcqp8ZQDDQ,27451
+ vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
  vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
- vision_agent/agent/vision_agent_planner.py,sha256=KWMA7XemcSmc_jn-MwdWz9wnKDtj-sYQ9tINi70_OoU,18583
- vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
+ vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+ vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
  vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
  vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
  vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -37,10 +37,10 @@ vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
  vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
  vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
- vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
+ vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
- vision_agent-0.2.218.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.218.dist-info/METADATA,sha256=Bh9yQRcNSytsUOIqztuXkUhSprPu-le7ncfb7owkc24,19122
- vision_agent-0.2.218.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.218.dist-info/RECORD,,
+ vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+ vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.219.dist-info/RECORD,,