vision-agent 0.2.218__py3-none-any.whl → 0.2.220__py3-none-any.whl

@@ -460,19 +460,37 @@ desc,doc,name
  -------
  >>> document_analysis(image)
  {'pages':
- [{'bbox': [0, 0, 1700, 2200],
- 'chunks': [{'bbox': [1371, 75, 1503, 112],
+ [{'bbox': [0, 0, 1.0, 1.0],
+ 'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
  'label': 'page_header',
  'order': 75,
  'caption': 'Annual Report 2024',
  'summary': 'This annual report summarizes ...' },
- {'bbox': [201, 1119, 1497, 1647],
+ {'bbox': [0.2, 0.9, 0.9, 1.0],
  'label': 'table',
  'order': 1119,
  'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
  'summary': 'This table illustrates a trend of ...'},
  ],
  ",document_extraction
+ "'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
+ 'document_qa' is a tool that can answer any questions about arbitrary documents,
+ presentations, or tables. It's very useful for document QA tasks, you can ask it a
+ specific question or ask it to return a JSON object answering multiple questions
+ about the document.
+
+ Parameters:
+ prompt (str): The question to be answered about the document image.
+ image (np.ndarray): The document image to analyze.
+
+ Returns:
+ str: The answer to the question based on the document's context.
+
+ Example
+ -------
+ >>> document_qa(question, image)
+ 'The answer to the question ...'
+ ",document_qa
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
  value selected for the video. It can detect multiple objects independently per
Binary file
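The `document_extraction` example above now reports bounding boxes normalized to the 0-1 range instead of pixel coordinates. A minimal sketch of converting a normalized box back to pixels, assuming `[x_min, y_min, x_max, y_max]` ordering and an HxWxC NumPy image (both assumptions, not stated in the diff):

```python
import numpy as np

def denormalize_bbox(bbox, image):
    # bbox: [x_min, y_min, x_max, y_max] in normalized 0-1 coordinates
    # image: HxWxC numpy array, e.g. a page rendered at 2200x1700 pixels
    h, w = image.shape[:2]
    x1, y1, x2, y2 = bbox
    return [int(x1 * w), int(y1 * h), int(x2 * w), int(y2 * h)]

page = np.zeros((2200, 1700, 3), dtype=np.uint8)
print(denormalize_bbox([0.8, 0.1, 1.0, 0.2], page))  # -> [1360, 220, 1700, 440]
```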
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.

  Pre-requisites:
- 1. Run ollama pull llama3.1 for the LLM
+ 1. Run ollama pull llama3.2-vision for the LMM
  2. Run ollama pull mxbai-embed-large for the embedding similarity model

- Technically you should use a VLM such as llava but llava is not able to handle the
- context length and crashes.
-
  Example
  -------
  >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  else planner
  ),
  coder=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if coder is None
  else coder
  ),
  tester=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if tester is None
  else tester
  ),
  debugger=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if debugger is None
  else debugger
  ),
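The two hunks above switch every default role model from `llama3.1` to `llama3.2-vision`. For context, a minimal sketch of constructing the coder with the new default explicitly, assuming `OllamaLMM` is importable from `vision_agent.lmm` and a local Ollama server is running (usage mirrors the README example further down):

```python
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM

# Equivalent to the new defaults: roles fall back to llama3.2-vision when not given.
coder = OllamaVisionAgentCoder(
    coder=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
)
code = coder("Count the number of people in this image", media="people.jpg")
```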
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
  from rich.console import Console
  from rich.markup import escape

- import vision_agent.tools as T
+ import vision_agent.tools.tools as T
  from vision_agent.agent import AgentCoder, AgentPlanner
  from vision_agent.agent.agent_utils import (
  DefaultImports,
@@ -34,7 +34,7 @@ from vision_agent.utils.execute import (
  CodeInterpreterFactory,
  Execution,
  )
- from vision_agent.utils.sim import Sim
+ from vision_agent.utils.sim import Sim, get_tool_recommender

  _CONSOLE = Console()

@@ -316,7 +316,7 @@ class VisionAgentCoderV2(AgentCoder):
  elif isinstance(tool_recommender, Sim):
  self.tool_recommender = tool_recommender
  else:
- self.tool_recommender = T.get_tool_recommender()
+ self.tool_recommender = get_tool_recommender()

  self.verbose = verbose
  self.code_sandbox_runtime = code_sandbox_runtime
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
  ) -> None:
  super().__init__(
  planner=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if planner is None
  else planner
  ),
@@ -62,10 +62,10 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
  image = load_image("image.jpg")
  owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
  # strip out the masks from the output because they don't provide useful information when printed
  f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

- cgd_out = countgd_counting(image)
+ cgd_out = countgd_object_detection("person", image)

- final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_object_detection": cgd_out}}
  print(final_out)
  --- END EXAMPLE1 ---

@@ -440,16 +440,17 @@ PICK_PLAN = """
  """

  CATEGORIZE_TOOL_REQUEST = """
- You are given a task: {task} from the user. Your task is to extract the type of category this task belongs to, it can be one or more of the following:
+ You are given a task: "{task}" from the user. You must extract the type of category this task belongs to; it can be one or more of the following:
  - "object detection and counting" - detecting objects or counting objects from a text prompt in an image or video.
  - "classification" - classifying objects in an image given a text prompt.
  - "segmentation" - segmenting objects in an image or video given a text prompt.
  - "OCR" - extracting text from an image.
  - "VQA" - answering questions about an image or video, can also be used for text extraction.
+ - "DocQA" - answering questions about a document or extracting information from a document.
  - "video object tracking" - tracking objects in a video.
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.

- Return the category or categories (comma separated) inside tags <category># your categories here</category>.
+ Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories rather than fewer.
  """

  TEST_TOOLS = """
@@ -473,7 +474,7 @@ TEST_TOOLS = """
  {examples}

  **Instructions**:
- 1. List all the tools under **Tools** and the user request. Write a program to load the media and call every tool in parallel and print it's output along with other relevant information.
+ 1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print its output along with other relevant information.
  2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
  3. Your test case MUST run only on the given images which are {media}
  4. Print this final dictionary.
@@ -43,7 +43,6 @@ from .tools import (
  flux_image_inpainting,
  generate_pose_image,
  get_tool_documentation,
- get_tool_recommender,
  gpt4o_image_vqa,
  gpt4o_video_vqa,
  load_image,
@@ -63,6 +62,7 @@ from .tools import (
  save_json,
  save_video,
  siglip_classification,
+ stella_embeddings,
  template_match,
  video_temporal_localization,
  vit_image_classification,
@@ -32,6 +32,7 @@ from vision_agent.utils.execute import (
  MimeType,
  )
  from vision_agent.utils.image_utils import convert_to_b64
+ from vision_agent.utils.sim import get_tool_recommender

  TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}

@@ -116,13 +117,11 @@ def run_tool_testing(
  query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
  category = extract_tag(query, "category") # type: ignore
  if category is None:
- category = task
+ query = task
  else:
- category = (
- f"I need models from the {category.strip()} category of tools. {task}"
- )
+ query = f"{category.strip()}. {task}"

- tool_docs = T.get_tool_recommender().top_k(category, k=10, thresh=0.2)
+ tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
  if exclude_tools is not None and len(exclude_tools) > 0:
  cleaned_tool_docs = []
  for tool_doc in tool_docs:
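A minimal sketch of the reworked retrieval step above; the task string and category value are hypothetical, while `get_tool_recommender` and the `top_k` arguments come straight from the diff:

```python
from vision_agent.utils.sim import get_tool_recommender

task = "count the number of people in image.jpg"  # hypothetical user task
category = "object detection and counting"        # parsed from the <category> tags

# New behavior: prefix the task with the extracted category and search the
# tool index with a smaller k and a higher similarity threshold.
query = f"{category.strip()}. {task}" if category else task
tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
```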
@@ -7,7 +7,6 @@ import urllib.request
  from base64 import b64encode
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from enum import Enum
- from functools import lru_cache
  from importlib import resources
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -49,7 +48,6 @@ from vision_agent.utils.image_utils import (
  rle_decode,
  rle_decode_array,
  )
- from vision_agent.utils.sim import Sim, load_cached_sim
  from vision_agent.utils.video import (
  extract_frames_from_video,
  frames_to_bytes,
@@ -85,11 +83,6 @@ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
  _LOGGER = logging.getLogger(__name__)


- @lru_cache(maxsize=1)
- def get_tool_recommender() -> Sim:
- return load_cached_sim(TOOLS_DF)
-
-
  def _display_tool_trace(
  function_name: str,
  request: Dict[str, Any],
@@ -2178,13 +2171,14 @@ def document_qa(
  prompt: str,
  image: np.ndarray,
  ) -> str:
- """'document_qa' is a tool that can answer any questions about arbitrary
- images of documents or presentations. It answers by analyzing the contextual document data
- and then using a model to answer specific questions. It returns text as an answer to the question.
+ """'document_qa' is a tool that can answer any questions about arbitrary documents,
+ presentations, or tables. It's very useful for document QA tasks, you can ask it a
+ specific question or ask it to return a JSON object answering multiple questions
+ about the document.

  Parameters:
- prompt (str): The question to be answered about the document image
- image (np.ndarray): The document image to analyze
+ prompt (str): The question to be answered about the document image.
+ image (np.ndarray): The document image to analyze.

  Returns:
  str: The answer to the question based on the document's context.
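The updated docstring notes that `document_qa` can also answer several questions at once by asking for a JSON object. A minimal sketch of both styles, assuming `document_qa` and `load_image` are importable from `vision_agent.tools.tools` and the file path is a placeholder:

```python
from vision_agent.tools.tools import document_qa, load_image

image = load_image("annual_report.png")  # hypothetical document image

# Single question
print(document_qa("What year does this report cover?", image))

# Several questions at once; the answer still comes back as a string,
# so parse it yourself if you need structured data.
answer = document_qa(
    'Return a JSON object with the keys "year" and "total_revenue".', image
)
print(answer)
```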
@@ -2203,7 +2197,7 @@ def document_qa(
  "model": "document-analysis",
  }

- data: dict[str, Any] = send_inference_request(
+ data: Dict[str, Any] = send_inference_request(
  payload=payload,
  endpoint_name="document-analysis",
  files=files,
@@ -2225,10 +2219,10 @@ def document_qa(
  data = normalize(data)

  prompt = f"""
- Document Context:
- {data}\n
- Question: {prompt}\n
- Please provide a clear, concise answer using only the information from the document. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
+ Document Context:
+ {data}\n
+ Question: {prompt}\n
+ Answer the question directly using only the information from the document; do not include any additional text besides the answer. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
  """

  lmm = AnthropicLMM()
@@ -2245,6 +2239,22 @@ def document_qa(
  return llm_output


+ def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
+ payload = {
+ "input": prompts,
+ "model": "stella1.5b",
+ }
+
+ data: Dict[str, Any] = send_inference_request(
+ payload=payload,
+ endpoint_name="embeddings",
+ v2=True,
+ metadata_payload={"function_name": "get_embeddings"},
+ is_form=True,
+ )
+ return [d["embedding"] for d in data] # type: ignore
+
+
  # Utility and visualization functions


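A minimal usage sketch of the new `stella_embeddings` helper added above, assuming the hosted `embeddings` endpoint is reachable with your API credentials; the example prompts are illustrative:

```python
import numpy as np
from vision_agent.tools import stella_embeddings

# Embed two short tool descriptions and compare them with cosine similarity.
embs = stella_embeddings(
    ["detect and count objects in an image", "answer questions about a document"]
)
a = np.asarray(embs[0], dtype=float)
b = np.asarray(embs[1], dtype=float)
print(float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b))))
```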
@@ -2781,6 +2791,7 @@ FUNCTION_TOOLS = [
  qwen2_vl_images_vqa,
  qwen2_vl_video_vqa,
  document_extraction,
+ document_qa,
  video_temporal_localization,
  flux_image_inpainting,
  siglip_classification,
@@ -7,4 +7,3 @@ from .execute import (
  Result,
  )
  from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim
- from .video import extract_frames_from_video, video_writer
@@ -28,10 +28,10 @@ from nbclient import __version__ as nbclient_version
  from nbclient.exceptions import CellTimeoutError, DeadKernelError
  from nbclient.util import run_sync
  from nbformat.v4 import new_code_cell
+ from opentelemetry.context import get_current
+ from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer
  from pydantic import BaseModel, field_serializer
  from typing_extensions import Self
- from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
- from opentelemetry.context import get_current

  from vision_agent.utils.exceptions import (
  RemoteSandboxCreationError,
@@ -11,7 +11,7 @@ import numpy as np
  from PIL import Image, ImageDraw, ImageFont
  from PIL.Image import Image as ImageType

- from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.video import extract_frames_from_video

  COLORS = [
  (158, 218, 229),
vision_agent/utils/sim.py CHANGED
@@ -12,6 +12,13 @@ import requests
  from openai import AzureOpenAI, OpenAI
  from scipy.spatial.distance import cosine # type: ignore

+ from vision_agent.tools.tools import TOOLS_DF, stella_embeddings
+
+
+ @lru_cache(maxsize=1)
+ def get_tool_recommender() -> "Sim":
+ return load_cached_sim(TOOLS_DF)
+

  @lru_cache(maxsize=512)
  def get_embedding(
@@ -27,13 +34,13 @@ def load_cached_sim(
  cached_dir_full_path = str(resources.files("vision_agent") / cached_dir)
  if os.path.exists(cached_dir_full_path):
  if tools_df is not None:
- if Sim.check_load(cached_dir_full_path, tools_df):
+ if StellaSim.check_load(cached_dir_full_path, tools_df):
  # don't pass sim_key to loaded Sim object or else it will re-calculate embeddings
- return Sim.load(cached_dir_full_path)
+ return StellaSim.load(cached_dir_full_path)
  if os.path.exists(cached_dir_full_path):
  shutil.rmtree(cached_dir_full_path)

- sim = Sim(tools_df, sim_key=sim_key)
+ sim = StellaSim(tools_df, sim_key=sim_key)
  sim.save(cached_dir_full_path)
  return sim

@@ -58,6 +65,11 @@ class Sim:
  """
  self.df = df
  self.client = OpenAI(api_key=api_key)
+ self.emb_call = (
+ lambda x: self.client.embeddings.create(input=x, model=model)
+ .data[0]
+ .embedding
+ )
  self.model = model
  if "embs" not in df.columns and sim_key is None:
  raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +77,7 @@ class Sim:
  if sim_key is not None:
  self.df["embs"] = self.df[sim_key].apply(
  lambda x: get_embedding(
- lambda text: self.client.embeddings.create(
- input=text, model=self.model
- )
- .data[0]
- .embedding,
+ self.emb_call,
  x,
  )
  )
@@ -126,9 +134,7 @@ class Sim:
  """

  embedding = get_embedding(
- lambda text: self.client.embeddings.create(input=text, model=self.model)
- .data[0]
- .embedding,
+ self.emb_call,
  query,
  )
  self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
@@ -215,6 +221,40 @@ class OllamaSim(Sim):
  )


+ class StellaSim(Sim):
+ def __init__(
+ self,
+ df: pd.DataFrame,
+ sim_key: Optional[str] = None,
+ ) -> None:
+ self.df = df
+
+ def emb_call(text: List[str]) -> List[float]:
+ return stella_embeddings(text)[0] # type: ignore
+
+ self.emb_call = emb_call
+
+ if "embs" not in df.columns and sim_key is None:
+ raise ValueError("key is required if no column 'embs' is present.")
+
+ if sim_key is not None:
+ self.df["embs"] = self.df[sim_key].apply(
+ lambda x: get_embedding(emb_call, x)
+ )
+
+ @staticmethod
+ def load(
+ load_dir: Union[str, Path],
+ api_key: Optional[str] = None,
+ model: str = "stella1.5b",
+ ) -> "StellaSim":
+ load_dir = Path(load_dir)
+ df = pd.read_csv(load_dir / "df.csv")
+ embs = np.load(load_dir / "embs.npy")
+ df["embs"] = list(embs)
+ return StellaSim(df)
+
+
  def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
  return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))

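A minimal sketch of how the new `StellaSim` path might be exercised, assuming a small pandas DataFrame with a `doc` column (the column name is illustrative) and network access for the remote embeddings endpoint:

```python
import pandas as pd
from vision_agent.utils.sim import StellaSim, get_tool_recommender

# Build a tiny similarity index; embeddings are computed via stella_embeddings
# because sim_key is provided.
df = pd.DataFrame(
    {"doc": ["detects objects in images", "answers questions about documents"]}
)
sim = StellaSim(df, sim_key="doc")
print(sim.top_k("document question answering", k=1))

# The cached tool recommender is now backed by the same StellaSim machinery.
recommender = get_tool_recommender()
```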
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.218
+ Version: 0.2.220
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -81,22 +81,26 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
  Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


- ### Installation
+ ### Get Started
  To get started with the python library, you can install it using pip:

+ #### Installation and Setup
  ```bash
  pip install vision-agent
  ```

- Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
- variables (if you are using Azure OpenAI please see the Azure setup section):
-
  ```bash
- export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
- export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+ export ANTHROPIC_API_KEY="your-api-key"
  ```

- ### Basic Usage
+ ---
+ **NOTE**
+ You must have the Anthropic API key set in your environment variables to use
+ VisionAgent. If you don't have an Anthropic key you can use another provider like
+ OpenAI or Ollama.
+ ---
+
+ #### Chatting with VisionAgent
  To get started you can just import the `VisionAgent` and start chatting with it:
  ```python
  >>> from vision_agent.agent import VisionAgent
@@ -112,6 +116,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
  in addition to those you can add `media` which is a list of media files that can either
  be images or video files.

+ #### Getting Code from VisionAgent
+ You can also use `VisionAgentCoder` to generate code for you:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ #### Don't have Anthropic/OpenAI API keys?
+ You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+ pull the models:
+
+ ```bash
+ ollama pull llama3.2-vision
+ ollama pull mxbai-embed-large
+ ```
+
+ Then you can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> from vision_agent.agent import OllamaVisionAgentCoder
+ >>> agent = OllamaVisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ ---
+ **NOTE**
+ Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+ will encounter many coding errors because they generate incorrect code, or JSON decoding
+ errors because they generate malformed JSON. We recommend using larger models or
+ Anthropic/OpenAI models.
+ ---
+
  ## Documentation

  [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -120,8 +158,7 @@ be images or video files.
  ### Chatting and Message Formats
  `VisionAgent` is an agent that can chat with you and call other tools or agents to
  write vision code for you. You can interact with it like you would ChatGPT or any other
- chatbot. The agent uses Clause-3.5 for it's LMM and OpenAI for embeddings for searching
- for tools.
+ chatbot. The agent uses Claude-3.5 for its LMM.

  The message format is:
  ```json
@@ -445,15 +482,14 @@ Usage is the same as `VisionAgentCoder`:
  `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

  ```bash
- ollama pull llama3.1
+ ollama pull llama3.2-vision
  ollama pull mxbai-embed-large
  ```

- `llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
- use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
- required by the agent. Since `llama3.1` cannot handle images you may see some
- performance degredation. `mxbai-embed-large` is the embedding model used to look up
- tools. You can use it just like you would use `VisionAgentCoder`:
+ `llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Because
+ `llama3.2-vision` is a smaller model you **WILL see performance degradation** compared to
+ using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+ look up tools. You can use it just like you would use `VisionAgentCoder`:

  ```python
  >>> import vision_agent as va
@@ -1,5 +1,5 @@
- vision_agent/.sim_tools/df.csv,sha256=nHhcCD55RO9XTiWq_uQ8pHKkVxLXciCHH-SbGPAQEy0,41969
- vision_agent/.sim_tools/embs.npy,sha256=UmnXd2Zv1xBu4a7pxHHf4wOhTLKub629rVX9fAusTxY,393344
+ vision_agent/.sim_tools/df.csv,sha256=aTpXEOmdIhomzCB4c_qprECxrxTdudet3FK_C3TTzKw,42927
+ vision_agent/.sim_tools/embs.npy,sha256=22NYWI7NswR367TwyD3s8I8td7ai2ZumgRLWsE6ajtM,270464
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/README.md,sha256=Q4w7FWw38qaWosQYAZ7NqWx8Q5XzuWrlv7nLhjUd1-8,5527
  vision_agent/agent/__init__.py,sha256=M8CffavdIh8Zh-skznLHIaQkYGCGK7vk4dq1FaVkbs4,617
@@ -7,13 +7,13 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
  vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
  vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
  vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
- vision_agent/agent/vision_agent_coder.py,sha256=ANwUuCO4JpTYJs4s6ynSRFcdjZFUVuSoSfcqp8ZQDDQ,27451
+ vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
- vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
- vision_agent/agent/vision_agent_planner.py,sha256=KWMA7XemcSmc_jn-MwdWz9wnKDtj-sYQ9tINi70_OoU,18583
- vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
- vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
+ vision_agent/agent/vision_agent_coder_v2.py,sha256=G3I8O89gzE2VczQGPWV149aYaOjbbfB1lmgGuwFWvo4,16118
+ vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+ vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
+ vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=TENjNF_mQHw6RHic24TrP5b7-Q5KWiJardtP9_hitdw,33420
  vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
  vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
  vision_agent/agent/vision_agent_prompts_v2.py,sha256=-vCWat-ARlCOOOeIDIFhg-kcwRRwjTXYEwsvvqPeaCs,1972
@@ -26,21 +26,21 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
  vision_agent/lmm/lmm.py,sha256=x_nIyDNDZwq4-pfjnJTmcyyJZ2_B7TjkA5jZp88YVO8,17103
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=Jdq34jMw_KuYwk4Wexqm4DRjuLLoL1Q8wukm0NBv1Tc,2812
+ vision_agent/tools/__init__.py,sha256=XrViKgxqMjY1Ep3QCBootXqftDBlrtH6M7qIVXOq_MY,2809
  vision_agent/tools/meta_tools.py,sha256=TPeS7QWnc_PmmU_ndiDT03dXbQ5yDSP33E7U8cSj7Ls,28660
- vision_agent/tools/planner_tools.py,sha256=tU1qz_VIQM_yPKDmuxjMWu68ZlAZ7ePWI1g7zswyWhI,13540
+ vision_agent/tools/planner_tools.py,sha256=CvaJ2vGM8O_CYvsoSk1avxAMqpIu3tv4C2bY0p1X-X4,13519
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=LAnrb_nY6PNVamqJahRN-J0cuOy4gsKvCtSuXJf0RsI,10075
- vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,96112
+ vision_agent/tools/tools.py,sha256=FAfqeXCjHH7glqDte9arq0JhFyMbGXtkNLRGQ7udSPY,96411
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
- vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
+ vision_agent/utils/__init__.py,sha256=QKk4zVjMwGxQI0MQ-aZZA50N-qItxRY4EB9CwQkZ2HY,185
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
- vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
- vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
- vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
+ vision_agent/utils/execute.py,sha256=vOEP5Ys7S2lc0_7pOJbgk7OaWi85hrCNu9_8Bo3zk6I,29356
+ vision_agent/utils/image_utils.py,sha256=z_ONgcza125B10NkoGwPOzXnL470bpTWZbkB16NeeH0,12188
+ vision_agent/utils/sim.py,sha256=u6a04nOc34FJcpMN0mb9eZImDIfNGYQscHWN53Hav6I,8482
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
- vision_agent-0.2.218.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.218.dist-info/METADATA,sha256=Bh9yQRcNSytsUOIqztuXkUhSprPu-le7ncfb7owkc24,19122
- vision_agent-0.2.218.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.218.dist-info/RECORD,,
+ vision_agent-0.2.220.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.220.dist-info/METADATA,sha256=f4eSfCTMOzJWa2cTUbU01362RE6dZj3Ix43BXf0pdr0,20039
+ vision_agent-0.2.220.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.220.dist-info/RECORD,,