vision-agent 0.2.218__py3-none-any.whl → 0.2.219__py3-none-any.whl

vision_agent/agent/vision_agent_coder.py CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  """VisionAgentCoder that uses Ollama models for planning, coding, testing.

  Pre-requisites:
- 1. Run ollama pull llama3.1 for the LLM
+ 1. Run ollama pull llama3.2-vision for the LMM
  2. Run ollama pull mxbai-embed-large for the embedding similarity model

- Technically you should use a VLM such as llava but llava is not able to handle the
- context length and crashes.
-
  Example
  -------
  >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
  else planner
  ),
  coder=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if coder is None
  else coder
  ),
  tester=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if tester is None
  else tester
  ),
  debugger=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if debugger is None
  else debugger
  ),
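The two hunks above only swap the default model name; the constructor keywords stay as before. For illustration, here is a minimal sketch of overriding those defaults with explicit `OllamaLMM` instances. The `vision_agent.lmm` import path is an assumption; the `coder`/`tester`/`debugger` keywords and the call with `media=` mirror what the diff and README show.

```python
# Minimal sketch: supplying the Ollama model explicitly instead of relying on the
# new llama3.2-vision defaults shown in the diff above.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM  # import path assumed, not shown in this diff

lmm = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
agent = OllamaVisionAgentCoder(coder=lmm, tester=lmm, debugger=lmm, verbosity=2)
code = agent("Count the number of people in this image", media="people.jpg")
```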
vision_agent/agent/vision_agent_planner.py CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
  ) -> None:
  super().__init__(
  planner=(
- OllamaLMM(model_name="llama3.1", temperature=0.0)
+ OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
  if planner is None
  else planner
  ),
vision_agent/agent/vision_agent_planner_prompts.py CHANGED
@@ -62,10 +62,10 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
  image = load_image("image.jpg")
  owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
  # strip out the masks from the output because they don't provide useful information when printed
  f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

- cgd_out = countgd_counting(image)
+ cgd_out = countgd_object_detection("person", image)

- final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_object_detection": cgd_out}}
  print(final_out)
  --- END EXAMPLE1 ---
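plan2 above says to count the detections labeled 'person', but the example code stops at printing the raw tool outputs. A hypothetical follow-up step; the detection schema (a list of dicts carrying a `label` key) is an assumption, not something this diff shows.

```python
# Hypothetical continuation of the plans above: reduce detector output to a count.
# The {"label": ..., "score": ...} shape of each detection is assumed here.
cgd_out = [{"label": "person", "score": 0.91}, {"label": "person", "score": 0.88}]
person_count = len([d for d in cgd_out if d["label"] == "person"])
print(f"people detected: {person_count}")  # people detected: 2
```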
vision_agent/utils/sim.py CHANGED
@@ -58,6 +58,11 @@ class Sim:
  """
  self.df = df
  self.client = OpenAI(api_key=api_key)
+ self.emb_call = (
+ lambda x: self.client.embeddings.create(input=x, model=model)
+ .data[0]
+ .embedding
+ )
  self.model = model
  if "embs" not in df.columns and sim_key is None:
  raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
  if sim_key is not None:
  self.df["embs"] = self.df[sim_key].apply(
  lambda x: get_embedding(
- lambda text: self.client.embeddings.create(
- input=text, model=self.model
- )
- .data[0]
- .embedding,
+ self.emb_call,
  x,
  )
  )
@@ -126,9 +127,7 @@ class Sim:
  """

  embedding = get_embedding(
- lambda text: self.client.embeddings.create(input=text, model=self.model)
- .data[0]
- .embedding,
+ self.emb_call,
  query,
  )
  self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
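The three sim.py hunks above converge on a single `self.emb_call` that is reused both when the dataframe's `embs` column is built and when a query is scored with `1 - cosine(...)`. A minimal standalone sketch of that pattern; only the names that appear in the diff are taken as given, everything else (including the toy embedding function) is illustrative.

```python
# Sketch of the pattern the sim.py diff settles on: one shared embedding callable
# reused for indexing and querying, scored with 1 - cosine(...) as in the hunk above.
from typing import Callable, List
from scipy.spatial.distance import cosine


def rank_by_similarity(
    emb_call: Callable[[str], List[float]],  # stands in for the diff's self.emb_call
    corpus_embs: List[List[float]],          # precomputed embeddings ("embs" column)
    query: str,
) -> List[float]:
    query_emb = emb_call(query)
    return [1 - cosine(emb, query_emb) for emb in corpus_embs]


def fake_emb(text: str) -> List[float]:
    # Toy embedding standing in for the OpenAI embeddings call.
    return [float(len(text)), 1.0]


print(rank_by_similarity(fake_emb, [[5.0, 1.0], [1.0, 5.0]], "hello"))
```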
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.218
+ Version: 0.2.219
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -81,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
  Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


- ### Installation
+ ### Get Started
  To get started with the python library, you can install it using pip:

+ #### Installation and Setup
  ```bash
  pip install vision-agent
  ```
@@ -92,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
  variables (if you are using Azure OpenAI please see the Azure setup section):

  ```bash
- export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
- export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
  ```

- ### Basic Usage
+ ---
+ **NOTE**
+ You must have both Anthropic and OpenAI API keys set in your environment variables to
+ use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ ---
+
+ #### Chatting with VisionAgent
  To get started you can just import the `VisionAgent` and start chatting with it:
  ```python
  >>> from vision_agent.agent import VisionAgent
@@ -112,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
  in addition to those you can add `media` which is a list of media files that can either
  be images or video files.

+ #### Getting Code from VisionAgent
+ You can also use `VisionAgentCoder` to generate code for you:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ #### Don't have Anthropic/OpenAI API keys?
+ You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+ pull the models:
+
+ ```bash
+ ollama pull llama3.2-vision
+ ollama pull mxbai-embed-large
+ ```
+
+ Then you can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> from vision_agent.agent import OllamaVisionAgentCoder
+ >>> agent = OllamaVisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ ---
+ **NOTE**
+ Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+ will encounter many coding errors because they generate incorrect code or JSON decoding
+ errors because they generate incorrect JSON. We recommend using larger models or
+ Anthropic/OpenAI models.
+ ---
+
  ## Documentation

  [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
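The hunk above describes the chat message format only in prose (OpenAI-style `role`/`content` entries plus an optional `media` list), and the README code block that demonstrates it is cut off by the hunk boundary. A minimal sketch of what such a message list might look like; invoking the agent directly on the list is an assumption here, not something this diff confirms.

```python
# Hypothetical illustration of the message format described above: OpenAI-style
# role/content entries with an optional "media" list of image or video paths.
# Passing the message list straight to the agent call is assumed, not shown in the diff.
from vision_agent.agent import VisionAgent

agent = VisionAgent(verbosity=2)
conversation = [
    {
        "role": "user",
        "content": "Can you count the number of people in this image?",
        "media": ["people.jpg"],
    }
]
resp = agent(conversation)
```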
@@ -445,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
  `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

  ```bash
- ollama pull llama3.1
+ ollama pull llama3.2-vision
  ollama pull mxbai-embed-large
  ```

- `llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
- use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
- required by the agent. Since `llama3.1` cannot handle images you may see some
- performance degredation. `mxbai-embed-large` is the embedding model used to look up
- tools. You can use it just like you would use `VisionAgentCoder`:
+ `llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Because
+ `llama3.2-vision` is a smaller model you **WILL see performance degradation** compared to
+ using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+ look up tools. You can use it just like you would use `VisionAgentCoder`:

  ```python
  >>> import vision_agent as va
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
  vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
  vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
  vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
- vision_agent/agent/vision_agent_coder.py,sha256=ANwUuCO4JpTYJs4s6ynSRFcdjZFUVuSoSfcqp8ZQDDQ,27451
+ vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
  vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
- vision_agent/agent/vision_agent_planner.py,sha256=KWMA7XemcSmc_jn-MwdWz9wnKDtj-sYQ9tINi70_OoU,18583
- vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
+ vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+ vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
  vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
  vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
  vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -37,10 +37,10 @@ vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
  vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
  vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
- vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
+ vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
- vision_agent-0.2.218.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.218.dist-info/METADATA,sha256=Bh9yQRcNSytsUOIqztuXkUhSprPu-le7ncfb7owkc24,19122
- vision_agent-0.2.218.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.218.dist-info/RECORD,,
+ vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+ vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.219.dist-info/RECORD,,