vision-agent 0.2.218__py3-none-any.whl → 0.2.219__py3-none-any.whl
- vision_agent/agent/vision_agent_coder.py +4 -7
- vision_agent/agent/vision_agent_planner.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts.py +4 -4
- vision_agent/utils/sim.py +7 -8
- {vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/METADATA +51 -11
- {vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/RECORD +8 -8
- {vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder.py CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
 
     Pre-requisites:
-    1. Run ollama pull llama3.
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model
 
-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
         >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
             ),
             tester=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
vision_agent/agent/vision_agent_planner.py CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
             ),
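The coder and planner hunks above only swap the default Ollama model to `llama3.2-vision`. A minimal sketch of what the new defaults amount to, assuming a local Ollama server is running, the models from the prerequisites have already been pulled, and that `OllamaLMM` is importable from `vision_agent.lmm`:

```python
# Roughly equivalent to the 0.2.219 defaults: coder, tester and debugger are
# all backed by llama3.2-vision served by a local Ollama instance.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM

agent = OllamaVisionAgentCoder(
    coder=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
    tester=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
    debugger=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
    verbosity=2,
)
code = agent("Count the number of people in this image", media="people.jpg")
```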
vision_agent/agent/vision_agent_planner_prompts.py CHANGED
@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)
 
@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
-cgd_out =
+cgd_out = countgd_object_detection("person", image)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---
 
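The rewritten example plan above calls three detectors and prints their raw outputs (the double braces are prompt-template escaping). As a rough follow-on, reducing those outputs to per-tool person counts is just a matter of taking list lengths; the sketch below assumes each tool returns a list with one entry per detection, which is what the example's structure suggests:

```python
from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection

image = load_image("image.jpg")
final_out = {
    "owl_v2_image": owl_v2_image("person", image),
    # drop masks, as the prompt example does, since they are not useful when printed
    "florence2_sam2_image": [
        {k: v for k, v in o.items() if k != "mask"}
        for o in florence2_sam2_image("person", image)
    ],
    "countgd_object_detection": countgd_object_detection("person", image),
}
# One list entry per detection, so the per-tool person count is the list length.
counts = {name: len(dets) for name, dets in final_out.items()}
print(counts)
```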
vision_agent/utils/sim.py CHANGED
@@ -58,6 +58,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-                    lambda text: self.client.embeddings.create(
-                        input=text, model=self.model
-                    )
-                    .data[0]
-                    .embedding,
+                    self.emb_call,
                     x,
                 )
             )
@@ -126,9 +127,7 @@ class Sim:
         """
 
         embedding = get_embedding(
-            lambda text: self.client.embeddings.create(input=text, model=self.model)
-            .data[0]
-            .embedding,
+            self.emb_call,
             query,
         )
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
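The three hunks above hoist the OpenAI embedding call into a single `self.emb_call` callable that is then handed to `get_embedding` in both places it is needed. A minimal sketch of that shape outside the `Sim` class; `get_embedding` simply applying the callable is an assumption (the real helper may add caching), as are the scipy import and the default embedding model name:

```python
from typing import Callable, List

from openai import OpenAI
from scipy.spatial.distance import cosine


def get_embedding(emb_call: Callable[[str], List[float]], text: str) -> List[float]:
    # Assumed behaviour: just apply the embedding callable to the text.
    return emb_call(text)


client = OpenAI()
model = "text-embedding-3-small"  # assumed default; pass your own model name

# Built once, reused everywhere an embedding is needed (the point of the refactor).
emb_call = lambda x: client.embeddings.create(input=x, model=model).data[0].embedding

doc_emb = get_embedding(emb_call, "countgd_object_detection counts objects in an image")
query_emb = get_embedding(emb_call, "count the people in an image")
print(1 - cosine(doc_emb, query_emb))  # same scoring used for the "sim" column above
```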
{vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.218
+Version: 0.2.219
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -81,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.
 
 
-###
+### Get Started
 To get started with the python library, you can install it using pip:
 
+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
@@ -92,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
 variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key"
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
 ```
 
-
+---
+**NOTE**
+You must have both Anthropic and OpenAI API keys set in your environment variables to
+use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -112,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.
 
+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation
 
 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -445,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```
 
-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
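The README excerpt above describes the chat message format (OpenAI-style `role` and `content` keys, plus an optional `media` list of image or video paths) but stops short of a full example. A minimal sketch of that shape; passing a list of such dicts directly to the agent call is an assumption based on the excerpt, not something it shows verbatim:

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()

# OpenAI-style message with the extra "media" key described in the README.
conversation = [
    {
        "role": "user",
        "content": "Count the number of people in this image",
        "media": ["people.jpg"],
    }
]
response = agent(conversation)
print(response)
```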
{vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/RECORD CHANGED
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
 vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
 vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
 vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
 vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
-vision_agent/agent/vision_agent_planner.py,sha256=
-vision_agent/agent/vision_agent_planner_prompts.py,sha256=
+vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
 vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
 vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -37,10 +37,10 @@ vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
 vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
 vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
-vision_agent/utils/sim.py,sha256=
+vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.219.dist-info/RECORD,,
{vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/LICENSE: File without changes
{vision_agent-0.2.218.dist-info → vision_agent-0.2.219.dist-info}/WHEEL: File without changes