vision-agent 0.2.218__tar.gz → 0.2.219__tar.gz
- {vision_agent-0.2.218 → vision_agent-0.2.219}/PKG-INFO +51 -11
- {vision_agent-0.2.218 → vision_agent-0.2.219}/README.md +50 -10
- {vision_agent-0.2.218 → vision_agent-0.2.219}/pyproject.toml +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder.py +4 -7
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner.py +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts.py +4 -4
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/sim.py +7 -8
- {vision_agent-0.2.218 → vision_agent-0.2.219}/LICENSE +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/.sim_tools/df.csv +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/planner_tools.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.218 → vision_agent-0.2.219}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.218
+Version: 0.2.219
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -81,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.
 
 
-###
+### Get Started
 To get started with the python library, you can install it using pip:
 
+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
@@ -92,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
 variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key"
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
 ```
 
-
+---
+**NOTE**
+You must have both Anthropic and OpenAI API keys set in your environment variables to
+use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -112,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.
 
+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation
 
 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -445,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```
 
-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
{vision_agent-0.2.218 → vision_agent-0.2.219}/README.md

@@ -36,9 +36,10 @@
@@ -47,11 +48,17 @@
@@ -67,6 +74,40 @@
@@ -400,15 +441,14 @@
The README.md hunks are identical in content to the PKG-INFO hunks above (PKG-INFO embeds the README as the package long description); only the line offsets differ.
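The README hunks above describe an OpenAI-style message format with `role` and `content` keys plus an extra `media` list of image or video files. As a minimal sketch of what such a message list might look like when handed to `VisionAgent` (the exact call signature and return shape are not shown in this diff, so treat them as assumptions):

```python
# Minimal sketch of the role/content/media message format described in the
# README text above. The agent call signature and return value are assumed,
# not confirmed by this diff.
from vision_agent.agent import VisionAgent

agent = VisionAgent()

conversation = [
    {
        "role": "user",
        "content": "Can you count the number of people in this image?",
        # `media` is a list of image or video file paths, per the README.
        "media": ["people.jpg"],
    }
]

# Assumed: calling the agent with a message list returns the updated
# conversation, including the assistant's reply.
response = agent(conversation)
print(response)
```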
{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder.py

@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
 
     Pre-requisites:
-    1. Run ollama pull llama3.
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model
 
-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
     >>> image vision_agent as va

@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
            ),
            tester=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
            ),
            debugger=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
            ),
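Because the hunk above wires `llama3.2-vision` into the `planner`, `coder`, `tester`, and `debugger` keyword arguments only when they are `None`, any of those roles can also be overridden explicitly. A hedged sketch of doing so (assuming `OllamaLMM` is importable from `vision_agent.lmm`; the keyword names and `verbosity=2` usage come from the diff and README above):

```python
# Sketch: overriding the default Ollama-backed coder model while keeping the
# other roles on their defaults. Assumes OllamaLMM is exported from
# vision_agent.lmm; the keyword names mirror the constructor shown above.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM

# Any model already pulled into the local Ollama server could be passed here.
custom_coder = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)

agent = OllamaVisionAgentCoder(coder=custom_coder, verbosity=2)
code = agent("Count the number of people in this image", media="people.jpg")
print(code)
```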
{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner.py

@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
             ),
{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts.py

@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)
 

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
-cgd_out =
+cgd_out = countgd_object_detection("person", image)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---
 
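The updated plan3 in this prompt has the planner run three detectors on the same image and compare their outputs. A rough sketch of the comparison that generated code builds toward (assuming each tool returns a list with one entry per detected object, which the prompt's own usage implies but this diff does not spell out):

```python
# Sketch of the detector comparison the prompt example builds toward.
# Assumes each tool returns a list with one entry per detection, as implied
# by how the prompt strips masks and prints the combined output.
from vision_agent.tools import (
    load_image,
    owl_v2_image,
    florence2_sam2_image,
    countgd_object_detection,
)

image = load_image("image.jpg")

results = {
    "owl_v2_image": owl_v2_image("person", image),
    "florence2_sam2_image": florence2_sam2_image("person", image),
    "countgd_object_detection": countgd_object_detection("person", image),
}

# Comparing per-tool counts makes it easy to see which detectors agree.
counts = {tool: len(detections) for tool, detections in results.items()}
print(counts)
```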
{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/sim.py

@@ -58,6 +58,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")

@@ -65,11 +70,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-
-                    input=text, model=self.model
-                )
-                .data[0]
-                .embedding,
+                    self.emb_call,
                     x,
                 )
             )

@@ -126,9 +127,7 @@ class Sim:
         """
 
         embedding = get_embedding(
-
-            .data[0]
-            .embedding,
+            self.emb_call,
             query,
         )
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
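The `Sim` refactor above bundles the OpenAI request and the `.data[0].embedding` unwrapping into a single `self.emb_call` callable, which is then handed to `get_embedding` in both places that need a vector. A minimal sketch of that pattern, assuming `get_embedding` simply invokes the callable (its real body is not shown in this diff and may also add caching):

```python
# Sketch of the "store the embedding call as one callable" pattern from the
# sim.py hunks above. get_embedding here is a stand-in: the diff only shows
# it being called as get_embedding(emb_call, text), so this body is assumed.
from typing import Callable, List


def get_embedding(emb_call: Callable[[List[str]], List[float]], text: str) -> List[float]:
    # One place to normalize the text before embedding it (illustrative).
    text = text.replace("\n", " ")
    return emb_call([text])


class TinySim:
    """Toy stand-in for Sim that only demonstrates the emb_call wiring."""

    def __init__(self, client, model: str = "text-embedding-3-small") -> None:
        self.client = client
        # Provider-specific request plus response unwrapping, bundled once,
        # mirroring self.emb_call in the diff above.
        self.emb_call = (
            lambda x: self.client.embeddings.create(input=x, model=model)
            .data[0]
            .embedding
        )

    def embed(self, text: str) -> List[float]:
        return get_embedding(self.emb_call, text)
```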