vision-agent 0.2.217__py3-none-any.whl → 0.2.219__py3-none-any.whl
- vision_agent/agent/vision_agent_coder.py +4 -7
- vision_agent/agent/vision_agent_planner.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts.py +4 -4
- vision_agent/utils/execute.py +40 -17
- vision_agent/utils/sim.py +7 -8
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/METADATA +52 -11
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/RECORD +9 -9
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
 
     Pre-requisites:
-    1. Run ollama pull llama3.
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model
 
-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
         >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
             ),
             tester=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
vision_agent/agent/vision_agent_planner.py
CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
            ),
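Both hunks above make the same swap: every Ollama-backed role (planner, coder, tester, debugger) now defaults to `llama3.2-vision` at temperature 0.0. A minimal sketch of what this means at the call site, assuming `OllamaLMM` is importable from `vision_agent.lmm` (an import path not shown in this diff) and that the models have already been pulled with `ollama pull`:

```python
# Sketch: the new Ollama defaults, per the hunks above.
# Assumes OllamaLMM lives in vision_agent.lmm (not shown in this diff)
# and that `ollama pull llama3.2-vision` and `ollama pull mxbai-embed-large`
# have already been run.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM

# With no arguments, planner/coder/tester/debugger all fall back to
# OllamaLMM(model_name="llama3.2-vision", temperature=0.0).
agent = OllamaVisionAgentCoder(verbosity=2)

# Equivalent explicit construction, mirroring the new defaults:
agent = OllamaVisionAgentCoder(
    coder=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
    tester=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
    debugger=OllamaLMM(model_name="llama3.2-vision", temperature=0.0),
)

code = agent("Count the number of people in this image", media="people.jpg")
```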
vision_agent/agent/vision_agent_planner_prompts.py
CHANGED
@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)
 
@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
-cgd_out =
+cgd_out = countgd_object_detection("person", image)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---
 
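Rendered outside the prompt template (the doubled `{{}}` braces become plain dict braces), the completed EXAMPLE1 program reads as below; a sketch that assumes the tool signatures exactly as the prompt uses them, and substitutes `f2s2_out` where the prompt's added line appears to reference a stale `f2s2` name:

```python
# Sketch of the rendered EXAMPLE1 program from the planner prompt above.
# Assumes the vision_agent.tools signatures exactly as the prompt uses them.
from vision_agent.tools import (
    load_image,
    owl_v2_image,
    florence2_sam2_image,
    countgd_object_detection,
)

image = load_image("image.jpg")
owl_v2_out = owl_v2_image("person", image)

f2s2_out = florence2_sam2_image("person", image)
# strip out the masks from the output because they don't provide useful
# information when printed
f2s2_out = [{k: v for k, v in o.items() if k != "mask"} for o in f2s2_out]

cgd_out = countgd_object_detection("person", image)

final_out = {
    "owl_v2_image": owl_v2_out,
    "florence2_sam2_image": f2s2_out,  # the prompt writes `f2s2`, apparently a stale name
    "countgd_object_detection": cgd_out,
}
print(final_out)
```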
vision_agent/utils/execute.py
CHANGED
@@ -30,6 +30,8 @@ from nbclient.util import run_sync
 from nbformat.v4 import new_code_cell
 from pydantic import BaseModel, field_serializer
 from typing_extensions import Self
+from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
+from opentelemetry.context import get_current
 
 from vision_agent.utils.exceptions import (
     RemoteSandboxCreationError,
@@ -633,23 +635,44 @@ Timeout: {self.timeout}"""
         self._new_kernel()
 
     def exec_cell(self, code: str) -> Execution:
-        … (17 lines of the previous exec_cell body, elided in the source diff)
+        # track the exec_cell with opentelemetry trace
+        tracer = get_tracer(__name__)
+        context = get_current()
+        with tracer.start_as_current_span(
+            "notebook_cell_execution", kind=SpanKind.INTERNAL, context=context
+        ) as span:
+            try:
+                # Add code as span attribute
+                span.set_attribute("code", code)
+                span.set_attribute("cell_index", len(self.nb.cells))
+
+                self.nb.cells.append(new_code_cell(code))
+                cell = self.nb.cells[-1]
+                self.nb_client.execute_cell(cell, len(self.nb.cells) - 1)
+
+                result = _parse_local_code_interpreter_outputs(
+                    self.nb.cells[-1].outputs
+                )
+                span.set_status(Status(StatusCode.OK))
+                return result
+            except CellTimeoutError as e:
+                run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+                sleep(1)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except DeadKernelError as e:
+                self.restart_kernel()
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except Exception as e:
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
 
     def upload_file(self, file_path: Union[str, Path]) -> Path:
         with open(file_path, "rb") as f:
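The new `exec_cell` follows the standard `opentelemetry-api` span pattern: open a span around the work, attach attributes, set an OK/ERROR status, and record exceptions on the span. A self-contained sketch of that pattern (the `run_cell` helper is illustrative, not from the package; without a configured OpenTelemetry SDK the tracer is a no-op, which is why the new dependency is cheap):

```python
# Minimal sketch of the span pattern used by the new exec_cell.
# Only opentelemetry-api is required; with no SDK configured the tracer
# returned by get_tracer is a no-op.
from opentelemetry.context import get_current
from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer


def run_cell(code: str) -> str:
    """Illustrative stand-in for notebook cell execution (not the package's API)."""
    return str(eval(code))  # placeholder for the real work


def traced_exec(code: str) -> str:
    tracer = get_tracer(__name__)
    # Propagate the caller's context so this span nests under any active trace.
    with tracer.start_as_current_span(
        "notebook_cell_execution", kind=SpanKind.INTERNAL, context=get_current()
    ) as span:
        span.set_attribute("code", code)
        try:
            result = run_cell(code)
            span.set_status(Status(StatusCode.OK))
            return result
        except Exception as e:
            # Mark the span failed and keep the exception on the span.
            span.set_status(Status(StatusCode.ERROR, str(e)))
            span.record_exception(e)
            raise


print(traced_exec("1 + 1"))  # prints "2"
```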
vision_agent/utils/sim.py
CHANGED
@@ -58,6 +58,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-                    lambda text: self.client.embeddings.create(
-                        input=text, model=self.model
-                    )
-                    .data[0]
-                    .embedding,
+                    self.emb_call,
                     x,
                 )
             )
@@ -126,9 +127,7 @@ class Sim:
         """
 
         embedding = get_embedding(
-            lambda text: self.client.embeddings.create(input=text, model=self.model)
-            .data[0]
-            .embedding,
+            self.emb_call,
             query,
         )
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
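The sim.py refactor defines the provider-specific embedding call once (`self.emb_call`) and passes it into `get_embedding` at every call site, instead of rebuilding the `client.embeddings.create(...).data[0].embedding` chain inline. A runnable sketch of the pattern; the `get_embedding` body and the fake client below are assumptions for illustration, since the real wrapper is not shown in this diff:

```python
# Sketch: provider-agnostic embedding lookup via an injected callable.
# `get_embedding` here is an assumed thin wrapper; the package's own
# implementation may differ (e.g. add caching or normalization).
from typing import Callable, List


def get_embedding(emb_call: Callable[[str], List[float]], text: str) -> List[float]:
    # Flatten newlines, as embedding APIs commonly recommend.
    return emb_call(text.replace("\n", " "))


class FakeEmbeddingClient:
    """Stand-in for the OpenAI client so the sketch runs offline."""

    def embed(self, text: str) -> List[float]:
        return [float(len(text)), float(text.count(" "))]


client = FakeEmbeddingClient()
# One place defines how embeddings are fetched ...
emb_call = lambda x: client.embed(x)
# ... and every call site just passes the callable plus the text.
print(get_embedding(emb_call, "count the people\nin the image"))
```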
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.217
+Version: 0.2.219
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -23,6 +23,7 @@ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
+Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
@@ -80,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.
 
 
-### 
+### Get Started
 To get started with the python library, you can install it using pip:
 
+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
@@ -91,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
 variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key"
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
 ```
 
-
+---
+**NOTE**
+You must have both Anthropic and OpenAI API keys set in your environment variables to
+use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -111,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.
 
+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation
 
 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -444,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```
 
-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/RECORD
CHANGED
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
 vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
 vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
 vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
 vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
-vision_agent/agent/vision_agent_planner.py,sha256=
-vision_agent/agent/vision_agent_planner_prompts.py,sha256=
+vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
 vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
 vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -35,12 +35,12 @@ vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,9
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
 vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
-vision_agent/utils/sim.py,sha256=
+vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.217.dist-info/LICENSE,sha256=
-vision_agent-0.2.217.dist-info/METADATA,sha256=
-vision_agent-0.2.217.dist-info/WHEEL,sha256=
-vision_agent-0.2.217.dist-info/RECORD,,
+vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.219.dist-info/RECORD,,
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/LICENSE
File without changes
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/WHEEL
File without changes