vision-agent 0.2.217__py3-none-any.whl → 0.2.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder.py +4 -7
- vision_agent/agent/vision_agent_planner.py +1 -1
- vision_agent/agent/vision_agent_planner_prompts.py +4 -4
- vision_agent/utils/execute.py +40 -17
- vision_agent/utils/sim.py +7 -8
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/METADATA +52 -11
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/RECORD +9 -9
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.

     Pre-requisites:
-    1. Run ollama pull llama3.
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model

-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
     >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
             ),
             tester=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
vision_agent/agent/vision_agent_planner.py
CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
            ),
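The default shown above can also be supplied explicitly. A minimal sketch, assuming `OllamaVisionAgentPlanner` and `OllamaLMM` are importable from `vision_agent.agent` and `vision_agent.lmm` (import paths are not shown in this diff), and using only the constructor arguments that appear in the hunk:

```python
# Hypothetical override of the default Ollama planner model; import paths are
# assumed, and only arguments visible in the diff (model_name, temperature) are used.
from vision_agent.agent import OllamaVisionAgentPlanner
from vision_agent.lmm import OllamaLMM

planner = OllamaVisionAgentPlanner(
    planner=OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
)
```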
vision_agent/agent/vision_agent_planner_prompts.py
CHANGED
@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

-cgd_out =
+cgd_out = countgd_object_detection("person", image)

-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---

vision_agent/utils/execute.py
CHANGED
@@ -30,6 +30,8 @@ from nbclient.util import run_sync
 from nbformat.v4 import new_code_cell
 from pydantic import BaseModel, field_serializer
 from typing_extensions import Self
+from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
+from opentelemetry.context import get_current

 from vision_agent.utils.exceptions import (
     RemoteSandboxCreationError,
@@ -633,23 +635,44 @@ Timeout: {self.timeout}"""
         self._new_kernel()

     def exec_cell(self, code: str) -> Execution:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # track the exec_cell with opentelemetry trace
+        tracer = get_tracer(__name__)
+        context = get_current()
+        with tracer.start_as_current_span(
+            "notebook_cell_execution", kind=SpanKind.INTERNAL, context=context
+        ) as span:
+            try:
+                # Add code as span attribute
+                span.set_attribute("code", code)
+                span.set_attribute("cell_index", len(self.nb.cells))
+
+                self.nb.cells.append(new_code_cell(code))
+                cell = self.nb.cells[-1]
+                self.nb_client.execute_cell(cell, len(self.nb.cells) - 1)
+
+                result = _parse_local_code_interpreter_outputs(
+                    self.nb.cells[-1].outputs
+                )
+                span.set_status(Status(StatusCode.OK))
+                return result
+            except CellTimeoutError as e:
+                run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+                sleep(1)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except DeadKernelError as e:
+                self.restart_kernel()
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except Exception as e:
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)

     def upload_file(self, file_path: Union[str, Path]) -> Path:
         with open(file_path, "rb") as f:
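The new `exec_cell` wraps notebook cell execution in an OpenTelemetry span: it tags the cell code as an attribute, marks OK/ERROR status, and records exceptions. A minimal, self-contained sketch of that span pattern using the same `opentelemetry-api` calls; `run_code` is a placeholder for the real cell execution, and without an OpenTelemetry SDK configured these calls resolve to no-op spans:

```python
from opentelemetry.context import get_current
from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer

tracer = get_tracer(__name__)

def run_code(code: str) -> str:
    # placeholder for executing a notebook cell and collecting its outputs
    return code.upper()

def traced_exec(code: str) -> str:
    with tracer.start_as_current_span(
        "notebook_cell_execution", kind=SpanKind.INTERNAL, context=get_current()
    ) as span:
        span.set_attribute("code", code)
        try:
            result = run_code(code)
            span.set_status(Status(StatusCode.OK))
            return result
        except Exception as e:
            # mark the span failed and attach the exception event before re-raising
            span.set_status(Status(StatusCode.ERROR, str(e)))
            span.record_exception(e)
            raise
```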
vision_agent/utils/sim.py
CHANGED
@@ -58,6 +58,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-
-                    input=text, model=self.model
-                )
-                .data[0]
-                .embedding,
+                    self.emb_call,
                     x,
                 )
             )
@@ -126,9 +127,7 @@ class Sim:
         """

         embedding = get_embedding(
-
-            .data[0]
-            .embedding,
+            self.emb_call,
             query,
         )
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
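Both the `sim_key` bootstrap and the query path now route through `self.emb_call`, so the embedding backend is defined in one place. The body of `get_embedding` is not part of this diff; a plausible sketch, assuming it simply applies the injected callable (the newline normalization is an assumption, not taken from the source):

```python
from typing import Callable, List

def get_embedding(emb_call: Callable[[str], List[float]], text: str) -> List[float]:
    # Assumed helper shape: light text normalization, then delegate to the injected call.
    text = text.replace("\n", " ")
    return emb_call(text)
```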
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.217
+Version: 0.2.219
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -23,6 +23,7 @@ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
+Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
@@ -80,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


-###
+### Get Started
 To get started with the python library, you can install it using pip:

+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
@@ -91,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
 variables (if you are using Azure OpenAI please see the Azure setup section):

 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key"
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
 ```

-
+---
+**NOTE**
+You must have both Anthropic and OpenAI API keys set in your environment variables to
+use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -111,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.

+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation

 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -444,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```

-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:

 ```python
 >>> import vision_agent as va
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/RECORD
CHANGED
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
 vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
 vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
 vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
 vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
-vision_agent/agent/vision_agent_planner.py,sha256=
-vision_agent/agent/vision_agent_planner_prompts.py,sha256=
+vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
 vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
 vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -35,12 +35,12 @@ vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,9
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
 vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
-vision_agent/utils/sim.py,sha256=
+vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.219.dist-info/RECORD,,
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/LICENSE
File without changes
{vision_agent-0.2.217.dist-info → vision_agent-0.2.219.dist-info}/WHEEL
File without changes
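Each RECORD entry pairs a file path with a `sha256=` value and a size in bytes. To check an entry locally, a minimal sketch assuming the standard wheel RECORD encoding (urlsafe Base64 of the SHA-256 digest with `=` padding stripped); the file path is illustrative:

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: Path) -> str:
    # Recompute a RECORD-style hash: SHA-256 digest, urlsafe Base64, no padding.
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

p = Path("vision_agent/utils/sim.py")
print(record_hash(p), p.stat().st_size)
```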