vision-agent 0.2.217__py3-none-any.whl → 0.2.219__py3-none-any.whl

vision_agent/agent/vision_agent_coder.py CHANGED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.

     Pre-requisites:
-    1. Run ollama pull llama3.1 for the LLM
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model

-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
     >>> import vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
             ),
             tester=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
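Because the defaults above only apply when an argument is `None`, a caller can swap any single role for a model of their own. The following is a minimal sketch of both uses; it assumes `OllamaLMM` is importable from `vision_agent.lmm` and that a local Ollama server already has the pulled models available.

```python
# Minimal sketch (assumptions: vision_agent.lmm exposes OllamaLMM and an
# Ollama server with llama3.2-vision is running locally).
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM

# Rely on the defaults shown in the diff above...
agent = OllamaVisionAgentCoder(verbosity=2)

# ...or override a single role, e.g. supply your own deterministic coder model.
custom_coder = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
agent = OllamaVisionAgentCoder(coder=custom_coder, verbosity=2)

# code = agent("Count the number of people in this image", media="people.jpg")
```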
vision_agent/agent/vision_agent_planner.py CHANGED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
             ),
vision_agent/agent/vision_agent_planner_prompts.py CHANGED
@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output because they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

-cgd_out = countgd_counting(image)
+cgd_out = countgd_object_detection("person", image)

-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---
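The prompt example above reflects the renamed tool: where `countgd_counting(image)` took only an image, `countgd_object_detection` takes a text prompt plus the image. Below is a hedged sketch of counting with it, assuming it returns one detection dict per instance (as the example's use of the output suggests); the image path is hypothetical.

```python
# Sketch based on the usage shown above; "people.jpg" is a placeholder path.
from vision_agent.tools import load_image, countgd_object_detection

image = load_image("people.jpg")
detections = countgd_object_detection("person", image)
# Assuming one entry per detected instance, the count is just the list length.
print("people counted:", len(detections))
```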
 
vision_agent/utils/execute.py CHANGED
@@ -30,6 +30,8 @@ from nbclient.util import run_sync
 from nbformat.v4 import new_code_cell
 from pydantic import BaseModel, field_serializer
 from typing_extensions import Self
+from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
+from opentelemetry.context import get_current

 from vision_agent.utils.exceptions import (
     RemoteSandboxCreationError,
@@ -633,23 +635,44 @@ Timeout: {self.timeout}"""
         self._new_kernel()

     def exec_cell(self, code: str) -> Execution:
-        try:
-            self.nb.cells.append(new_code_cell(code))
-            cell = self.nb.cells[-1]
-            self.nb_client.execute_cell(cell, len(self.nb.cells) - 1)
-            return _parse_local_code_interpreter_outputs(self.nb.cells[-1].outputs)
-        except CellTimeoutError as e:
-            run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
-            sleep(1)
-            traceback_raw = traceback.format_exc().splitlines()
-            return Execution.from_exception(e, traceback_raw)
-        except DeadKernelError as e:
-            self.restart_kernel()
-            traceback_raw = traceback.format_exc().splitlines()
-            return Execution.from_exception(e, traceback_raw)
-        except Exception as e:
-            traceback_raw = traceback.format_exc().splitlines()
-            return Execution.from_exception(e, traceback_raw)
+        # track the exec_cell with opentelemetry trace
+        tracer = get_tracer(__name__)
+        context = get_current()
+        with tracer.start_as_current_span(
+            "notebook_cell_execution", kind=SpanKind.INTERNAL, context=context
+        ) as span:
+            try:
+                # Add code as span attribute
+                span.set_attribute("code", code)
+                span.set_attribute("cell_index", len(self.nb.cells))
+
+                self.nb.cells.append(new_code_cell(code))
+                cell = self.nb.cells[-1]
+                self.nb_client.execute_cell(cell, len(self.nb.cells) - 1)
+
+                result = _parse_local_code_interpreter_outputs(
+                    self.nb.cells[-1].outputs
+                )
+                span.set_status(Status(StatusCode.OK))
+                return result
+            except CellTimeoutError as e:
+                run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+                sleep(1)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except DeadKernelError as e:
+                self.restart_kernel()
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)
+            except Exception as e:
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                span.record_exception(e)
+                traceback_raw = traceback.format_exc().splitlines()
+                return Execution.from_exception(e, traceback_raw)

     def upload_file(self, file_path: Union[str, Path]) -> Path:
         with open(file_path, "rb") as f:
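The spans created above go through `opentelemetry-api` only, so they are no-ops until the host application configures an SDK and exporter. The following is a minimal sketch, assuming the separate `opentelemetry-sdk` package is installed (it is not a declared dependency of this wheel), of wiring a console exporter so the `notebook_cell_execution` spans become visible; this provider setup is standard OpenTelemetry usage, not part of vision-agent itself.

```python
# Minimal sketch (assumes `pip install opentelemetry-sdk`): route spans to stdout.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

# Any exec_cell call made after this point (e.g. via the local code interpreter)
# will now print a "notebook_cell_execution" span with the executed code and
# cell index recorded as attributes.
```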
vision_agent/utils/sim.py CHANGED
@@ -58,6 +58,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-                    lambda text: self.client.embeddings.create(
-                        input=text, model=self.model
-                    )
-                    .data[0]
-                    .embedding,
+                    self.emb_call,
                     x,
                 )
             )
@@ -126,9 +127,7 @@ class Sim:
        """

        embedding = get_embedding(
-            lambda text: self.client.embeddings.create(input=text, model=self.model)
-            .data[0]
-            .embedding,
+            self.emb_call,
            query,
        )
        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.217
+Version: 0.2.219
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -23,6 +23,7 @@ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
+Requires-Dist: opentelemetry-api (>=1.29.0,<2.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
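The new `opentelemetry-api` requirement is pulled in automatically when the wheel is installed. A quick, optional way to confirm that after upgrading (assuming a pip-based install):

```bash
pip install --upgrade "vision-agent==0.2.219"
pip show opentelemetry-api   # should report a 1.29+ release installed via Requires-Dist
```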
@@ -80,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some examples
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


-### Installation
+### Get Started
 To get started with the python library, you can install it using pip:

+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
@@ -91,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
 variables (if you are using Azure OpenAI please see the Azure setup section):

 ```bash
-export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
-export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+export ANTHROPIC_API_KEY="your-api-key"
+export OPENAI_API_KEY="your-api-key"
 ```

-### Basic Usage
+---
+**NOTE**
+You must have both Anthropic and OpenAI API keys set in your environment variables to
+use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -111,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` keys
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.

+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation

 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -444,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

 ```bash
-ollama pull llama3.1
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```

-`llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
-use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
-required by the agent. Since `llama3.1` cannot handle images you may see some
-performance degredation. `mxbai-embed-large` is the embedding model used to look up
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Because
+`llama3.2-vision` is a smaller model you **WILL see performance degradation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:

 ```python
 >>> import vision_agent as va
@@ -7,12 +7,12 @@ vision_agent/agent/agent.py,sha256=_1tHWAs7Jm5tqDzEcPfCRvJV3uRRveyh4n9_9pd6I1w,1
 vision_agent/agent/agent_utils.py,sha256=NmrqjhSb6fpnrB8XGWtaywZjr9n89otusOZpcbWLf9k,13534
 vision_agent/agent/types.py,sha256=DkFm3VMMrKlhYyfxEmZx4keppD72Ov3wmLCbM2J2o10,2437
 vision_agent/agent/vision_agent.py,sha256=I75bEU-os9Lf9OSICKfvQ_H_ftg-zOwgTwWnu41oIdo,23555
-vision_agent/agent/vision_agent_coder.py,sha256=ANwUuCO4JpTYJs4s6ynSRFcdjZFUVuSoSfcqp8ZQDDQ,27451
+vision_agent/agent/vision_agent_coder.py,sha256=flUxOibyGZK19BCSK5mhaD3HjCxHw6c6FtKom6N2q1E,27359
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
 vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=9v5HwbNidSzYUEFl6ZMniWWOmyLITM_moWLtKVaTen8,4845
 vision_agent/agent/vision_agent_coder_v2.py,sha256=WKYPJAliupxnF2TP5jZlinqxnID37xnYSDNGMwoFKwU,16092
-vision_agent/agent/vision_agent_planner.py,sha256=KWMA7XemcSmc_jn-MwdWz9wnKDtj-sYQ9tINi70_OoU,18583
-vision_agent/agent/vision_agent_planner_prompts.py,sha256=Y3jz9HRf8fz9NLUseN7cTgZqewP0RazxR7vw1sPhcn0,6691
+vision_agent/agent/vision_agent_planner.py,sha256=fFzjNkZBKkh8Y_oS06ATI4qz31xmIJvixb_tV1kX8KA,18590
+vision_agent/agent/vision_agent_planner_prompts.py,sha256=mn9NlZpRkW4XAvlNuMZwIs1ieHCFds5aYZJ55WXupZY,6733
 vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=UfazG0rogmTQk1dBtpQmLhmF4uPLWFssAqmqK0OQRnA,33237
 vision_agent/agent/vision_agent_planner_v2.py,sha256=vvxfmGydBIKB8CtNSAJyPvdEXkG7nIO5-Hs2SjNc48Y,20465
 vision_agent/agent/vision_agent_prompts.py,sha256=NtGdCfzzilCRtscKALC9FK55d1h4CBpMnbhLzg0PYlc,13772
@@ -35,12 +35,12 @@ vision_agent/tools/tools.py,sha256=Xcm_9EQdDCR9X5FhIm7VJaTL0qWqhnJUVTRVrRtETrA,9
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=ktJX1gWBk4D_tXeWV5olGUMC4dU_Z6m5oSv-6Yu1O0w,28292
+vision_agent/utils/execute.py,sha256=Qs-C9lnRBc3frUH_bmrwHLuJ9qjPykIytex8y4E0f7s,29356
 vision_agent/utils/image_utils.py,sha256=5uoYgXa6E0-lVrXR7K2XE7fe6r_n7pvK64HYQ50vG3w,12182
-vision_agent/utils/sim.py,sha256=f1emBQM8SmyVKSrhj0NHItnfMHSeTw-Nk2pw-0eBZ5c,7462
+vision_agent/utils/sim.py,sha256=sRbEfX5WVHJyE8VPTggXUdYbUM1Z9pF0trpHTAtWDWA,7348
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=e1VwKhXzzlC5LcFMyrcQYrPnpnX4wxDpnQ-76sB4jgM,6001
-vision_agent-0.2.217.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.217.dist-info/METADATA,sha256=xl9AmXP9RBpC5frlASsiG7YktdIOTRuJgv8WZdRV_bA,19071
-vision_agent-0.2.217.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.217.dist-info/RECORD,,
+vision_agent-0.2.219.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.219.dist-info/METADATA,sha256=AxTPK82zfoAwsFsHwVQvtHSr8UywSPYXZ5wlRbLiOXY,20287
+vision_agent-0.2.219.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.219.dist-info/RECORD,,