vision-agent 0.2.218__tar.gz → 0.2.219__tar.gz

Files changed (46)
  1. {vision_agent-0.2.218 → vision_agent-0.2.219}/PKG-INFO +51 -11
  2. {vision_agent-0.2.218 → vision_agent-0.2.219}/README.md +50 -10
  3. {vision_agent-0.2.218 → vision_agent-0.2.219}/pyproject.toml +1 -1
  4. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder.py +4 -7
  5. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner.py +1 -1
  6. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts.py +4 -4
  7. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/sim.py +7 -8
  8. {vision_agent-0.2.218 → vision_agent-0.2.219}/LICENSE +0 -0
  9. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/.sim_tools/df.csv +0 -0
  10. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/.sim_tools/embs.npy +0 -0
  11. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/__init__.py +0 -0
  12. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/README.md +0 -0
  13. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/agent_utils.py +0 -0
  16. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/types.py +0 -0
  17. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent.py +0 -0
  18. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  19. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
  20. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder_v2.py +0 -0
  21. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts_v2.py +0 -0
  22. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
  23. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_prompts.py +0 -0
  24. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
  25. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_v2.py +0 -0
  26. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/__init__.py +0 -0
  27. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/http.py +0 -0
  28. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/clients/landing_public_api.py +0 -0
  29. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/fonts/__init__.py +0 -0
  30. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  31. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/__init__.py +0 -0
  32. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/lmm.py +0 -0
  33. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/lmm/types.py +0 -0
  34. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/__init__.py +0 -0
  35. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/meta_tools.py +0 -0
  36. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/planner_tools.py +0 -0
  37. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/prompts.py +0 -0
  38. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tool_utils.py +0 -0
  39. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tools.py +0 -0
  40. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/tools/tools_types.py +0 -0
  41. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/__init__.py +0 -0
  42. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/exceptions.py +0 -0
  43. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/execute.py +0 -0
  44. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/image_utils.py +0 -0
  45. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/type_defs.py +0 -0
  46. {vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.218 → vision_agent-0.2.219}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.218
+ Version: 0.2.219
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -81,9 +81,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
  Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


- ### Installation
+ ### Get Started
  To get started with the python library, you can install it using pip:

+ #### Installation and Setup
  ```bash
  pip install vision-agent
  ```
@@ -92,11 +93,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
  variables (if you are using Azure OpenAI please see the Azure setup section):

  ```bash
- export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
- export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
  ```

- ### Basic Usage
+ ---
+ **NOTE**
+ You must have both Anthropic and OpenAI API keys set in your environment variables to
+ use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ ---
+
+ #### Chatting with VisionAgent
  To get started you can just import the `VisionAgent` and start chatting with it:
  ```python
  >>> from vision_agent.agent import VisionAgent
@@ -112,6 +119,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
  in addition to those you can add `media` which is a list of media files that can either
  be images or video files.

+ #### Getting Code from VisionAgent
+ You can also use `VisionAgentCoder` to generate code for you:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ #### Don't have Anthropic/OpenAI API keys?
+ You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+ pull the models:
+
+ ```bash
+ ollama pull llama3.2-vision
+ ollama pull mxbai-embed-large
+ ```
+
+ Then you can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> from vision_agent.agent import OllamaVisionAgentCoder
+ >>> agent = OllamaVisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ ---
+ **NOTE**
+ Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+ will encounter many coding errors because it generates incorrect code or JSON decoding
+ errors because it generates incorrect JSON. We recommend using larger models or
+ Anthropic/OpenAI models.
+ ---
+
  ## Documentation

  [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -445,15 +486,14 @@ Usage is the same as `VisionAgentCoder`:
  `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

  ```bash
- ollama pull llama3.1
+ ollama pull llama3.2-vision
  ollama pull mxbai-embed-large
  ```

- `llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
- use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
- required by the agent. Since `llama3.1` cannot handle images you may see some
- performance degredation. `mxbai-embed-large` is the embedding model used to look up
- tools. You can use it just like you would use `VisionAgentCoder`:
+ `llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+ `llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+ using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+ look up tools. You can use it just like you would use `VisionAgentCoder`:

  ```python
  >>> import vision_agent as va

{vision_agent-0.2.218 → vision_agent-0.2.219}/README.md

@@ -36,9 +36,10 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
  Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.


- ### Installation
+ ### Get Started
  To get started with the python library, you can install it using pip:

+ #### Installation and Setup
  ```bash
  pip install vision-agent
  ```
@@ -47,11 +48,17 @@ Ensure you have both an Anthropic key and an OpenAI API key and set in your envi
  variables (if you are using Azure OpenAI please see the Azure setup section):

  ```bash
- export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
- export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+ export ANTHROPIC_API_KEY="your-api-key"
+ export OPENAI_API_KEY="your-api-key"
  ```

- ### Basic Usage
+ ---
+ **NOTE**
+ You must have both Anthropic and OpenAI API keys set in your environment variables to
+ use VisionAgent. If you don't have an Anthropic key you can use Ollama as a backend.
+ ---
+
+ #### Chatting with VisionAgent
  To get started you can just import the `VisionAgent` and start chatting with it:
  ```python
  >>> from vision_agent.agent import VisionAgent
@@ -67,6 +74,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
  in addition to those you can add `media` which is a list of media files that can either
  be images or video files.

+ #### Getting Code from VisionAgent
+ You can also use `VisionAgentCoder` to generate code for you:
+
+ ```python
+ >>> from vision_agent.agent import VisionAgentCoder
+ >>> agent = VisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ #### Don't have Anthropic/OpenAI API keys?
+ You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+ pull the models:
+
+ ```bash
+ ollama pull llama3.2-vision
+ ollama pull mxbai-embed-large
+ ```
+
+ Then you can use it just like you would use `VisionAgentCoder`:
+
+ ```python
+ >>> from vision_agent.agent import OllamaVisionAgentCoder
+ >>> agent = OllamaVisionAgentCoder(verbosity=2)
+ >>> code = agent("Count the number of people in this image", media="people.jpg")
+ ```
+
+ ---
+ **NOTE**
+ Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+ will encounter many coding errors because it generates incorrect code or JSON decoding
+ errors because it generates incorrect JSON. We recommend using larger models or
+ Anthropic/OpenAI models.
+ ---
+
  ## Documentation

  [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -400,15 +441,14 @@ Usage is the same as `VisionAgentCoder`:
  `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

  ```bash
- ollama pull llama3.1
+ ollama pull llama3.2-vision
  ollama pull mxbai-embed-large
  ```

- `llama3.1` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Normally we would
- use an actual LMM such as `llava` but `llava` cannot handle the long context lengths
- required by the agent. Since `llama3.1` cannot handle images you may see some
- performance degredation. `mxbai-embed-large` is the embedding model used to look up
- tools. You can use it just like you would use `VisionAgentCoder`:
+ `llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+ `llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+ using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+ look up tools. You can use it just like you would use `VisionAgentCoder`:

  ```python
  >>> import vision_agent as va

{vision_agent-0.2.218 → vision_agent-0.2.219}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.218"
+ version = "0.2.219"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"

{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_coder.py

@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
      """VisionAgentCoder that uses Ollama models for planning, coding, testing.

      Pre-requisites:
-     1. Run ollama pull llama3.1 for the LLM
+     1. Run ollama pull llama3.2-vision for the LMM
      2. Run ollama pull mxbai-embed-large for the embedding similarity model

-     Technically you should use a VLM such as llava but llava is not able to handle the
-     context length and crashes.
-
      Example
      -------
      >>> image vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                  else planner
              ),
              coder=(
-                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                  if coder is None
                  else coder
              ),
              tester=(
-                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                  if tester is None
                  else tester
              ),
              debugger=(
-                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                  if debugger is None
                  else debugger
              ),
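
For readers following along, here is a minimal usage sketch of the constructor defaults changed above. It is not part of the diff: it assumes Ollama is running locally with the `llama3.2-vision` and `mxbai-embed-large` models already pulled, and it assumes `OllamaLMM` is importable from `vision_agent.lmm` (the hunks only show the class name, not its import path).

```python
# Sketch only, not part of the release diff: pinning one component of
# OllamaVisionAgentCoder to the new default model shown above.
from vision_agent.agent import OllamaVisionAgentCoder
from vision_agent.lmm import OllamaLMM  # import path assumed

# Leaving coder=None would now also fall back to llama3.2-vision per the
# change above; passing an explicit OllamaLMM just makes the choice visible.
coder_lmm = OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
agent = OllamaVisionAgentCoder(coder=coder_lmm, verbosity=2)
code = agent("Count the number of people in this image", media="people.jpg")
print(code)
```

The same default swap applies to the `tester` and `debugger` components in the hunk above, and to the planner default in the next file.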

{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner.py

@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
      ) -> None:
          super().__init__(
              planner=(
-                 OllamaLMM(model_name="llama3.1", temperature=0.0)
+                 OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                  if planner is None
                  else planner
              ),

{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/agent/vision_agent_planner_prompts.py

@@ -62,10 +62,10 @@ plan2:
  - Count the number of detected objects labeled as 'person'.
  plan3:
  - Load the image from the provided file path 'image.jpg'.
- - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+ - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

  ```python
- from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
  image = load_image("image.jpg")
  owl_v2_out = owl_v2_image("person", image)

@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
  # strip out the masks from the output becuase they don't provide useful information when printed
  f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

- cgd_out = countgd_counting(image)
+ cgd_out = countgd_object_detection("person", image)

- final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
  print(final_out)
  --- END EXAMPLE1 ---


{vision_agent-0.2.218 → vision_agent-0.2.219}/vision_agent/utils/sim.py

@@ -58,6 +58,11 @@ class Sim:
          """
          self.df = df
          self.client = OpenAI(api_key=api_key)
+         self.emb_call = (
+             lambda x: self.client.embeddings.create(input=x, model=model)
+             .data[0]
+             .embedding
+         )
          self.model = model
          if "embs" not in df.columns and sim_key is None:
              raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +70,7 @@ class Sim:
          if sim_key is not None:
              self.df["embs"] = self.df[sim_key].apply(
                  lambda x: get_embedding(
-                     lambda text: self.client.embeddings.create(
-                         input=text, model=self.model
-                     )
-                     .data[0]
-                     .embedding,
+                     self.emb_call,
                      x,
                  )
              )
@@ -126,9 +127,7 @@ class Sim:
          """

          embedding = get_embedding(
-             lambda text: self.client.embeddings.create(input=text, model=self.model)
-             .data[0]
-             .embedding,
+             self.emb_call,
              query,
          )
          self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))