vision-agent 0.2.218__tar.gz → 0.2.220__tar.gz
- {vision_agent-0.2.218 → vision_agent-0.2.220}/PKG-INFO +52 -16
- {vision_agent-0.2.218 → vision_agent-0.2.220}/README.md +51 -15
- {vision_agent-0.2.218 → vision_agent-0.2.220}/pyproject.toml +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/.sim_tools/df.csv +21 -3
- vision_agent-0.2.220/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder.py +4 -7
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder_v2.py +3 -3
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner.py +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner_prompts.py +4 -4
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner_prompts_v2.py +4 -3
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/__init__.py +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/planner_tools.py +4 -5
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/tools.py +28 -17
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/__init__.py +0 -1
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/execute.py +2 -2
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/image_utils.py +1 -1
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/sim.py +51 -11
- vision_agent-0.2.218/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/LICENSE +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder_prompts_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_prompts_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_v2.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.218 → vision_agent-0.2.220}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.218
+Version: 0.2.220
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -81,22 +81,26 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.
 
 
-###
+### Get Started
 To get started with the python library, you can install it using pip:
 
+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
 
-Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
-variables (if you are using Azure OpenAI please see the Azure setup section):
-
 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+export ANTHROPIC_API_KEY="your-api-key"
 ```
 
-
+---
+**NOTE**
+You must have the Anthropic API key set in your environment variables to use
+VisionAgent. If you don't have an Anthropic key you can use another provider like
+OpenAI or Ollama.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -112,6 +116,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.
 
+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation
 
 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -120,8 +158,7 @@ be images or video files.
 ### Chatting and Message Formats
 `VisionAgent` is an agent that can chat with you and call other tools or agents to
 write vision code for you. You can interact with it like you would ChatGPT or any other
-chatbot. The agent uses Clause-3.5 for it's LMM
-for tools.
+chatbot. The agent uses Clause-3.5 for it's LMM.
 
 The message format is:
 ```json
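For context on the message format these README hunks reference, here is a minimal hedged sketch of one chat turn using only the keys the README describes (`role`, `content`, and the optional `media` list). The file name is a placeholder, and passing the message list directly to the agent call is an assumption; the README's own examples call the agent with a plain string prompt.

```python
from vision_agent.agent import VisionAgent

# One chat turn in the format described above: OpenAI-style `role`/`content`
# plus the optional `media` list of image or video file paths.
conversation = [
    {
        "role": "user",
        "content": "Can you count the number of people in this image?",
        "media": ["people.jpg"],  # hypothetical local file
    }
]

agent = VisionAgent()
# Passing a message list directly is an assumption based on the described format.
response = agent(conversation)
print(response)
```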
@@ -445,15 +482,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```
 
-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
{vision_agent-0.2.218 → vision_agent-0.2.220}/README.md
RENAMED
@@ -36,22 +36,26 @@ You can also run VisionAgent in a local Jupyter Notebook. Here are some example
 Check out the [notebooks](https://github.com/landing-ai/vision-agent/blob/main/examples/notebooks) folder for more examples.
 
 
-###
+### Get Started
 To get started with the python library, you can install it using pip:
 
+#### Installation and Setup
 ```bash
 pip install vision-agent
 ```
 
-Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
-variables (if you are using Azure OpenAI please see the Azure setup section):
-
 ```bash
-export ANTHROPIC_API_KEY="your-api-key"
-export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
+export ANTHROPIC_API_KEY="your-api-key"
 ```
 
-
+---
+**NOTE**
+You must have the Anthropic API key set in your environment variables to use
+VisionAgent. If you don't have an Anthropic key you can use another provider like
+OpenAI or Ollama.
+---
+
+#### Chatting with VisionAgent
 To get started you can just import the `VisionAgent` and start chatting with it:
 ```python
 >>> from vision_agent.agent import VisionAgent
@@ -67,6 +71,40 @@ The chat messages are similar to `OpenAI`'s format with `role` and `content` key
 in addition to those you can add `media` which is a list of media files that can either
 be images or video files.
 
+#### Getting Code from VisionAgent
+You can also use `VisionAgentCoder` to generate code for you:
+
+```python
+>>> from vision_agent.agent import VisionAgentCoder
+>>> agent = VisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+#### Don't have Anthropic/OpenAI API keys?
+You can use `OllamaVisionAgentCoder` which uses Ollama as the backend. To get started
+pull the models:
+
+```bash
+ollama pull llama3.2-vision
+ollama pull mxbai-embed-large
+```
+
+Then you can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> from vision_agent.agent import OllamaVisionAgentCoder
+>>> agent = OllamaVisionAgentCoder(verbosity=2)
+>>> code = agent("Count the number of people in this image", media="people.jpg")
+```
+
+---
+**NOTE**
+Smaller open source models like Llama 3.1 8B will not work well with VisionAgent. You
+will encounter many coding errors because it generates incorrect code or JSON decoding
+errors because it generates incorrect JSON. We recommend using larger models or
+Anthropic/OpenAI models.
+---
+
 ## Documentation
 
 [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/)
@@ -75,8 +113,7 @@ be images or video files.
 ### Chatting and Message Formats
 `VisionAgent` is an agent that can chat with you and call other tools or agents to
 write vision code for you. You can interact with it like you would ChatGPT or any other
-chatbot. The agent uses Clause-3.5 for it's LMM
-for tools.
+chatbot. The agent uses Clause-3.5 for it's LMM.
 
 The message format is:
 ```json
@@ -400,15 +437,14 @@ Usage is the same as `VisionAgentCoder`:
 `OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
-ollama pull llama3.
+ollama pull llama3.2-vision
 ollama pull mxbai-embed-large
 ```
 
-`llama3.
-
-
-
-tools. You can use it just like you would use `VisionAgentCoder`:
+`llama3.2-vision` is used for the `OllamaLMM` for `OllamaVisionAgentCoder`. Becuase
+`llama3.2-vision` is a smaller model you **WILL see performance degredation** compared to
+using Anthropic or OpenAI models. `mxbai-embed-large` is the embedding model used to
+look up tools. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/.sim_tools/df.csv
RENAMED
@@ -460,19 +460,37 @@ desc,doc,name
 -------
 >>> document_analysis(image)
 {'pages':
-[{'bbox': [0, 0,
-'chunks': [{'bbox': [
+[{'bbox': [0, 0, 1.0, 1.0],
+'chunks': [{'bbox': [0.8, 0.1, 1.0, 0.2],
 'label': 'page_header',
 'order': 75
 'caption': 'Annual Report 2024',
 'summary': 'This annual report summarizes ...' },
-{'bbox': [
+{'bbox': [0.2, 0.9, 0.9, 1.0],
 'label': table',
 'order': 1119,
 'caption': [{'Column 1': 'Value 1', 'Column 2': 'Value 2'},
 'summary': 'This table illustrates a trend of ...'},
 ],
 ",document_extraction
+"'document_qa' is a tool that can answer any questions about arbitrary documents, presentations, or tables. It's very useful for document QA tasks, you can ask it a specific question or ask it to return a JSON object answering multiple questions about the document.","document_qa(prompt: str, image: numpy.ndarray) -> str:
+    'document_qa' is a tool that can answer any questions about arbitrary documents,
+    presentations, or tables. It's very useful for document QA tasks, you can ask it a
+    specific question or ask it to return a JSON object answering multiple questions
+    about the document.
+
+    Parameters:
+        prompt (str): The question to be answered about the document image.
+        image (np.ndarray): The document image to analyze.
+
+    Returns:
+        str: The answer to the question based on the document's context.
+
+    Example
+    -------
+    >>> document_qa(image, question)
+    'The answer to the question ...'
+    ",document_qa
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
     'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
vision_agent-0.2.220/vision_agent/.sim_tools/embs.npy
Binary file
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder.py
RENAMED
@@ -644,12 +644,9 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Ollama models for planning, coding, testing.
 
     Pre-requisites:
-    1. Run ollama pull llama3.
+    1. Run ollama pull llama3.2-vision for the LMM
     2. Run ollama pull mxbai-embed-large for the embedding similarity model
 
-    Technically you should use a VLM such as llava but llava is not able to handle the
-    context length and crashes.
-
     Example
     -------
     >>> image vision_agent as va
@@ -674,17 +671,17 @@ class OllamaVisionAgentCoder(VisionAgentCoder):
                 else planner
             ),
             coder=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if coder is None
                 else coder
             ),
             tester=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if tester is None
                 else tester
             ),
             debugger=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if debugger is None
                 else debugger
             ),
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_coder_v2.py
RENAMED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 from rich.console import Console
 from rich.markup import escape
 
-import vision_agent.tools as T
+import vision_agent.tools.tools as T
 from vision_agent.agent import AgentCoder, AgentPlanner
 from vision_agent.agent.agent_utils import (
     DefaultImports,
@@ -34,7 +34,7 @@ from vision_agent.utils.execute import (
     CodeInterpreterFactory,
     Execution,
 )
-from vision_agent.utils.sim import Sim
+from vision_agent.utils.sim import Sim, get_tool_recommender
 
 _CONSOLE = Console()
 
@@ -316,7 +316,7 @@ class VisionAgentCoderV2(AgentCoder):
         elif isinstance(tool_recommender, Sim):
             self.tool_recommender = tool_recommender
         else:
-            self.tool_recommender =
+            self.tool_recommender = get_tool_recommender()
 
         self.verbose = verbose
         self.code_sandbox_runtime = code_sandbox_runtime
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner.py
RENAMED
@@ -532,7 +532,7 @@ class OllamaVisionAgentPlanner(VisionAgentPlanner):
     ) -> None:
         super().__init__(
             planner=(
-                OllamaLMM(model_name="llama3.
+                OllamaLMM(model_name="llama3.2-vision", temperature=0.0)
                 if planner is None
                 else planner
             ),
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
@@ -62,10 +62,10 @@ plan2:
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
 image = load_image("image.jpg")
 owl_v2_out = owl_v2_image("person", image)
 
@@ -73,9 +73,9 @@ f2s2_out = florence2_sam2_image("person", image)
 # strip out the masks from the output becuase they don't provide useful information when printed
 f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
-cgd_out =
+cgd_out = countgd_object_detection("person", image)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
 print(final_out)
 --- END EXAMPLE1 ---
 
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
@@ -440,16 +440,17 @@ PICK_PLAN = """
 """
 
 CATEGORIZE_TOOL_REQUEST = """
-You are given a task: {task} from the user.
+You are given a task: "{task}" from the user. You must extract the type of category this task belongs to, it can be one or more of the following:
 - "object detection and counting" - detecting objects or counting objects from a text prompt in an image or video.
 - "classification" - classifying objects in an image given a text prompt.
 - "segmentation" - segmenting objects in an image or video given a text prompt.
 - "OCR" - extracting text from an image.
 - "VQA" - answering questions about an image or video, can also be used for text extraction.
+- "DocQA" - answering questions about a document or extracting information from a document.
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
 
-Return the category or categories (comma separated) inside tags <category># your categories here</category>.
+Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
 """
 
 TEST_TOOLS = """
@@ -473,7 +474,7 @@ TEST_TOOLS = """
 {examples}
 
 **Instructions**:
-1. List all the tools under **Tools** and the user request. Write a program to load the media and call
+1. List all the tools under **Tools** and the user request. Write a program to load the media and call the most relevant tools in parallel and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. Print this final dictionary.
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/__init__.py
RENAMED
@@ -43,7 +43,6 @@ from .tools import (
     flux_image_inpainting,
     generate_pose_image,
     get_tool_documentation,
-    get_tool_recommender,
     gpt4o_image_vqa,
     gpt4o_video_vqa,
     load_image,
@@ -63,6 +62,7 @@ from .tools import (
     save_json,
     save_video,
     siglip_classification,
+    stella_embeddings,
     template_match,
     video_temporal_localization,
     vit_image_classification,
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/planner_tools.py
RENAMED
@@ -32,6 +32,7 @@ from vision_agent.utils.execute import (
     MimeType,
 )
 from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.utils.sim import get_tool_recommender
 
 TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
 
@@ -116,13 +117,11 @@ def run_tool_testing(
     query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
     category = extract_tag(query, "category")  # type: ignore
     if category is None:
-
+        query = task
     else:
-
-        f"I need models from the {category.strip()} category of tools. {task}"
-        )
+        query = f"{category.strip()}. {task}"
 
-    tool_docs =
+    tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
     if exclude_tools is not None and len(exclude_tools) > 0:
         cleaned_tool_docs = []
         for tool_doc in tool_docs:
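To make the retrieval change above easier to follow, here is a hedged sketch of the categorize-then-retrieve flow: an LMM labels the task with a category, the category prefixes the task, and the combined query goes to the tool recommender's `top_k`. The import paths for `AnthropicLMM` and `CATEGORIZE_TOOL_REQUEST`, the regex stand-in for the library's `extract_tag` helper, and the sample task are assumptions; only the shapes of the calls come from the hunk above.

```python
# Hedged sketch (not the library's exact code) of the new query-building flow.
import re

from vision_agent.agent.vision_agent_planner_prompts_v2 import CATEGORIZE_TOOL_REQUEST
from vision_agent.lmm import AnthropicLMM  # assumed import path
from vision_agent.utils.sim import get_tool_recommender

task = "Count the number of people in people.jpg"  # hypothetical user task

lmm = AnthropicLMM()
raw = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))

# Stand-in for the library's extract_tag(raw, "category") helper.
match = re.search(r"<category>(.*?)</category>", raw, re.DOTALL)
category = match.group(1) if match else None

# Fall back to the raw task when no category was extracted, otherwise prefix it.
query = task if category is None else f"{category.strip()}. {task}"

# k=5 and thresh=0.3 mirror the values used in the hunk above.
tool_docs = get_tool_recommender().top_k(query, k=5, thresh=0.3)
print(query)
print(tool_docs)
```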
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/tools/tools.py
RENAMED
@@ -7,7 +7,6 @@ import urllib.request
 from base64 import b64encode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from enum import Enum
-from functools import lru_cache
 from importlib import resources
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -49,7 +48,6 @@ from vision_agent.utils.image_utils import (
     rle_decode,
     rle_decode_array,
 )
-from vision_agent.utils.sim import Sim, load_cached_sim
 from vision_agent.utils.video import (
     extract_frames_from_video,
     frames_to_bytes,
@@ -85,11 +83,6 @@ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
 _LOGGER = logging.getLogger(__name__)
 
 
-@lru_cache(maxsize=1)
-def get_tool_recommender() -> Sim:
-    return load_cached_sim(TOOLS_DF)
-
-
 def _display_tool_trace(
     function_name: str,
     request: Dict[str, Any],
@@ -2178,13 +2171,14 @@ def document_qa(
     prompt: str,
     image: np.ndarray,
 ) -> str:
-    """'document_qa' is a tool that can answer any questions about arbitrary
-
-
+    """'document_qa' is a tool that can answer any questions about arbitrary documents,
+    presentations, or tables. It's very useful for document QA tasks, you can ask it a
+    specific question or ask it to return a JSON object answering multiple questions
+    about the document.
 
     Parameters:
-        prompt (str): The question to be answered about the document image
-        image (np.ndarray): The document image to analyze
+        prompt (str): The question to be answered about the document image.
+        image (np.ndarray): The document image to analyze.
 
     Returns:
         str: The answer to the question based on the document's context.
@@ -2203,7 +2197,7 @@ def document_qa(
         "model": "document-analysis",
     }
 
-    data:
+    data: Dict[str, Any] = send_inference_request(
         payload=payload,
         endpoint_name="document-analysis",
         files=files,
@@ -2225,10 +2219,10 @@ def document_qa(
     data = normalize(data)
 
     prompt = f"""
-
-
-
-
+    Document Context:
+    {data}\n
+    Question: {prompt}\n
+    Answer the question directly using only the information from the document, do not answer with any additional text besides the answer. If the answer is not definitively contained in the document, say "I cannot find the answer in the provided document."
     """
 
     lmm = AnthropicLMM()
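For reference, a minimal hedged sketch of calling the reworked `document_qa` tool end to end. The sample file name and question are placeholders, and the call assumes access to the landing.ai inference endpoint plus an Anthropic key for the final answer step; only the `document_qa(prompt, image)` signature comes from the diff.

```python
# Minimal usage sketch for document_qa; "annual_report.png" is a placeholder path.
from vision_agent.tools.tools import document_qa, load_image

image = load_image("annual_report.png")  # hypothetical scanned document
answer = document_qa("What was the total revenue in 2024?", image)
print(answer)
```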
@@ -2245,6 +2239,22 @@ def document_qa(
     return llm_output
 
 
+def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
+    payload = {
+        "input": prompts,
+        "model": "stella1.5b",
+    }
+
+    data: Dict[str, Any] = send_inference_request(
+        payload=payload,
+        endpoint_name="embeddings",
+        v2=True,
+        metadata_payload={"function_name": "get_embeddings"},
+        is_form=True,
+    )
+    return [d["embedding"] for d in data]  # type: ignore
+
+
 # Utility and visualization functions
 
 
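Since `stella_embeddings` returns one embedding per prompt, a natural sanity check is to compare two prompts with cosine similarity, the same metric `Sim.top_k` uses. This is only an illustrative sketch: the sample prompts are placeholders, the call requires network access to the inference endpoint behind `send_inference_request`, and the vector dimensionality is not specified in this diff.

```python
# Illustrative sketch: compare two prompts with the new stella_embeddings tool.
import numpy as np
from scipy.spatial.distance import cosine

from vision_agent.tools import stella_embeddings

prompts = [
    "detect and count people in an image",
    "segment objects in a video given a text prompt",
]
embs = stella_embeddings(prompts)  # one embedding vector per prompt

similarity = 1 - cosine(np.asarray(embs[0]), np.asarray(embs[1]))
print(f"cosine similarity: {similarity:.3f}")
```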
@@ -2781,6 +2791,7 @@ FUNCTION_TOOLS = [
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     document_extraction,
+    document_qa,
     video_temporal_localization,
     flux_image_inpainting,
     siglip_classification,
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/execute.py
RENAMED
@@ -28,10 +28,10 @@ from nbclient import __version__ as nbclient_version
 from nbclient.exceptions import CellTimeoutError, DeadKernelError
 from nbclient.util import run_sync
 from nbformat.v4 import new_code_cell
+from opentelemetry.context import get_current
+from opentelemetry.trace import SpanKind, Status, StatusCode, get_tracer
 from pydantic import BaseModel, field_serializer
 from typing_extensions import Self
-from opentelemetry.trace import get_tracer, Status, StatusCode, SpanKind
-from opentelemetry.context import get_current
 
 from vision_agent.utils.exceptions import (
     RemoteSandboxCreationError,
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/image_utils.py
RENAMED
@@ -11,7 +11,7 @@ import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType
 
-from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.video import extract_frames_from_video
 
 COLORS = [
     (158, 218, 229),
{vision_agent-0.2.218 → vision_agent-0.2.220}/vision_agent/utils/sim.py
RENAMED
@@ -12,6 +12,13 @@ import requests
 from openai import AzureOpenAI, OpenAI
 from scipy.spatial.distance import cosine  # type: ignore
 
+from vision_agent.tools.tools import TOOLS_DF, stella_embeddings
+
+
+@lru_cache(maxsize=1)
+def get_tool_recommender() -> "Sim":
+    return load_cached_sim(TOOLS_DF)
+
 
 @lru_cache(maxsize=512)
 def get_embedding(
@@ -27,13 +34,13 @@ def load_cached_sim(
     cached_dir_full_path = str(resources.files("vision_agent") / cached_dir)
     if os.path.exists(cached_dir_full_path):
         if tools_df is not None:
-            if
+            if StellaSim.check_load(cached_dir_full_path, tools_df):
                 # don't pass sim_key to loaded Sim object or else it will re-calculate embeddings
-                return
+                return StellaSim.load(cached_dir_full_path)
     if os.path.exists(cached_dir_full_path):
         shutil.rmtree(cached_dir_full_path)
 
-    sim =
+    sim = StellaSim(tools_df, sim_key=sim_key)
     sim.save(cached_dir_full_path)
     return sim
 
@@ -58,6 +65,11 @@ class Sim:
         """
         self.df = df
         self.client = OpenAI(api_key=api_key)
+        self.emb_call = (
+            lambda x: self.client.embeddings.create(input=x, model=model)
+            .data[0]
+            .embedding
+        )
         self.model = model
         if "embs" not in df.columns and sim_key is None:
             raise ValueError("key is required if no column 'embs' is present.")
@@ -65,11 +77,7 @@ class Sim:
         if sim_key is not None:
             self.df["embs"] = self.df[sim_key].apply(
                 lambda x: get_embedding(
-
-                    input=text, model=self.model
-                )
-                .data[0]
-                .embedding,
+                    self.emb_call,
                     x,
                 )
             )
@@ -126,9 +134,7 @@ class Sim:
         """
 
         embedding = get_embedding(
-
-            .data[0]
-            .embedding,
+            self.emb_call,
             query,
         )
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
@@ -215,6 +221,40 @@ class OllamaSim(Sim):
         )
 
 
+class StellaSim(Sim):
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        sim_key: Optional[str] = None,
+    ) -> None:
+        self.df = df
+
+        def emb_call(text: List[str]) -> List[float]:
+            return stella_embeddings(text)[0]  # type: ignore
+
+        self.emb_call = emb_call
+
+        if "embs" not in df.columns and sim_key is None:
+            raise ValueError("key is required if no column 'embs' is present.")
+
+        if sim_key is not None:
+            self.df["embs"] = self.df[sim_key].apply(
+                lambda x: get_embedding(emb_call, x)
+            )
+
+    @staticmethod
+    def load(
+        load_dir: Union[str, Path],
+        api_key: Optional[str] = None,
+        model: str = "stella1.5b",
+    ) -> "StellaSim":
+        load_dir = Path(load_dir)
+        df = pd.read_csv(load_dir / "df.csv")
+        embs = np.load(load_dir / "embs.npy")
+        df["embs"] = list(embs)
+        return StellaSim(df)
+
+
 def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
     return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
 
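Taken together, the sim.py changes mean the cached tool index is now a `StellaSim` and is obtained through the memoized `get_tool_recommender()`. A hedged usage sketch follows; the query text is arbitrary, `k=5`/`thresh=0.3` mirror the planner_tools hunk, and the exact shape of the rows `top_k` yields (presumably the desc/doc/name columns of df.csv) is not shown in this diff.

```python
# Hedged sketch: retrieve candidate tools from the cached Stella-backed index.
from vision_agent.utils.sim import get_tool_recommender

recommender = get_tool_recommender()  # lru_cache'd StellaSim built over TOOLS_DF
matches = recommender.top_k(
    "answer questions about a scanned PDF document", k=5, thresh=0.3
)

# df.csv carries desc/doc/name columns, so each match should describe one tool.
for match in matches:
    print(match)
```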
vision_agent-0.2.218/vision_agent/.sim_tools/embs.npy
Binary file