vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +46 -47
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +0 -16
- vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
- vision_agent/agent/vision_agent_planner_v2.py +3 -2
- vision_agent/configs/anthropic_config.py +29 -16
- vision_agent/configs/config.py +14 -15
- vision_agent/configs/openai_config.py +10 -10
- vision_agent/lmm/lmm.py +2 -2
- vision_agent/tools/__init__.py +0 -6
- vision_agent/tools/meta_tools.py +1 -492
- vision_agent/tools/planner_tools.py +13 -14
- vision_agent/tools/tools.py +16 -27
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/METADATA +31 -3
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/RECORD +17 -24
- vision_agent/agent/vision_agent.py +0 -605
- vision_agent/agent/vision_agent_coder.py +0 -742
- vision_agent/agent/vision_agent_coder_prompts.py +0 -290
- vision_agent/agent/vision_agent_planner.py +0 -564
- vision_agent/agent/vision_agent_planner_prompts.py +0 -199
- vision_agent/agent/vision_agent_prompts.py +0 -312
- vision_agent/configs/anthropic_openai_config.py +0 -164
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -1488,8 +1488,8 @@ def agentic_object_detection(
|
|
1488
1488
|
"""'agentic_object_detection' is a tool that can detect multiple objects given a
|
1489
1489
|
text prompt such as object names or referring expressions on images. It's
|
1490
1490
|
particularly good at detecting specific objects given detailed descriptive prompts
|
1491
|
-
but runs slower. It returns a list of bounding boxes
|
1492
|
-
label names and associated
|
1491
|
+
but runs slower so not ideal for high counts. It returns a list of bounding boxes
|
1492
|
+
with normalized coordinates, label names and associated confidence score of 1.0.
|
1493
1493
|
|
1494
1494
|
Parameters:
|
1495
1495
|
prompt (str): The prompt to ground to the image, only supports a single prompt
|
@@ -1533,8 +1533,9 @@ def agentic_sam2_instance_segmentation(
|
|
1533
1533
|
"""'agentic_sam2_instance_segmentation' is a tool that can detect multiple
|
1534
1534
|
instances given a text prompt such as object names or referring expressions on
|
1535
1535
|
images. It's particularly good at detecting specific objects given detailed
|
1536
|
-
descriptive prompts but runs slower. It returns a list
|
1537
|
-
normalized coordinates, label names, masks and associated
|
1536
|
+
descriptive prompts but runs slower so not ideal for high counts. It returns a list
|
1537
|
+
of bounding boxes with normalized coordinates, label names, masks and associated
|
1538
|
+
confidence score of 1.0.
|
1538
1539
|
|
1539
1540
|
Parameters:
|
1540
1541
|
prompt (str): The object that needs to be counted, only supports a single
|
@@ -1591,9 +1592,9 @@ def agentic_sam2_video_tracking(
|
|
1591
1592
|
"""'agentic_sam2_video_tracking' is a tool that can track and segment multiple
|
1592
1593
|
objects in a video given a text prompt such as object names or referring
|
1593
1594
|
expressions. It's particularly good at detecting specific objects given detailed
|
1594
|
-
descriptive prompts but runs slower
|
1595
|
-
names, masks and associated
|
1596
|
-
counting without duplicating counts.
|
1595
|
+
descriptive prompts but runs slower so not ideal for high counts. It returns a list
|
1596
|
+
of bounding boxes, label names, masks and associated confidence score of 1.0 and is
|
1597
|
+
useful for tracking and counting without duplicating counts.
|
1597
1598
|
|
1598
1599
|
Parameters:
|
1599
1600
|
prompt (str): The prompt to ground to the image, only supports a single prompt
|
@@ -2307,22 +2308,10 @@ def _qwenvl_activity_recognition(
|
|
2307
2308
|
return [0.0] * len(segment)
|
2308
2309
|
|
2309
2310
|
|
2310
|
-
def _qwen2vl_activity_recognition(
|
2311
|
-
segment: List[np.ndarray], prompt: str
|
2312
|
-
) -> List[float]:
|
2313
|
-
return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
|
2314
|
-
|
2315
|
-
|
2316
|
-
def _qwen25vl_activity_recognition(
|
2317
|
-
segment: List[np.ndarray], prompt: str
|
2318
|
-
) -> List[float]:
|
2319
|
-
return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
|
2320
|
-
|
2321
|
-
|
2322
2311
|
def activity_recognition(
|
2323
2312
|
prompt: str,
|
2324
2313
|
frames: List[np.ndarray],
|
2325
|
-
model: str = "
|
2314
|
+
model: str = "qwen25vl",
|
2326
2315
|
chunk_length_frames: int = 10,
|
2327
2316
|
) -> List[float]:
|
2328
2317
|
"""'activity_recognition' is a tool that can recognize activities in a video given a
|
@@ -2371,12 +2360,12 @@ def activity_recognition(
|
|
2371
2360
|
elif model == "qwen2vl":
|
2372
2361
|
|
2373
2362
|
def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
|
2374
|
-
return
|
2363
|
+
return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
|
2375
2364
|
|
2376
2365
|
elif model == "qwen25vl":
|
2377
2366
|
|
2378
2367
|
def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
|
2379
|
-
return
|
2368
|
+
return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
|
2380
2369
|
|
2381
2370
|
else:
|
2382
2371
|
raise ValueError(f"Invalid model: {model}")
|
@@ -3488,9 +3477,9 @@ def _plot_counting(
|
|
3488
3477
|
|
3489
3478
|
|
3490
3479
|
FUNCTION_TOOLS = [
|
3491
|
-
|
3492
|
-
|
3493
|
-
|
3480
|
+
glee_object_detection,
|
3481
|
+
glee_sam2_instance_segmentation,
|
3482
|
+
glee_sam2_video_tracking,
|
3494
3483
|
countgd_object_detection,
|
3495
3484
|
countgd_sam2_instance_segmentation,
|
3496
3485
|
countgd_sam2_video_tracking,
|
@@ -3502,8 +3491,8 @@ FUNCTION_TOOLS = [
|
|
3502
3491
|
document_extraction,
|
3503
3492
|
document_qa,
|
3504
3493
|
ocr,
|
3505
|
-
|
3506
|
-
|
3494
|
+
qwen25_vl_images_vqa,
|
3495
|
+
qwen25_vl_video_vqa,
|
3507
3496
|
activity_recognition,
|
3508
3497
|
depth_anything_v2,
|
3509
3498
|
generate_pose_image,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.7
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -65,10 +65,10 @@ pip install vision-agent
|
|
65
65
|
|
66
66
|
```bash
|
67
67
|
export ANTHROPIC_API_KEY="your-api-key"
|
68
|
-
export
|
68
|
+
export GEMINI_API_KEY="your-api-key"
|
69
69
|
```
|
70
70
|
|
71
|
-
> **_NOTE:_** We found using both Anthropic Claude-3.
|
71
|
+
> **_NOTE:_** We found that using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp provides the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
|
72
72
|
|
73
73
|
You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
|
74
74
|
|
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
|
|
147
147
|
cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
|
148
148
|
```
|
149
149
|
|
150
|
+
You can also modify the existing `config.py` file yourself to use a different LLM
|
151
|
+
provider. For example, if you wanted to change the planner from Anthropic inside
|
152
|
+
`config.py` to OpenAI you would replace this code:
|
153
|
+
```python
|
154
|
+
planner: Type[LMM] = Field(default=AnthropicLMM)
|
155
|
+
planner_kwargs: dict = Field(
|
156
|
+
default_factory=lambda: {
|
157
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
158
|
+
"temperature": 0.0,
|
159
|
+
"image_size": 768,
|
160
|
+
}
|
161
|
+
)
|
162
|
+
```
|
163
|
+
|
164
|
+
with this code:
|
165
|
+
|
166
|
+
```python
|
167
|
+
planner: Type[LMM] = Field(default=OpenAILMM)
|
168
|
+
planner_kwargs: dict = Field(
|
169
|
+
default_factory=lambda: {
|
170
|
+
"model_name": "gpt-4o-2024-11-20",
|
171
|
+
"temperature": 0.0,
|
172
|
+
"image_size": 768,
|
173
|
+
"image_detail": "low",
|
174
|
+
}
|
175
|
+
)
|
176
|
+
```
|
177
|
+
|
150
178
|
> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
|
151
179
|
|
@@ -1,43 +1,36 @@
|
|
1
|
-
vision_agent/.sim_tools/df.csv,sha256=
|
2
|
-
vision_agent/.sim_tools/embs.npy,sha256=
|
1
|
+
vision_agent/.sim_tools/df.csv,sha256=jCyBDlLxI9_yAxzLZcoN2BPpveF1yh29AlfdSAGTZ4A,40842
|
2
|
+
vision_agent/.sim_tools/embs.npy,sha256=QN8Ojc0Mv4_OS6WA4elvBhXTDHcpx2g1pLxsGqk4IQU,245888
|
3
3
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
4
4
|
vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
|
5
|
-
vision_agent/agent/__init__.py,sha256=
|
5
|
+
vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
|
6
6
|
vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
|
7
|
-
vision_agent/agent/vision_agent.py,sha256=lLNIapK7lhbGSPdF2RLIh7wBNZMGiM60kFaLXkgXV8g,23465
|
8
|
-
vision_agent/agent/vision_agent_coder.py,sha256=SQy83t3b2vXFiYV4ynlT-g7AkUIP8zy03culkAuHMHA,27353
|
9
|
-
vision_agent/agent/vision_agent_coder_prompts.py,sha256=D4RJxTWoxpl-WtYRvHNxaLSdWVHsdYb0jJIQ2ZCGU0A,12277
|
10
7
|
vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
|
11
8
|
vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
|
12
|
-
vision_agent/agent/
|
13
|
-
vision_agent/agent/
|
14
|
-
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=TiiF5BGnFVraFlQnDaeRU67927LvszvpcMUOgVgo0ps,35843
|
15
|
-
vision_agent/agent/vision_agent_planner_v2.py,sha256=FuvTDe1mfznh4lkDGUyXa6yip8zQV5TDSxUG5zrsWc0,22010
|
16
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=KaJwYPUP7_GvQsCPPs6Fdawmi3AQWmWajBUuzj7gTG4,13812
|
9
|
+
vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=YARVphHKLMNUqCeOsrManvgecl77RP1g51vtt7JpdWk,35937
|
10
|
+
vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
|
17
11
|
vision_agent/agent/vision_agent_prompts_v2.py,sha256=OnHmmteAj8__2spEXNcckApQvEpkK_JIZd_SWzEyg9c,4230
|
18
12
|
vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
|
19
13
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
14
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
21
15
|
vision_agent/configs/__init__.py,sha256=Iu75-w9_nlPmnB_qKA7nYaaaHf7xtTrDmK8N4v2WV34,27
|
22
|
-
vision_agent/configs/anthropic_config.py,sha256=
|
23
|
-
vision_agent/configs/
|
24
|
-
vision_agent/configs/
|
25
|
-
vision_agent/configs/openai_config.py,sha256=v2_AIY89d7LKWn4uqA2G047U2IdmnqZrGH2Iww9gRIw,4498
|
16
|
+
vision_agent/configs/anthropic_config.py,sha256=FBW3PsrZn4gJXhr-v1GGayuJuBPafzDz2Q4vyI_rBzA,4692
|
17
|
+
vision_agent/configs/config.py,sha256=FjpYViBJHMizgoZ3dp7T4Lbhbh1MHHstaG7icVdVsrU,4694
|
18
|
+
vision_agent/configs/openai_config.py,sha256=Bw7ElBYmBcaZttyRBoNpcy3uTkqg5qADk8LP0wse2NQ,4498
|
26
19
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
20
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
28
21
|
vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
|
29
|
-
vision_agent/lmm/lmm.py,sha256=
|
22
|
+
vision_agent/lmm/lmm.py,sha256=utGJMeGEKImqHrY0q9kGu0uK3owG8wKyDustwrDrLto,19421
|
30
23
|
vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
|
31
24
|
vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
|
32
25
|
vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
|
33
26
|
vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
34
27
|
vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
|
35
28
|
vision_agent/sim/sim.py,sha256=8y060Ck7qOFJDw9k9Vl2xQYbLkNaTd6lP1YzbcwkkXc,9944
|
36
|
-
vision_agent/tools/__init__.py,sha256=
|
37
|
-
vision_agent/tools/meta_tools.py,sha256=
|
38
|
-
vision_agent/tools/planner_tools.py,sha256=
|
29
|
+
vision_agent/tools/__init__.py,sha256=H8M5v--cANBiOWvAfUJNj9cq9PKm_DjRrG1MeNRWpHs,2434
|
30
|
+
vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
|
31
|
+
vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
|
39
32
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
40
|
-
vision_agent/tools/tools.py,sha256
|
33
|
+
vision_agent/tools/tools.py,sha256=4gwL8EFMwm6l0MujftJ8G8BO2z8Dh_a4FPjy_xUmYqs,121889
|
41
34
|
vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
|
42
35
|
vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
|
43
36
|
vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
|
@@ -47,7 +40,7 @@ vision_agent/utils/tools.py,sha256=_XGcF-Zd527BWNkGopBZeJ4j5ehreAkNsKQSBFrlvjw,8
|
|
47
40
|
vision_agent/utils/tools_doc.py,sha256=yFue6KSXoa_Z1ngCdBEc4SdPZOWF1rVLeaHu02I8Wis,2523
|
48
41
|
vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
|
49
42
|
vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
|
50
|
-
vision_agent-1.0.
|
51
|
-
vision_agent-1.0.
|
52
|
-
vision_agent-1.0.
|
53
|
-
vision_agent-1.0.
|
43
|
+
vision_agent-1.0.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
44
|
+
vision_agent-1.0.7.dist-info/METADATA,sha256=aQSPKBzRC6Ai7_kZWAn72h2EsYpDVEHMcDAWCV2H-ho,6746
|
45
|
+
vision_agent-1.0.7.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
46
|
+
vision_agent-1.0.7.dist-info/RECORD,,
|