vision-agent 0.2.189__py3-none-any.whl → 0.2.191__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder.py +0 -3
- vision_agent/agent/vision_agent_planner.py +2 -10
- vision_agent/tools/__init__.py +5 -4
- vision_agent/tools/tools.py +49 -10
- {vision_agent-0.2.189.dist-info → vision_agent-0.2.191.dist-info}/METADATA +7 -11
- {vision_agent-0.2.189.dist-info → vision_agent-0.2.191.dist-info}/RECORD +8 -8
- {vision_agent-0.2.189.dist-info → vision_agent-0.2.191.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.189.dist-info → vision_agent-0.2.191.dist-info}/WHEEL +0 -0
@@ -527,9 +527,6 @@ class VisionAgentCoder(Agent):
|
|
527
527
|
[{"role": "user", "content": "describe your task here..."}].
|
528
528
|
plan_context (PlanContext): The context of the plan, including the plans,
|
529
529
|
best_plan, plan_thoughts, tool_doc, and tool_output.
|
530
|
-
test_multi_plan (bool): Whether to test multiple plans or just the best plan.
|
531
|
-
custom_tool_names (Optional[List[str]]): A list of custom tool names to use
|
532
|
-
for the planner.
|
533
530
|
|
534
531
|
Returns:
|
535
532
|
Dict[str, Any]: A dictionary containing the code output by the
|
@@ -519,11 +519,7 @@ class OpenAIVisionAgentPlanner(VisionAgentPlanner):
|
|
519
519
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
520
520
|
) -> None:
|
521
521
|
super().__init__(
|
522
|
-
planner=(
|
523
|
-
OpenAILMM(temperature=0.0, json_mode=True)
|
524
|
-
if planner is None
|
525
|
-
else planner
|
526
|
-
),
|
522
|
+
planner=(OpenAILMM(temperature=0.0) if planner is None else planner),
|
527
523
|
tool_recommender=tool_recommender,
|
528
524
|
verbosity=verbosity,
|
529
525
|
report_progress_callback=report_progress_callback,
|
@@ -567,11 +563,7 @@ class AzureVisionAgentPlanner(VisionAgentPlanner):
|
|
567
563
|
code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
|
568
564
|
) -> None:
|
569
565
|
super().__init__(
|
570
|
-
planner=(
|
571
|
-
AzureOpenAILMM(temperature=0.0, json_mode=True)
|
572
|
-
if planner is None
|
573
|
-
else planner
|
574
|
-
),
|
566
|
+
planner=(AzureOpenAILMM(temperature=0.0) if planner is None else planner),
|
575
567
|
tool_recommender=(
|
576
568
|
AzureSim(T.TOOLS_DF, sim_key="desc")
|
577
569
|
if tool_recommender is None
|
vision_agent/tools/__init__.py
CHANGED
@@ -40,6 +40,7 @@ from .tools import (
|
|
40
40
|
florence2_roberta_vqa,
|
41
41
|
florence2_sam2_image,
|
42
42
|
florence2_sam2_video_tracking,
|
43
|
+
flux_image_inpainting,
|
43
44
|
generate_pose_image,
|
44
45
|
generate_soft_edge_image,
|
45
46
|
get_tool_documentation,
|
@@ -59,16 +60,16 @@ from .tools import (
|
|
59
60
|
overlay_segmentation_masks,
|
60
61
|
owl_v2_image,
|
61
62
|
owl_v2_video,
|
63
|
+
qwen2_vl_images_vqa,
|
64
|
+
qwen2_vl_video_vqa,
|
62
65
|
save_image,
|
63
66
|
save_json,
|
64
67
|
save_video,
|
68
|
+
siglip_classification,
|
65
69
|
template_match,
|
70
|
+
video_temporal_localization,
|
66
71
|
vit_image_classification,
|
67
72
|
vit_nsfw_classification,
|
68
|
-
qwen2_vl_images_vqa,
|
69
|
-
qwen2_vl_video_vqa,
|
70
|
-
video_temporal_localization,
|
71
|
-
flux_image_inpainting,
|
72
73
|
)
|
73
74
|
|
74
75
|
__new_tools__ = [
|
vision_agent/tools/tools.py
CHANGED
@@ -27,10 +27,7 @@ from vision_agent.tools.tool_utils import (
|
|
27
27
|
send_inference_request,
|
28
28
|
send_task_inference_request,
|
29
29
|
)
|
30
|
-
from vision_agent.tools.tools_types import (
|
31
|
-
JobStatus,
|
32
|
-
ODResponseData,
|
33
|
-
)
|
30
|
+
from vision_agent.tools.tools_types import JobStatus, ODResponseData
|
34
31
|
from vision_agent.utils.exceptions import FineTuneModelIsNotReady
|
35
32
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
36
33
|
from vision_agent.utils.image_utils import (
|
@@ -641,8 +638,8 @@ def loca_visual_prompt_counting(
|
|
641
638
|
|
642
639
|
Parameters:
|
643
640
|
image (np.ndarray): The image that contains lot of instances of a single object
|
644
|
-
|
645
|
-
|
641
|
+
visual_prompt (Dict[str, List[float]]): Bounding box of the object in
|
642
|
+
format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
|
646
643
|
|
647
644
|
Returns:
|
648
645
|
Dict[str, Any]: A dictionary containing the key 'count' and the count as a
|
@@ -750,10 +747,10 @@ def countgd_example_based_counting(
|
|
750
747
|
|
751
748
|
Parameters:
|
752
749
|
visual_prompts (List[List[float]]): Bounding boxes of the object in format
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
750
|
+
[xmin, ymin, xmax, ymax]. Upto 3 bounding boxes can be provided. image
|
751
|
+
(np.ndarray): The image that contains multiple instances of the object.
|
752
|
+
box_threshold (float, optional): The threshold for detection. Defaults to
|
753
|
+
0.23.
|
757
754
|
|
758
755
|
Returns:
|
759
756
|
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
|
@@ -1845,6 +1842,48 @@ def flux_image_inpainting(
|
|
1845
1842
|
return output_image
|
1846
1843
|
|
1847
1844
|
|
1845
|
+
def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
|
1846
|
+
"""'siglip_classification' is a tool that can classify an image or a cropped detection given a list
|
1847
|
+
of input labels or tags. It returns the same list of the input labels along with
|
1848
|
+
their probability scores based on image content.
|
1849
|
+
|
1850
|
+
Parameters:
|
1851
|
+
image (np.ndarray): The image to classify or tag
|
1852
|
+
labels (List[str]): The list of labels or tags that is associated with the image
|
1853
|
+
|
1854
|
+
Returns:
|
1855
|
+
Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
|
1856
|
+
contains a list of given labels and other a list of scores.
|
1857
|
+
|
1858
|
+
Example
|
1859
|
+
-------
|
1860
|
+
>>> siglip_classification(image, ['dog', 'cat', 'bird'])
|
1861
|
+
{"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
|
1862
|
+
"""
|
1863
|
+
|
1864
|
+
if image.shape[0] < 1 or image.shape[1] < 1:
|
1865
|
+
return {"labels": [], "scores": []}
|
1866
|
+
|
1867
|
+
image_file = numpy_to_bytes(image)
|
1868
|
+
|
1869
|
+
files = [("image", image_file)]
|
1870
|
+
|
1871
|
+
payload = {
|
1872
|
+
"model": "siglip",
|
1873
|
+
"labels": labels,
|
1874
|
+
}
|
1875
|
+
|
1876
|
+
response: dict[str, Any] = send_inference_request(
|
1877
|
+
payload=payload,
|
1878
|
+
endpoint_name="classification",
|
1879
|
+
files=files,
|
1880
|
+
v2=True,
|
1881
|
+
metadata_payload={"function_name": "siglip_classification"},
|
1882
|
+
)
|
1883
|
+
|
1884
|
+
return response
|
1885
|
+
|
1886
|
+
|
1848
1887
|
# Utility and visualization functions
|
1849
1888
|
|
1850
1889
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.189
|
3
|
+
Version: 0.2.191
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -54,11 +54,7 @@ Description-Content-Type: text/markdown
|
|
54
54
|
</div>
|
55
55
|
|
56
56
|
VisionAgent is a library that helps you utilize agent frameworks to generate code to
|
57
|
-
solve your vision task. Many current vision problems can easily take hours or days to
|
58
|
-
solve, you need to find the right model, figure out how to use it and program it to
|
59
|
-
accomplish the task you want. VisionAgent aims to provide an in-seconds experience by
|
60
|
-
allowing users to describe their problem in text and have the agent framework generate
|
61
|
-
code to solve the task for them. Check out our discord for updates and roadmaps!
|
57
|
+
solve your vision task. Check out our discord for updates and roadmaps!
|
62
58
|
|
63
59
|
## Table of Contents
|
64
60
|
- [🚀Quick Start](#quick-start)
|
@@ -82,19 +78,19 @@ To get started with the python library, you can install it using pip:
|
|
82
78
|
pip install vision-agent
|
83
79
|
```
|
84
80
|
|
85
|
-
Ensure you have an Anthropic key and an OpenAI API key and set in your environment
|
81
|
+
Ensure you have both an Anthropic key and an OpenAI API key and set in your environment
|
86
82
|
variables (if you are using Azure OpenAI please see the Azure setup section):
|
87
83
|
|
88
84
|
```bash
|
89
|
-
export ANTHROPIC_API_KEY="your-api-key"
|
90
|
-
export OPENAI_API_KEY="your-api-key"
|
85
|
+
export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
|
86
|
+
export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
|
91
87
|
```
|
92
88
|
|
93
89
|
### Basic Usage
|
94
90
|
To get started you can just import the `VisionAgent` and start chatting with it:
|
95
91
|
```python
|
96
92
|
>>> from vision_agent.agent import VisionAgent
|
97
|
-
>>> agent = VisionAgent()
|
93
|
+
>>> agent = VisionAgent(verbosity=2)
|
98
94
|
>>> resp = agent("Hello")
|
99
95
|
>>> print(resp)
|
100
96
|
[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
|
@@ -103,7 +99,7 @@ To get started you can just import the `VisionAgent` and start chatting with it:
|
|
103
99
|
```
|
104
100
|
|
105
101
|
The chat messages are similar to `OpenAI`'s format with `role` and `content` keys but
|
106
|
-
in addition to those you can add `
|
102
|
+
in addition to those you can add `media` which is a list of media files that can either
|
107
103
|
be images or video files.
|
108
104
|
|
109
105
|
## Documentation
|
@@ -3,9 +3,9 @@ vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xl
|
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
|
5
5
|
vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=7Ko1c41dvdDbSP_Yw2yz_SlE3sO6hhlpf_oCjW0we2w,31749
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
|
-
vision_agent/agent/vision_agent_planner.py,sha256=
|
8
|
+
vision_agent/agent/vision_agent_planner.py,sha256=Hy4vKiae7zIIKVPgLetGArbsjGRNVOXlxY9xhFgW-A0,18871
|
9
9
|
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
10
10
|
vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
|
11
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
16
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
17
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
18
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
19
|
-
vision_agent/tools/__init__.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=UrpGFB1ACOZZCAyj8vNw0IHhKm9wGp0qHOtci2cqAMU,2825
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=72Ml8kxtaqIqrh4hiZQ81f5Mrl-7z-W1a6bCjIMBvoA,84952
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
32
|
-
vision_agent-0.2.189.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.189.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
-
vision_agent-0.2.189.dist-info/RECORD,,
|
32
|
+
vision_agent-0.2.191.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.191.dist-info/METADATA,sha256=eZGSUWuHBTmyStliR_oxFyoWMeLW0_0qP2ULx8y_-1E,18067
|
34
|
+
vision_agent-0.2.191.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.191.dist-info/RECORD,,
|
File without changes
|
File without changes
|