vision-agent 0.2.189__py3-none-any.whl → 0.2.191__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -527,9 +527,6 @@ class VisionAgentCoder(Agent):
527
527
  [{"role": "user", "content": "describe your task here..."}].
528
528
  plan_context (PlanContext): The context of the plan, including the plans,
529
529
  best_plan, plan_thoughts, tool_doc, and tool_output.
530
- test_multi_plan (bool): Whether to test multiple plans or just the best plan.
531
- custom_tool_names (Optional[List[str]]): A list of custom tool names to use
532
- for the planner.
533
530
 
534
531
  Returns:
535
532
  Dict[str, Any]: A dictionary containing the code output by the
@@ -519,11 +519,7 @@ class OpenAIVisionAgentPlanner(VisionAgentPlanner):
519
519
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
520
520
  ) -> None:
521
521
  super().__init__(
522
- planner=(
523
- OpenAILMM(temperature=0.0, json_mode=True)
524
- if planner is None
525
- else planner
526
- ),
522
+ planner=(OpenAILMM(temperature=0.0) if planner is None else planner),
527
523
  tool_recommender=tool_recommender,
528
524
  verbosity=verbosity,
529
525
  report_progress_callback=report_progress_callback,
@@ -567,11 +563,7 @@ class AzureVisionAgentPlanner(VisionAgentPlanner):
567
563
  code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
568
564
  ) -> None:
569
565
  super().__init__(
570
- planner=(
571
- AzureOpenAILMM(temperature=0.0, json_mode=True)
572
- if planner is None
573
- else planner
574
- ),
566
+ planner=(AzureOpenAILMM(temperature=0.0) if planner is None else planner),
575
567
  tool_recommender=(
576
568
  AzureSim(T.TOOLS_DF, sim_key="desc")
577
569
  if tool_recommender is None
@@ -40,6 +40,7 @@ from .tools import (
40
40
  florence2_roberta_vqa,
41
41
  florence2_sam2_image,
42
42
  florence2_sam2_video_tracking,
43
+ flux_image_inpainting,
43
44
  generate_pose_image,
44
45
  generate_soft_edge_image,
45
46
  get_tool_documentation,
@@ -59,16 +60,16 @@ from .tools import (
59
60
  overlay_segmentation_masks,
60
61
  owl_v2_image,
61
62
  owl_v2_video,
63
+ qwen2_vl_images_vqa,
64
+ qwen2_vl_video_vqa,
62
65
  save_image,
63
66
  save_json,
64
67
  save_video,
68
+ siglip_classification,
65
69
  template_match,
70
+ video_temporal_localization,
66
71
  vit_image_classification,
67
72
  vit_nsfw_classification,
68
- qwen2_vl_images_vqa,
69
- qwen2_vl_video_vqa,
70
- video_temporal_localization,
71
- flux_image_inpainting,
72
73
  )
73
74
 
74
75
  __new_tools__ = [
@@ -27,10 +27,7 @@ from vision_agent.tools.tool_utils import (
27
27
  send_inference_request,
28
28
  send_task_inference_request,
29
29
  )
30
- from vision_agent.tools.tools_types import (
31
- JobStatus,
32
- ODResponseData,
33
- )
30
+ from vision_agent.tools.tools_types import JobStatus, ODResponseData
34
31
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
35
32
  from vision_agent.utils.execute import FileSerializer, MimeType
36
33
  from vision_agent.utils.image_utils import (
@@ -641,8 +638,8 @@ def loca_visual_prompt_counting(
641
638
 
642
639
  Parameters:
643
640
  image (np.ndarray): The image that contains a lot of instances of a single object
644
- visual_prompt (Dict[str, List[float]]): Bounding box of the object in format
645
- [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
641
+ visual_prompt (Dict[str, List[float]]): Bounding box of the object in
642
+ format [xmin, ymin, xmax, ymax]. Only 1 bounding box can be provided.
646
643
 
647
644
  Returns:
648
645
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
@@ -750,10 +747,10 @@ def countgd_example_based_counting(
750
747
 
751
748
  Parameters:
752
749
  visual_prompts (List[List[float]]): Bounding boxes of the object in format
753
- [xmin, ymin, xmax, ymax]. Upto 3 bounding boxes can be provided.
754
- image (np.ndarray): The image that contains multiple instances of the object.
755
- box_threshold (float, optional): The threshold for detection. Defaults
756
- to 0.23.
750
+ [xmin, ymin, xmax, ymax]. Up to 3 bounding boxes can be provided.
751
+ image (np.ndarray): The image that contains multiple instances of the object.
752
+ box_threshold (float, optional): The threshold for detection. Defaults to
753
+ 0.23.
757
754
 
758
755
  Returns:
759
756
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -1845,6 +1842,48 @@ def flux_image_inpainting(
1845
1842
  return output_image
1846
1843
 
1847
1844
 
1845
+ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any]:
1846
+ """'siglip_classification' is a tool that can classify an image or a cropped detection given a list
1847
+ of input labels or tags. It returns the same list of the input labels along with
1848
+ their probability scores based on image content.
1849
+
1850
+ Parameters:
1851
+ image (np.ndarray): The image to classify or tag
1852
+ labels (List[str]): The list of labels or tags that is associated with the image
1853
+
1854
+ Returns:
1855
+ Dict[str, Any]: A dictionary with two keys: "labels", the list of the
1856
+ given labels, and "scores", the list of their corresponding scores.
1857
+
1858
+ Example
1859
+ -------
1860
+ >>> siglip_classification(image, ['dog', 'cat', 'bird'])
1861
+ {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]}
1862
+ """
1863
+
1864
+ if image.shape[0] < 1 or image.shape[1] < 1:
1865
+ return {"labels": [], "scores": []}
1866
+
1867
+ image_file = numpy_to_bytes(image)
1868
+
1869
+ files = [("image", image_file)]
1870
+
1871
+ payload = {
1872
+ "model": "siglip",
1873
+ "labels": labels,
1874
+ }
1875
+
1876
+ response: dict[str, Any] = send_inference_request(
1877
+ payload=payload,
1878
+ endpoint_name="classification",
1879
+ files=files,
1880
+ v2=True,
1881
+ metadata_payload={"function_name": "siglip_classification"},
1882
+ )
1883
+
1884
+ return response
1885
+
1886
+
1848
1887
  # Utility and visualization functions
1849
1888
 
1850
1889
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.189
3
+ Version: 0.2.191
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -54,11 +54,7 @@ Description-Content-Type: text/markdown
54
54
  </div>
55
55
 
56
56
  VisionAgent is a library that helps you utilize agent frameworks to generate code to
57
- solve your vision task. Many current vision problems can easily take hours or days to
58
- solve, you need to find the right model, figure out how to use it and program it to
59
- accomplish the task you want. VisionAgent aims to provide an in-seconds experience by
60
- allowing users to describe their problem in text and have the agent framework generate
61
- code to solve the task for them. Check out our discord for updates and roadmaps!
57
+ solve your vision task. Check out our discord for updates and roadmaps!
62
58
 
63
59
  ## Table of Contents
64
60
  - [🚀Quick Start](#quick-start)
@@ -82,19 +78,19 @@ To get started with the python library, you can install it using pip:
82
78
  pip install vision-agent
83
79
  ```
84
80
 
85
- Ensure you have an Anthropic key and an OpenAI API key and set in your environment
81
+ Ensure you have both an Anthropic key and an OpenAI API key set in your environment
86
82
  variables (if you are using Azure OpenAI please see the Azure setup section):
87
83
 
88
84
  ```bash
89
- export ANTHROPIC_API_KEY="your-api-key"
90
- export OPENAI_API_KEY="your-api-key"
85
+ export ANTHROPIC_API_KEY="your-api-key" # needed for VisionAgent and VisionAgentCoder
86
+ export OPENAI_API_KEY="your-api-key" # needed for ToolRecommender
91
87
  ```
92
88
 
93
89
  ### Basic Usage
94
90
  To get started you can just import the `VisionAgent` and start chatting with it:
95
91
  ```python
96
92
  >>> from vision_agent.agent import VisionAgent
97
- >>> agent = VisionAgent()
93
+ >>> agent = VisionAgent(verbosity=2)
98
94
  >>> resp = agent("Hello")
99
95
  >>> print(resp)
100
96
  [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}]
@@ -103,7 +99,7 @@ To get started you can just import the `VisionAgent` and start chatting with it:
103
99
  ```
104
100
 
105
101
  The chat messages are similar to `OpenAI`'s format with `role` and `content` keys but
106
- in addition to those you can add `medai` which is a list of media files that can either
102
+ in addition to those you can add `media` which is a list of media files that can either
107
103
  be images or video files.
108
104
 
109
105
  ## Documentation
@@ -3,9 +3,9 @@ vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xl
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
4
  vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
5
5
  vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
6
- vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
6
+ vision_agent/agent/vision_agent_coder.py,sha256=7Ko1c41dvdDbSP_Yw2yz_SlE3sO6hhlpf_oCjW0we2w,31749
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
8
- vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
8
+ vision_agent/agent/vision_agent_planner.py,sha256=Hy4vKiae7zIIKVPgLetGArbsjGRNVOXlxY9xhFgW-A0,18871
9
9
  vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
10
10
  vision_agent/agent/vision_agent_prompts.py,sha256=4329ll0kqCznRALIMl-rlKWGjN92p3bcRrz8R-cO744,13748
11
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
19
+ vision_agent/tools/__init__.py,sha256=UrpGFB1ACOZZCAyj8vNw0IHhKm9wGp0qHOtci2cqAMU,2825
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=LwpFnHRPvnGaRrzHFAs9CojcbKLyhaYnJYDk7l9fGlw,83609
23
+ vision_agent/tools/tools.py,sha256=72Ml8kxtaqIqrh4hiZQ81f5Mrl-7z-W1a6bCjIMBvoA,84952
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
32
- vision_agent-0.2.189.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.189.dist-info/METADATA,sha256=8ZrD4pcM8kLhBGOhLnIITMVYwd02L84J7a_xMvYltPo,18328
34
- vision_agent-0.2.189.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.189.dist-info/RECORD,,
32
+ vision_agent-0.2.191.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.191.dist-info/METADATA,sha256=eZGSUWuHBTmyStliR_oxFyoWMeLW0_0qP2ULx8y_-1E,18067
34
+ vision_agent-0.2.191.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.191.dist-info/RECORD,,