vision-agent 0.0.17__tar.gz → 0.0.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.17
+Version: 0.0.19
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -47,7 +47,7 @@ model.generate("Describe this image", "image.png")
 >>> "A yellow house with a green lawn."
 ```

-**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~5-10 min for the server to warm up, as it shuts down when usage is low.
+**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~3-5 min for the server to warm up, as it shuts down when usage is low.

 ### DataStore
 You can use the `DataStore` class to store your images, add new metadata to them such as descriptions, and search over different columns.
@@ -22,7 +22,7 @@ model.generate("Describe this image", "image.png")
 >>> "A yellow house with a green lawn."
 ```

-**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~5-10 min for the server to warm up, as it shuts down when usage is low.
+**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~3-5 min for the server to warm up, as it shuts down when usage is low.

 ### DataStore
 You can use the `DataStore` class to store your images, add new metadata to them such as descriptions, and search over different columns.
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "vision-agent"
-version = "0.0.17"
+version = "0.0.19"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -1,4 +1,5 @@
 import base64
+import json
 import logging
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -6,6 +7,15 @@ from typing import Any, Dict, List, Optional, Union, cast

 import requests

+from vision_agent.tools import (
+    SYSTEM_PROMPT,
+    CHOOSE_PARAMS,
+    ImageTool,
+    CLIP,
+    GroundingDINO,
+    GroundingSAM,
+)
+
 logging.basicConfig(level=logging.INFO)

 _LOGGER = logging.getLogger(__name__)
@@ -90,6 +100,75 @@ class OpenAILMM(LMM):
         )
         return cast(str, response.choices[0].message.content)

+    def generate_classifier(self, prompt: str) -> ImageTool:
+        prompt = CHOOSE_PARAMS.format(api_doc=CLIP.doc, question=prompt)
+        response = self.client.chat.completions.create(
+            model="gpt-4-turbo-preview",  # no need to use vision model here
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ],
+        )
+
+        try:
+            prompt = json.loads(cast(str, response.choices[0].message.content))[
+                "prompt"
+            ]
+        except json.JSONDecodeError:
+            _LOGGER.error(
+                f"Failed to decode response: {response.choices[0].message.content}"
+            )
+            raise ValueError("Failed to decode response")
+
+        return CLIP(prompt)
+
+    def generate_detector(self, prompt: str) -> ImageTool:
+        prompt = CHOOSE_PARAMS.format(api_doc=GroundingDINO.doc, question=prompt)
+        response = self.client.chat.completions.create(
+            model="gpt-4-turbo-preview",  # no need to use vision model here
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ],
+        )
+
+        try:
+            prompt = json.loads(cast(str, response.choices[0].message.content))[
+                "prompt"
+            ]
+        except json.JSONDecodeError:
+            _LOGGER.error(
+                f"Failed to decode response: {response.choices[0].message.content}"
+            )
+            raise ValueError("Failed to decode response")
+
+        return GroundingDINO(prompt)
+
+    def generate_segmentor(self, prompt: str) -> ImageTool:
+        prompt = CHOOSE_PARAMS.format(api_doc=GroundingSAM.doc, question=prompt)
+        response = self.client.chat.completions.create(
+            model="gpt-4-turbo-preview",  # no need to use vision model here
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ],
+        )
+
+        try:
+            prompt = json.loads(cast(str, response.choices[0].message.content))[
+                "prompt"
+            ]
+        except json.JSONDecodeError:
+            _LOGGER.error(
+                f"Failed to decode response: {response.choices[0].message.content}"
+            )
+            raise ValueError("Failed to decode response")
+
+        return GroundingSAM(prompt)
+

 def get_lmm(name: str) -> LMM:
     if name == "openai":
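All three `generate_*` methods follow the same pattern: format `CHOOSE_PARAMS` with the target tool's `doc` and the user's question, ask a JSON-mode GPT-4 for the parameters, pull the `"prompt"` field out of the reply, and wrap it in the corresponding tool class. A minimal usage sketch, assuming the module is importable as `vision_agent.lmm`, that an OpenAI API key is configured for the client, and that the question string is illustrative:

```python
from vision_agent.lmm import get_lmm

lmm = get_lmm("openai")

# The LMM converts a plain-English request into a Grounding DINO prompt
# and returns a GroundingDINO instance with that prompt stored on it.
detector = lmm.generate_detector("Can you build me a tool that detects red shirts?")
print(detector.prompt)  # expected to be something like "red shirt"
```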
@@ -0,0 +1,2 @@
+from .prompts import SYSTEM_PROMPT, CHOOSE_PARAMS
+from .tools import ImageTool, CLIP, GroundingDINO, GroundingSAM
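The new package `__init__` flattens the namespace, which is what lets the single `vision_agent.tools` import in the hunk above resolve. A quick sanity check, assuming the package is installed:

```python
from vision_agent.tools import SYSTEM_PROMPT, CLIP

print(SYSTEM_PROMPT)  # "You are a helpful assistant."
print(CLIP.doc[:40])  # class-level API doc that CHOOSE_PARAMS embeds as {api_doc}
```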
@@ -0,0 +1,19 @@
+SYSTEM_PROMPT = "You are a helpful assistant."
+
+# EasyTool prompts
+CHOOSE_PARAMS = (
+    "This is an API tool documentation. Given a user's question, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.\n"
+    "This is the API tool documentation: {api_doc}\n"
+    "Please note that: \n"
+    "1. The Example in the API tool documentation can help you better understand the use of the API.\n"
+    '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}\n'
+    "3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.\n"
+    '4. If you need to use this API multiple times, please set "Parameters" to a list.\n'
+    "5. You must ONLY output in a parsable JSON format. Two example outputs look like:\n"
+    "'''\n"
+    'Example 1: {{"Parameters":{{"keyword": "Artificial Intelligence", "language": "English"}}}}\n'
+    'Example 2: {{"Parameters":[{{"keyword": "Artificial Intelligence", "language": "English"}}, {{"keyword": "Machine Learning", "language": "English"}}]}}\n'
+    "'''\n"
+    "This is the user's question: {question}\n"
+    "Output:\n"
+)
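The doubled braces in `CHOOSE_PARAMS` (and in the tool `doc` strings below) are `str.format` escapes: `{{` and `}}` survive formatting as literal braces, while `{api_doc}` and `{question}` are substituted. A standard-library illustration of the same mechanism:

```python
# Doubled braces become literal JSON braces after .format() substitution.
template = 'Reply as {{"Parameters":{{"prompt": "{question}"}}}}'
print(template.format(question="car"))
# Reply as {"Parameters":{"prompt": "car"}}
```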
@@ -0,0 +1,58 @@
+from typing import Dict, List, Union
+from abc import ABC, abstractmethod
+
+from PIL.Image import Image as ImageType
+
+
+class ImageTool(ABC):
+    @abstractmethod
+    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
+        pass
+
+
+class CLIP(ImageTool):
+    doc = (
+        "CLIP is a tool that can classify or tag any image given a set of input classes or tags. "
+        "Here are some examples of how to use the tool. The examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which are the parameters you need to output to call the API to solve the user's question.\n"
+        'Example 1: User Question: "Can you classify this image as a cat?" {{"Parameters":{{"prompt": ["cat"]}}}}\n'
+        'Example 2: User Question: "Can you tag this photograph with cat or dog?" {{"Parameters":{{"prompt": ["cat", "dog"]}}}}\n'
+        'Example 3: User Question: "Can you build me a classifier that classifies red shirts, green shirts and other?" {{"Parameters":{{"prompt": ["red shirt", "green shirt", "other"]}}}}\n'
+    )
+
+    def __init__(self, prompt: str):
+        self.prompt = prompt
+
+    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
+        raise NotImplementedError
+
+
+class GroundingDINO(ImageTool):
+    doc = (
+        "Grounding DINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. "
+        "Here are some examples of how to use the tool. The examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which are the parameters you need to output to call the API to solve the user's question.\n"
+        'Example 1: User Question: "Can you build me a car detector?" {{"Parameters":{{"prompt": "car"}}}}\n'
+        'Example 2: User Question: "Can you detect the person on the left?" {{"Parameters":{{"prompt": "person on the left"}}}}\n'
+        'Example 3: User Question: "Can you build me a tool that detects red shirts and green shirts?" {{"Parameters":{{"prompt": "red shirt. green shirt"}}}}\n'
+    )
+
+    def __init__(self, prompt: str):
+        self.prompt = prompt
+
+    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
+        raise NotImplementedError
+
+
+class GroundingSAM(ImageTool):
+    doc = (
+        "Grounding SAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. "
+        "Here are some examples of how to use the tool. The examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which are the parameters you need to output to call the API to solve the user's question.\n"
+        'Example 1: User Question: "Can you build me a car segmentor?" {{"Parameters":{{"prompt": "car"}}}}\n'
+        'Example 2: User Question: "Can you segment the person on the left?" {{"Parameters":{{"prompt": "person on the left"}}}}\n'
+        'Example 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" {{"Parameters":{{"prompt": "red shirt. green shirt"}}}}\n'
+    )
+
+    def __init__(self, prompt: str):
+        self.prompt = prompt
+
+    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
+        raise NotImplementedError
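`ImageTool` fixes the calling convention for every tool: construct it with a prompt, then call the instance on an image path or PIL image to get a list of result dicts. In 0.0.19 the `__call__` bodies are still stubs, so the sketch below only shows the intended shape; the file name is hypothetical:

```python
from PIL import Image

from vision_agent.tools import GroundingDINO

tool = GroundingDINO("red shirt. green shirt")  # prompt as generate_detector would produce
image = Image.open("shirts.jpg")  # hypothetical local image

try:
    detections = tool(image)  # List[Dict] once implemented
except NotImplementedError:
    print("Inference is not wired up yet in 0.0.19.")
```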