vision-agent 0.0.41__tar.gz → 0.0.43__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {vision_agent-0.0.41 → vision_agent-0.0.43}/PKG-INFO +1 -1
  2. {vision_agent-0.0.41 → vision_agent-0.0.43}/pyproject.toml +1 -1
  3. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/vision_agent.py +15 -10
  4. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/llm/llm.py +10 -7
  5. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/lmm/lmm.py +14 -3
  6. vision_agent-0.0.43/vision_agent/tools/tools.json +154 -0
  7. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/tools/tools.py +14 -5
  8. {vision_agent-0.0.41 → vision_agent-0.0.43}/LICENSE +0 -0
  9. {vision_agent-0.0.41 → vision_agent-0.0.43}/README.md +0 -0
  10. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/__init__.py +0 -0
  11. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/__init__.py +0 -0
  12. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/agent.py +0 -0
  13. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/easytool.py +0 -0
  14. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/easytool_prompts.py +0 -0
  15. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/reflexion.py +0 -0
  16. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/reflexion_prompts.py +0 -0
  17. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/vision_agent_prompts.py +0 -0
  18. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/data/__init__.py +0 -0
  19. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/data/data.py +0 -0
  20. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/emb/__init__.py +0 -0
  21. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/emb/emb.py +0 -0
  22. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/image_utils.py +0 -0
  23. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/llm/__init__.py +0 -0
  24. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/lmm/__init__.py +0 -0
  25. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/tools/__init__.py +0 -0
  26. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/tools/video.py +0 -0
{vision_agent-0.0.41 → vision_agent-0.0.43}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.41
+Version: 0.0.43
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.0.41 → vision_agent-0.0.43}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.0.41"
+version = "0.0.43"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
{vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/vision_agent.py
@@ -256,7 +256,6 @@ def retrieval(
     )
     if tool_id is None:
         return {}, ""
-    _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})")
 
     tool_instructions = tools[tool_id]
     tool_usage = tool_instructions["usage"]
@@ -265,7 +264,6 @@ def retrieval(
     parameters = choose_parameter(
        model, question, tool_usage, previous_log, reflections
     )
-    _LOGGER.info(f"\tParameters: {parameters} for {tool_name}")
     if parameters is None:
         return {}, ""
     tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters}
@@ -290,7 +288,7 @@ def retrieval(
     tool_results["call_results"] = call_results
 
     call_results_str = str(call_results)
-    _LOGGER.info(f"\tCall Results: {call_results_str}")
+    # _LOGGER.info(f"\tCall Results: {call_results_str}")
     return tool_results, call_results_str
 
 
@@ -344,7 +342,9 @@ def self_reflect(
 
 def parse_reflect(reflect: str) -> bool:
     # GPT-4V has a hard time following directions, so make the criteria less strict
-    return "finish" in reflect.lower() and len(reflect) < 100
+    return (
+        "finish" in reflect.lower() and len(reflect) < 100
+    ) or "finish" in reflect.lower()[-10:]
 
 
 def visualize_result(all_tool_results: List[Dict]) -> List[str]:
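The loosened `parse_reflect` criterion now also accepts long reflections, provided "finish" appears somewhere in the last ten characters. A small standalone mirror of the new predicate, for illustration only:

```python
def parse_reflect(reflect: str) -> bool:
    # Same predicate as the hunk above: short reflections containing "finish",
    # or any reflection whose final ten characters contain "finish".
    return (
        "finish" in reflect.lower() and len(reflect) < 100
    ) or "finish" in reflect.lower()[-10:]

print(parse_reflect("Finish"))                                       # True (short)
print(parse_reflect("The answer looks correct. " * 10 + "Finish"))   # True only under the new rule
print(parse_reflect("The plan still needs another tool call."))      # False
```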
{vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/agent/vision_agent.py (continued)
@@ -423,10 +423,16 @@ class VisionAgent(Agent):
         verbose: bool = False,
     ):
         self.task_model = (
-            OpenAILLM(json_mode=True) if task_model is None else task_model
+            OpenAILLM(json_mode=True, temperature=0.1)
+            if task_model is None
+            else task_model
+        )
+        self.answer_model = (
+            OpenAILLM(temperature=0.1) if answer_model is None else answer_model
+        )
+        self.reflect_model = (
+            OpenAILMM(temperature=0.1) if reflect_model is None else reflect_model
         )
-        self.answer_model = OpenAILLM() if answer_model is None else answer_model
-        self.reflect_model = OpenAILMM() if reflect_model is None else reflect_model
         self.max_retries = max_retries
 
         self.tools = TOOLS
@@ -466,7 +472,6 @@ class VisionAgent(Agent):
         for _ in range(self.max_retries):
             task_list = create_tasks(self.task_model, question, self.tools, reflections)
 
-            _LOGGER.info(f"Task Dependency: {task_list}")
             task_depend = {"Original Quesiton": question}
             previous_log = ""
             answers = []
@@ -477,7 +482,6 @@ class VisionAgent(Agent):
             for task in task_list:
                 task_str = task["task"]
                 previous_log = str(task_depend)
-                _LOGGER.info(f"\tSubtask: {task_str}")
                 tool_results, call_results = retrieval(
                     self.task_model,
                     task_str,
@@ -492,6 +496,7 @@ class VisionAgent(Agent):
                 tool_results["answer"] = answer
                 all_tool_results.append(tool_results)
 
+                _LOGGER.info(f"\tCall Result: {call_results}")
                 _LOGGER.info(f"\tAnswer: {answer}")
                 answers.append({"task": task_str, "answer": answer})
                 task_depend[task["id"]]["answer"] = answer  # type: ignore
@@ -510,7 +515,7 @@ class VisionAgent(Agent):
                 final_answer,
                 visualized_images[0] if len(visualized_images) > 0 else image,
             )
-            _LOGGER.info(f"\tReflection: {reflection}")
+            _LOGGER.info(f"Reflection: {reflection}")
            if parse_reflect(reflection):
                break
            else:
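The constructor change only affects defaults: all three models now sample at temperature 0.1 rather than the OpenAI default. A minimal sketch of overriding that, assuming the import paths follow the package layout in the file list above (vision_agent.agent, vision_agent.llm, vision_agent.lmm exporting these classes):

```python
from vision_agent.agent import VisionAgent
from vision_agent.llm import OpenAILLM
from vision_agent.lmm import OpenAILMM

# Explicit models bypass the new temperature=0.1 defaults; the keyword argument
# is forwarded to the OpenAI client via **kwargs (see the llm.py/lmm.py hunks).
agent = VisionAgent(
    task_model=OpenAILLM(json_mode=True, temperature=0.7),
    answer_model=OpenAILLM(temperature=0.7),
    reflect_model=OpenAILMM(temperature=0.7),
)
```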
{vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/llm/llm.py
@@ -1,6 +1,6 @@
 import json
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, List, Mapping, Union, cast
+from typing import Any, Callable, Dict, List, Mapping, Union, cast
 
 from openai import OpenAI
 
@@ -31,30 +31,33 @@ class OpenAILLM(LLM):
     r"""An LLM class for any OpenAI LLM model."""
 
     def __init__(
-        self, model_name: str = "gpt-4-turbo-preview", json_mode: bool = False
+        self,
+        model_name: str = "gpt-4-turbo-preview",
+        json_mode: bool = False,
+        **kwargs: Any
     ):
         self.model_name = model_name
         self.client = OpenAI()
-        self.json_mode = json_mode
+        self.kwargs = kwargs
+        if json_mode:
+            self.kwargs["response_format"] = {"type": "json_object"}
 
     def generate(self, prompt: str) -> str:
-        kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {}
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=[
                 {"role": "user", "content": prompt},
             ],
-            **kwargs,  # type: ignore
+            **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
 
     def chat(self, chat: List[Dict[str, str]]) -> str:
-        kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {}
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=chat,  # type: ignore
-            **kwargs,
+            **self.kwargs,
         )
 
         return cast(str, response.choices[0].message.content)
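With json_mode folded into self.kwargs, any OpenAI chat-completion parameter passed to the constructor is now forwarded verbatim to client.chat.completions.create on every generate() and chat() call. A minimal usage sketch, assuming the vision_agent.llm import path:

```python
from vision_agent.llm import OpenAILLM

# json_mode=True stores response_format={"type": "json_object"} in self.kwargs;
# temperature (or top_p, seed, ...) rides along on the same dict.
llm = OpenAILLM(json_mode=True, temperature=0.1)
print(llm.generate("Return the answer as JSON: what is 2 + 2?"))
```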
{vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/lmm/lmm.py
@@ -97,11 +97,15 @@ class OpenAILMM(LMM):
     r"""An LMM class for the OpenAI GPT-4 Vision model."""
 
     def __init__(
-        self, model_name: str = "gpt-4-vision-preview", max_tokens: int = 1024
+        self,
+        model_name: str = "gpt-4-vision-preview",
+        max_tokens: int = 1024,
+        **kwargs: Any,
     ):
         self.model_name = model_name
         self.max_tokens = max_tokens
         self.client = OpenAI()
+        self.kwargs = kwargs
 
     def __call__(
         self,
@@ -123,6 +127,13 @@ class OpenAILMM(LMM):
 
         if image:
             extension = Path(image).suffix
+            if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
+                extension = "jpg"
+            elif extension.lower() == ".png":
+                extension = "png"
+            else:
+                raise ValueError(f"Unsupported image extension: {extension}")
+
             encoded_image = encode_image(image)
             fixed_chat[0]["content"].append(  # type: ignore
                 {
@@ -135,7 +146,7 @@ class OpenAILMM(LMM):
             )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=fixed_chat, max_tokens=self.max_tokens  # type: ignore
+            model=self.model_name, messages=fixed_chat, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
         )
 
         return cast(str, response.choices[0].message.content)
@@ -163,7 +174,7 @@ class OpenAILMM(LMM):
         )
 
         response = self.client.chat.completions.create(
-            model=self.model_name, messages=message, max_tokens=self.max_tokens  # type: ignore
+            model=self.model_name, messages=message, max_tokens=self.max_tokens, **self.kwargs  # type: ignore
         )
         return cast(str, response.choices[0].message.content)
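The new suffix check means only .jpg/.jpeg/.png paths reach encode_image; anything else now raises before the API call. A standalone mirror of the added branch, for illustration only:

```python
from pathlib import Path

def normalize_extension(image: str) -> str:
    # Mirrors the check added to OpenAILMM.__call__ above.
    extension = Path(image).suffix
    if extension.lower() in (".jpeg", ".jpg"):
        return "jpg"
    if extension.lower() == ".png":
        return "png"
    raise ValueError(f"Unsupported image extension: {extension}")

print(normalize_extension("photo.JPG"))    # "jpg"
print(normalize_extension("diagram.png"))  # "png"
# normalize_extension("scan.webp")         # raises ValueError
```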
vision_agent-0.0.43/vision_agent/tools/tools.json (new file)
@@ -0,0 +1,154 @@
+[
+  {
+    "name": "image_question_answering",
+    "description": "answers a question about an image"
+  },
+  {
+    "name": "text_question_answering",
+    "description": "answers a question provided a context in text form"
+  },
+  {
+    "name": "image_captioning",
+    "description": "Generate a caption for the image. It can generate a brief description that can be used for image perception and image generation. For example: a) you can use this tool when you want to know what is it in the image\"; and b) when you want to generate a new image similar or resemble to input.png, you can use `image_captioning` to obtain the description about image input.png."
+  },
+  {
+    "name": "image_to_text",
+    "description": "Generate a description for the image. It can generate a detailed description that can be used for image perception and image generation. For example: a) you can use this tool when you want to know what is it in the image\"; and b) when you want to generate a new image similar or resemble to input.png, you can use `text_to_image` to obtain the description about image input.png."
+  },
+  {
+    "name": "image_to_edge",
+    "description": "get the edge map of the image."
+  },
+  {
+    "name": "image_to_line",
+    "description": "get the line map of the image."
+  },
+  {
+    "name": "image_to_hed",
+    "description": "get the HED map of the image."
+  },
+  {
+    "name": "image_to_scribble",
+    "description": "get the scribble of the image."
+  },
+  {
+    "name": "image_to_pose",
+    "description": "Get the pose of the image. It is usually used in image generation conditioned on pose map from input image."
+  },
+  {
+    "name": "image_to_depth",
+    "description": "get the depth map of the image."
+  },
+  {
+    "name": "image_to_normal",
+    "description": "get the normal map of the image."
+  },
+  {
+    "name": "object_detection",
+    "description": "detect all the objects in the image."
+  },
+  {
+    "name": "image_classification",
+    "description": "classify the objects in the image."
+  },
+  {
+    "name": "closed_set_image_classification",
+    "description": "Given a set of classes as a text prompt, classify the objects in the image based on the given classes."
+  },
+  {
+    "name": "panoptic_segmentation",
+    "description": "segment the common objects in the given image."
+  },
+  {
+    "name": "visual_grounding",
+    "description": "Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. The query can be a phrase, a sentence or even a multi-round dialogue."
+  },
+  {
+    "name": "visual_grounding_segment",
+    "description": "Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. The query can be a phrase, a sentence or even a multi-round dialogue."
+  },
+  {
+    "name": "optical_character_recognition",
+    "description": "Optical Character Recognition (OCR) is the process that converts an image of text into a machine-readable text format."
+  },
+  {
+    "name": "select_category",
+    "description": "select the target classes in category list with the given condition."
+  },
+  {
+    "name": "select_bbox",
+    "description": "select the bounding boxes with the given condition."
+  },
+  {
+    "name": "select_mask",
+    "description": "select the masks with the given condition."
+  },
+  {
+    "name": "count_categories",
+    "description": "count target categories in the given list."
+  },
+  {
+    "name": "count_objects",
+    "description": "count target objects in the given list. It is useful when you want to count the number of objects in the image"
+  },
+  {
+    "name": "count_masks",
+    "description": "count target mask in the given list."
+  },
+  {
+    "name": "video_captioning",
+    "description": "Generate a caption or description for video. It can generate a detailed description that can be used for video perception and video generation. For example: a) you can use this tool when you want to know what happened in the video\"; and b) when you want to generate tags for input video, you can use translate description obtained from `image_captioning` into tags."
+  },
+  {
+    "name": "video_classification",
+    "description": "Classify the video and detect the actions in the video."
+  },
+  {
+    "name": "frame_captioning",
+    "description": "Generate a caption or description for video every n seconds. It can generate a detailed description that can be used for video perception and video generation. For example: a) you can use this tool when you want to know what happened in the video\"; and b) when you want to generate tags for input video, you can use translate description obtained from `image_captioning` into tags."
+  },
+  {
+    "name": "frame_classification",
+    "description": "Classify the video and detect the actions in the every n seconds."
+  },
+  {
+    "name": "text_to_text_generation",
+    "description": "Text to text generation. It can be used for sentence acceptability judgment, Sentiment analysis, Paraphrasing/sentence similarity, Natural language inference, Sentence completion, Word sense disambiguation, Question answering."
+  },
+  {
+    "name": "openai_chat_model",
+    "description": "Answer the question by Large Language Model. It is useful for tasks such as generating content, answering questions, engaging in conversations and providing explanations. However, it still has some limitations. For example, it can not directly access the up-to-date information like time, weather, etc."
+  },
+  {
+    "name": "summarization",
+    "description": "Summarize sentences, long narratives, articles, papers, textbooks."
+  },
+  {
+    "name": "text_to_tags",
+    "description": "Predict the tags of text, article and papers by using the their textual content as input"
+  },
+  {
+    "name": "sentiment_analysis",
+    "description": "Sentiment analysis is the process of analyzing digital text to determine if the emotional tone of the message is positive, negative, or neutral."
+  }
+]
+
+input_keys: [
+  { "image": "image file"},
+  { "prompt": "text"},
+  { "video": "video file"},
+  { "context": "context for text qa"},
+  { "tool": "tool name mentioned above"},
+]
+
+output_keys: [
+  { "labels": "list of labels for image tasks"},
+  { "scores": "list of scores for image and text tasks"},
+  { "bboxes": "list of bounding boxes for detection tasks"},
+  { "masks": "list of masks for segmentation tasks"},
+  { "text": "list of text for text tasks"},
+  { "frames": "list of frame numbers for video tasks"},
+]
+
+
+
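As shipped, tools.json is not strict JSON: the input_keys/output_keys notes follow the closing bracket of the tool array. A hedged sketch (file path is an assumption) that reads only the leading array:

```python
import json
from pathlib import Path

# Path is an assumption; point it at the installed package's tools.json.
raw = Path("vision_agent/tools/tools.json").read_text()

# raw_decode parses the first JSON value (the tool array) and ignores the
# trailing input_keys/output_keys text that follows it in the shipped file.
tools, _ = json.JSONDecoder().raw_decode(raw.lstrip())
print([tool["name"] for tool in tools])
```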
{vision_agent-0.0.41 → vision_agent-0.0.43}/vision_agent/tools/tools.py
@@ -1,5 +1,6 @@
 import logging
 import tempfile
+import os
 from abc import ABC
 from collections import Counter as CounterClass
 from pathlib import Path
@@ -139,7 +140,7 @@ class GroundingDINO(Tool):
         'scores': [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws"
+    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"
 
     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."
@@ -182,11 +183,15 @@ class GroundingDINO(Tool):
         image_b64 = convert_to_b64(image)
         data = {
             "prompt": prompt,
-            "images": [image_b64],
+            "images": image_b64,
+            "tool": "visual_grounding",
         }
         res = requests.post(
             self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}",
+            },
             json=data,
         )
         resp_json: Dict[str, Any] = res.json()
@@ -230,7 +235,7 @@ class GroundingSAM(Tool):
            [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """
 
-    _ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws"
+    _ENDPOINT = "https://model-owp50nlq.api.baseten.co/production/predict"
 
     name = "grounding_sam_"
     description = "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions."
@@ -274,10 +279,14 @@ class GroundingSAM(Tool):
         data = {
             "classes": prompt,
             "image": image_b64,
+            "tool": "visual_grounding_segment",
         }
         res = requests.post(
             self._ENDPOINT,
-            headers={"Content-Type": "application/json"},
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}",
+            },
             json=data,
        )
         resp_json: Dict[str, Any] = res.json()
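Both GroundingDINO and GroundingSAM now build the Authorization header from os.environ["BASETEN_API_KEY"] at request time, so a missing key raises KeyError before the request is sent. A minimal sketch, with the import path and call signature assumed from the request payload above:

```python
import os

from vision_agent.tools import GroundingDINO

# The Authorization header is read from BASETEN_API_KEY when the tool is
# invoked; the placeholder value below is only an illustration.
os.environ.setdefault("BASETEN_API_KEY", "<your-baseten-api-key>")

dino = GroundingDINO()
print(dino(prompt="a red car", image="street.jpg"))
```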