vision-agent 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -241,7 +241,8 @@ class EasyTool(Agent):
      based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
      from the funcQA code.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import EasyTool
      >>> agent = EasyTool()
      >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
@@ -273,6 +274,15 @@ class EasyTool(Agent):
          input: Union[List[Dict[str, str]], str],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             A text response.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image=image)
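The new docstring documents two equivalent input forms. A quick sketch of both (the prompt text is taken from the doctest above); the same contract applies to the identical `__call__` docstrings added to Reflexion and VisionAgent below:

```python
from vision_agent.agent import EasyTool

agent = EasyTool()
# plain string prompt
resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")
# equivalent conversation format, as documented in the new docstring
resp = agent([{"role": "user", "content": "If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?"}])
```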
@@ -68,7 +68,8 @@ class Reflexion(Agent):
      self_reflect_model. Using Reflexion with LMMs may not work well, if it gets it wrong
      the first time, chances are it can't actually see the thing you want it to see.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import Reflexion
      >>> agent = Reflexion()
      >>> question = "How many tires does a truck have?"
@@ -139,6 +140,15 @@ class Reflexion(Agent):
          input: Union[str, List[Dict[str, str]]],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             A text response.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image)
@@ -315,12 +315,16 @@ def create_tasks(
  def self_reflect(
      reflect_model: Union[LLM, LMM],
      question: str,
+     tools: Dict[int, Any],
      tool_result: List[Dict],
      final_answer: str,
      image: Optional[Union[str, Path]] = None,
  ) -> str:
      prompt = VISION_AGENT_REFLECTION.format(
-         question=question, tool_results=str(tool_result), final_answer=final_answer
+         question=question,
+         tools=format_tools(tools),
+         tool_results=str(tool_result),
+         final_answer=final_answer,
      )
      if issubclass(type(reflect_model), LMM):
          return reflect_model(prompt, image=image)  # type: ignore
@@ -328,7 +332,8 @@ def self_reflect(


  def parse_reflect(reflect: str) -> bool:
-     return reflect.lower() == "finish"
+     # GPT-4V has a hard time following directions, so make the criteria less strict
+     return "finish" in reflect.lower() and len(reflect) < 100


  class VisionAgent(Agent):
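The relaxed `parse_reflect` check changes which reflections terminate the retry loop. A small sketch of the new behavior (the function body is copied from the diff; the sample strings are hypothetical):

```python
def parse_reflect(reflect: str) -> bool:
    # GPT-4V has a hard time following directions, so make the criteria less strict
    return "finish" in reflect.lower() and len(reflect) < 100

assert parse_reflect("Finish")                              # exact match still terminates
assert parse_reflect("The answer looks correct. Finish.")   # now also terminates
assert not parse_reflect("Finish. " + "But consider..." * 20)  # long replies count as a new plan
```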
@@ -339,7 +344,8 @@ class VisionAgent(Agent):
      reflect on whether or not it was able to accomplish the task based off of the plan
      and final results, if not it will redo the task with this newly added reflection.

-     Examples::
+     Example
+     -------
      >>> from vision_agent.agent import VisionAgent
      >>> agent = VisionAgent()
      >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")
@@ -371,6 +377,15 @@ class VisionAgent(Agent):
          input: Union[List[Dict[str, str]], str],
          image: Optional[Union[str, Path]] = None,
      ) -> str:
+         """Invoke the vision agent.
+
+         Parameters:
+             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+             image: the input image referenced in the prompt parameter.
+
+         Returns:
+             The result of the vision agent in text.
+         """
          if isinstance(input, str):
              input = [{"role": "user", "content": input}]
          return self.chat(input, image=image)
@@ -425,7 +440,12 @@ class VisionAgent(Agent):
              )

              reflection = self_reflect(
-                 self.reflect_model, question, all_tool_results, final_answer, image
+                 self.reflect_model,
+                 question,
+                 self.tools,
+                 all_tool_results,
+                 final_answer,
+                 image,
              )
              _LOGGER.info(f"\tReflection: {reflection}")
              if parse_reflect(reflection):
@@ -1,7 +1,10 @@
- VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given the user's question, the decomposed tasks and tools that the agent used to answer teh question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agen'ts answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+ VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question and the final answer the agent provided. You must determine if the agent's answer was correct or incorrect. If the agent's answer was correct, respond with Finish. If the agent's answer was incorrect, you must diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure with the tools avilable. Use complete sentences.

  User's question: {question}

+ Tools available:
+ {tools}
+
  Tasks and tools used:
  {tool_results}

@@ -1,3 +1,5 @@
+ """Utility functions for image processing."""
+
  import base64
  from io import BytesIO
  from pathlib import Path
@@ -9,6 +11,14 @@ from PIL.Image import Image as ImageType


  def b64_to_pil(b64_str: str) -> ImageType:
+     """Convert a base64 string to a PIL Image.
+
+     Parameters:
+         b64_str: the base64 encoded image
+
+     Returns:
+         The decoded PIL Image
+     """
      # , can't be encoded in b64 data so must be part of prefix
      if "," in b64_str:
          b64_str = b64_str.split(",")[1]
@@ -16,16 +26,29 @@ def b64_to_pil(b64_str: str) -> ImageType:


  def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
+     """Get the size of an image.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The size of the image in the form (height, width)
+     """
      if isinstance(data, (str, Path)):
          data = Image.open(data)

-     if isinstance(data, Image.Image):
-         return data.size[::-1]
-     else:
-         return data.shape[:2]
+     return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]


  def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
+     """Convert an image to a base64 string.
+
+     Parameters:
+         data: the input image
+
+     Returns:
+         The base64 encoded image
+     """
      if data is None:
          raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
      if isinstance(data, (str, Path)):
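These helpers compose into a simple round trip. A minimal sketch, assuming a local image file named `ct_scan1.jpg` (the file name is hypothetical):

```python
from vision_agent.image_utils import b64_to_pil, convert_to_b64, get_image_size

b64 = convert_to_b64("ct_scan1.jpg")   # encode the image as a base64 string
img = b64_to_pil(b64)                  # decode it back into a PIL Image
print(get_image_size(img))             # (height, width), per the docstring above
```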
@@ -30,7 +30,7 @@ def normalize_bbox(
  def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
      r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.

-     Args:
+     Parameters:
          mask_rle: Run-length as string formated (start length)
          shape: The (height, width) of array to return
      """
@@ -54,7 +54,8 @@ class CLIP(Tool):
      r"""CLIP is a tool that can classify or tag any image given a set if input classes
      or tags.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> clip = va.tools.CLIP()
      >>> clip(["red line", "yellow dot"], "ct_scan1.jpg"))
@@ -89,7 +90,17 @@ class CLIP(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+         """Invoke the CLIP model.
+
+         Parameters:
+             prompt: a list of classes or tags to classify the image.
+             image: the input image to classify.
+
+         Returns:
+             A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+         """
          image_b64 = convert_to_b64(image)
          data = {
              "classes": prompt,
@@ -117,7 +128,8 @@ class GroundingDINO(Tool):
      r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
      category names or referring expressions.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> t = va.tools.GroundingDINO()
      >>> t("red line. yellow dot", "ct_scan1.jpg")
@@ -154,7 +166,17 @@ class GroundingDINO(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]:
+         """Invoke the Grounding DINO model.
+
+         Parameters:
+             prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
+             image: the input image to run against.
+
+         Returns:
+             A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
+         """
          image_size = get_image_size(image)
          image_b64 = convert_to_b64(image)
          data = {
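The period-separated prompt format matters when detecting several classes at once. A minimal sketch (the image file name is hypothetical):

```python
import vision_agent as va

t = va.tools.GroundingDINO()
dets = t("big dog . small cat", "backyard.jpg")  # two classes, separated by a period
for label, score, bbox in zip(dets[0]["labels"], dets[0]["scores"], dets[0]["bboxes"]):
    print(label, score, bbox)  # fields per the documented return shape
```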
@@ -188,7 +210,8 @@ class GroundingSAM(Tool):
      r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
      inputs such as category names or referring expressions.

-     Examples::
+     Example
+     -------
      >>> import vision_agent as va
      >>> t = va.tools.GroundingSAM()
      >>> t(["red line", "yellow dot"], ct_scan1.jpg"])
@@ -234,7 +257,17 @@ class GroundingSAM(Tool):
          ],
      }

+     # TODO: Add support for input multiple images, which aligns with the output type.
      def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+         """Invoke the Grounding SAM model.
+
+         Parameters:
+             prompt: a list of classes to segment.
+             image: the input image to segment.
+
+         Returns:
+             A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
+         """
          image_size = get_image_size(image)
          image_b64 = convert_to_b64(image)
          data = {
@@ -260,8 +293,7 @@ class GroundingSAM(Tool):
              ret_pred["labels"].append(pred["label_name"])
              ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
              ret_pred["masks"].append(mask)
-         ret_preds = [ret_pred]
-         return ret_preds
+         return [ret_pred]


  class AgentGroundingSAM(GroundingSAM):
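Consuming the single-dict-per-image return value looks roughly like this (a sketch; the file name is hypothetical, and the masks are assumed to be binary numpy arrays, matching the `rle_decode` output documented above):

```python
import vision_agent as va

sam = va.tools.GroundingSAM()
preds = sam(["red line", "yellow dot"], "ct_scan1.jpg")
pred = preds[0]  # one dict per image, per the documented return shape
for label, mask in zip(pred["labels"], pred["masks"]):
    print(label, mask.sum())  # assumption: mask is a binary numpy array
```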
@@ -282,6 +314,8 @@ class AgentGroundingSAM(GroundingSAM):


  class Counter(Tool):
+     r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
+
      name = "counter_"
      description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
      usage = {
@@ -307,6 +341,8 @@ class Counter(Tool):


  class Crop(Tool):
+     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
+
      name = "crop_"
      description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
      usage = {
@@ -343,6 +379,8 @@ class Crop(Tool):


  class BboxArea(Tool):
+     r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+
      name = "bbox_area_"
      description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
      usage = {
@@ -371,6 +409,8 @@ class BboxArea(Tool):


  class SegArea(Tool):
+     r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
+
      name = "seg_area_"
      description = "'seg_area_' returns the area of the segmentation mask in pixels normalized to 2 decimal places."
      usage = {
@@ -390,6 +430,8 @@ class SegArea(Tool):


  class Add(Tool):
+     r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "add_"
      description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -407,6 +449,8 @@ class Add(Tool):


  class Subtract(Tool):
+     r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "subtract_"
      description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -424,6 +468,8 @@ class Subtract(Tool):


  class Multiply(Tool):
+     r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "multiply_"
      description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -441,6 +487,8 @@ class Multiply(Tool):


  class Divide(Tool):
+     r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
+
      name = "divide_"
      description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
      usage = {
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.0.35
+ Version: 0.0.37
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -1,25 +1,25 @@
  vision_agent/__init__.py,sha256=wD1cssVTAJ55uTViNfBGooqJUV0p9fmVAuTMHHrmUBU,229
  vision_agent/agent/__init__.py,sha256=B4JVrbY4IRVCJfjmrgvcp7h1mTUEk8MZvL0Zmej4Ka0,127
  vision_agent/agent/agent.py,sha256=PRLItaPfMc94H6mAIPj_gBvJ8RezDEPanB6Cmu81A0M,306
- vision_agent/agent/easytool.py,sha256=SJ1Y8Lnz_HVGEzs2qSb-rq6glEjVG2slVHg8Sri17yo,11168
+ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
  vision_agent/agent/easytool_prompts.py,sha256=uNp12LOFRLr3i2zLhNuLuyFms2-s8es2t6P6h76QDow,4493
- vision_agent/agent/reflexion.py,sha256=TDNBpno_8Z-MIENr05msyqIqYOavW-ZP_ARPeXrPr_k,9758
+ vision_agent/agent/reflexion.py,sha256=wzpptfALNZIh9Q5jgkK3imGL5LWjTW_n_Ypsvxdh07Q,10101
  vision_agent/agent/reflexion_prompts.py,sha256=UPGkt_qgHBMUY0VPVoF-BqhR0d_6WPjjrhbYLBYOtnQ,9342
- vision_agent/agent/vision_agent.py,sha256=5d_EuySLii7PNLlPsnNkX1_88xzl3ajE31HLJKBYyY0,14336
- vision_agent/agent/vision_agent_prompts.py,sha256=F4WEpyYx_HpQj-vDm2LTtUm-yaLCOug-AKhxr7MNCvc,6061
+ vision_agent/agent/vision_agent.py,sha256=JPoY92M5xNaViLdNf4d1oqAX00QUuQxk-gcc9jIlfqA,14981
+ vision_agent/agent/vision_agent_prompts.py,sha256=otaDRsaHc7bqw_tgWTnu-eUcFeOzBFrn9sPU7_xr2VQ,6151
  vision_agent/data/__init__.py,sha256=YU-5g3LbEQ6a4drz0RLGTagXMVU2Z4Xr3RlfWE-R0jU,46
  vision_agent/data/data.py,sha256=pgtSGZdAnbQ8oGsuapLtFTMPajnCGDGekEXTnFuBwsY,5122
  vision_agent/emb/__init__.py,sha256=YmCkGrJBtXb6X6Z3lnKiFoQYKXMgHMJp8JJyMLVvqcI,75
  vision_agent/emb/emb.py,sha256=la9lhEzk7jqUCjYYQ5oRgVNSnC9_EJBJIpE_B9c6PJo,1375
- vision_agent/image_utils.py,sha256=b1iYAoBlmGgOo-ZRNV_Hdz2XsxH8Nuas6CBBoz2HFUQ,1202
+ vision_agent/image_utils.py,sha256=D5H-GN35Bz3u1Fq_JfYQVjNzAmZjJl138wma5fRtVjA,1684
  vision_agent/llm/__init__.py,sha256=fBKsIjL4z08eA0QYx6wvhRe4Nkp2pJ4VrZK0-uUL5Ec,32
  vision_agent/llm/llm.py,sha256=d8A7jmLVGx5HzoiYJ75mTMU7dbD5-bOYeXYlHaay6WA,3957
  vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,51
  vision_agent/lmm/lmm.py,sha256=ARcbgkcyP83TbVVoXI9B-gtG0gJuTaG_MjcUGbams4U,8052
  vision_agent/tools/__init__.py,sha256=aX0pU3pXU1V0Cj9FzYCvdsX76TAglFMHx59kNhXHbPs,131
  vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
- vision_agent/tools/tools.py,sha256=j_Jq_YHNmwrGXNR3fL9qi0yrHorqFui5UnAnLcEw20U,16826
- vision_agent-0.0.35.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.0.35.dist-info/METADATA,sha256=5ofgjIl0NMVqXu_gFeoZ5xlfedqVbNztHONDVa3xP2E,4966
- vision_agent-0.0.35.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.0.35.dist-info/RECORD,,
+ vision_agent/tools/tools.py,sha256=Vlb8H9qm4rA5HxGw5p-gJES6jgPIkfrtVlM7jcxw7d8,19141
+ vision_agent-0.0.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.0.37.dist-info/METADATA,sha256=Y9oIfWbRK-3EuNewrwK4WOnpHY2ca7FB8jDa5oucT5Y,4966
+ vision_agent-0.0.37.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.0.37.dist-info/RECORD,,