vision-agent 0.0.36__tar.gz → 0.0.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.0.36 → vision_agent-0.0.37}/PKG-INFO +1 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/pyproject.toml +1 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/easytool.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/reflexion.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/vision_agent.py +11 -1
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/image_utils.py +27 -4
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/tools.py +54 -6
- {vision_agent-0.0.36 → vision_agent-0.0.37}/LICENSE +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/README.md +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/data/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/data/data.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/emb/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/emb/emb.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.0.36 → vision_agent-0.0.37}/vision_agent/tools/prompts.py +0 -0
vision_agent/agent/easytool.py

@@ -241,7 +241,8 @@ class EasyTool(Agent):
     based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool
     from the funcQA code.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import EasyTool
         >>> agent = EasyTool()
         >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?")

@@ -273,6 +274,15 @@ class EasyTool(Agent):
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            A text response.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)
vision_agent/agent/reflexion.py

@@ -68,7 +68,8 @@ class Reflexion(Agent):
     self_reflect_model. Using Reflexion with LMMs may not work well, if it gets it wrong
     the first time, chances are it can't actually see the thing you want it to see.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import Reflexion
         >>> agent = Reflexion()
         >>> question = "How many tires does a truck have?"

@@ -139,6 +140,15 @@ class Reflexion(Agent):
         input: Union[str, List[Dict[str, str]]],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            A text response.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image)
vision_agent/agent/vision_agent.py

@@ -344,7 +344,8 @@ class VisionAgent(Agent):
     reflect on whether or not it was able to accomplish the task based off of the plan
     and final results, if not it will redo the task with this newly added reflection.
 
-
+    Example
+    -------
         >>> from vision_agent.agent import VisionAgent
         >>> agent = VisionAgent()
         >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")

@@ -376,6 +377,15 @@ class VisionAgent(Agent):
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
     ) -> str:
+        """Invoke the vision agent.
+
+        Parameters:
+            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the prompt parameter.
+
+        Returns:
+            The result of the vision agent in text.
+        """
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
         return self.chat(input, image=image)
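The new __call__ docstrings in all three agents describe the same two accepted input forms: a plain string prompt, or a conversation list that a string prompt is normalized into before being handed to chat(). A minimal usage sketch of both forms, assuming the model credentials the agents rely on are configured and that "tomatoes.jpg" exists locally (both are illustrative, not part of this diff):

from vision_agent.agent import VisionAgent

agent = VisionAgent()

# Form 1: a plain string prompt; __call__ wraps it as
# [{"role": "user", "content": "..."}] before calling chat().
resp = agent(
    "What is the total cost of all the tomatoes in the image?",
    image="tomatoes.jpg",
)

# Form 2: an explicit conversation in the documented format.
resp = agent(
    [{"role": "user", "content": "What is the total cost of all the tomatoes in the image?"}],
    image="tomatoes.jpg",
)
print(resp)  # a text response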
vision_agent/image_utils.py

@@ -1,3 +1,5 @@
+"""Utility functions for image processing."""
+
 import base64
 from io import BytesIO
 from pathlib import Path

@@ -9,6 +11,14 @@ from PIL.Image import Image as ImageType
 
 
 def b64_to_pil(b64_str: str) -> ImageType:
+    """Convert a base64 string to a PIL Image.
+
+    Parameters:
+        b64_str: the base64 encoded image
+
+    Returns:
+        The decoded PIL Image
+    """
     # , can't be encoded in b64 data so must be part of prefix
     if "," in b64_str:
         b64_str = b64_str.split(",")[1]

@@ -16,16 +26,29 @@ def b64_to_pil(b64_str: str) -> ImageType:
 
 
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
+    """Get the size of an image.
+
+    Parameters:
+        data: the input image
+
+    Returns:
+        The size of the image in the form (height, width)
+    """
     if isinstance(data, (str, Path)):
         data = Image.open(data)
 
-    if isinstance(data, Image.Image):
-        return data.size[::-1]
-    else:
-        return data.shape[:2]
+    return data.size[::-1] if isinstance(data, Image.Image) else data.shape[:2]
 
 
 def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
+    """Convert an image to a base64 string.
+
+    Parameters:
+        data: the input image
+
+    Returns:
+        The base64 encoded image
+    """
     if data is None:
         raise ValueError(f"Invalid input image: {data}. Input image can't be None.")
     if isinstance(data, (str, Path)):
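The new image_utils docstrings pin down the helper contracts: get_image_size returns (height, width), which is why it reverses PIL's (width, height) .size, and convert_to_b64 / b64_to_pil round-trip an image through base64. A rough sketch, assuming Pillow and numpy are installed and that "ct_scan1.jpg" is an image on disk (the filename is illustrative):

from vision_agent.image_utils import b64_to_pil, convert_to_b64, get_image_size

size = get_image_size("ct_scan1.jpg")  # (height, width), per the new docstring
b64 = convert_to_b64("ct_scan1.jpg")   # base64-encoded image string
img = b64_to_pil(b64)                  # decoded back into a PIL Image
print(size, img.size)                  # note: PIL's .size is (width, height)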
vision_agent/tools/tools.py

@@ -30,7 +30,7 @@ def normalize_bbox(
 def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
     r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
 
-
+    Parameters:
         mask_rle: Run-length as string formated (start length)
         shape: The (height, width) of array to return
     """

@@ -54,7 +54,8 @@ class CLIP(Tool):
     r"""CLIP is a tool that can classify or tag any image given a set if input classes
     or tags.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> clip = va.tools.CLIP()
         >>> clip(["red line", "yellow dot"], "ct_scan1.jpg"))

@@ -89,7 +90,17 @@ class CLIP(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+        """Invoke the CLIP model.
+
+        Parameters:
+            prompt: a list of classes or tags to classify the image.
+            image: the input image to classify.
+
+        Returns:
+            A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]
+        """
         image_b64 = convert_to_b64(image)
         data = {
             "classes": prompt,

@@ -117,7 +128,8 @@ class GroundingDINO(Tool):
     r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
     category names or referring expressions.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> t = va.tools.GroundingDINO()
         >>> t("red line. yellow dot", "ct_scan1.jpg")

@@ -154,7 +166,17 @@ class GroundingDINO(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]:
+        """Invoke the Grounding DINO model.
+
+        Parameters:
+            prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat"
+            image: the input image to run against.
+
+        Returns:
+            A list of dictionaries containing the labels, scores, and bboxes. Each dictionary contains the detection result for an image.
+        """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
         data = {

@@ -188,7 +210,8 @@ class GroundingSAM(Tool):
     r"""Grounding SAM is a tool that can detect and segment arbitrary objects with
     inputs such as category names or referring expressions.
 
-
+    Example
+    -------
         >>> import vision_agent as va
         >>> t = va.tools.GroundingSAM()
         >>> t(["red line", "yellow dot"], ct_scan1.jpg"])

@@ -234,7 +257,17 @@ class GroundingSAM(Tool):
         ],
     }
 
+    # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
+        """Invoke the Grounding SAM model.
+
+        Parameters:
+            prompt: a list of classes to segment.
+            image: the input image to segment.
+
+        Returns:
+            A list of dictionaries containing the labels, scores, bboxes and masks. Each dictionary contains the segmentation result for an image.
+        """
         image_size = get_image_size(image)
         image_b64 = convert_to_b64(image)
         data = {

@@ -260,8 +293,7 @@ class GroundingSAM(Tool):
             ret_pred["labels"].append(pred["label_name"])
             ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size))
             ret_pred["masks"].append(mask)
-
-        return ret_preds
+        return [ret_pred]
 
 
 class AgentGroundingSAM(GroundingSAM):

@@ -282,6 +314,8 @@ class AgentGroundingSAM(GroundingSAM):
 
 
 class Counter(Tool):
+    r"""Counter detects and counts the number of objects in an image given an input such as a category name or referring expression."""
+
     name = "counter_"
     description = "'counter_' detects and counts the number of objects in an image given an input such as a category name or referring expression."
     usage = {

@@ -307,6 +341,8 @@ class Counter(Tool):
 
 
 class Crop(Tool):
+    r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
+
     name = "crop_"
     description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image."
     usage = {

@@ -343,6 +379,8 @@ class Crop(Tool):
 
 
 class BboxArea(Tool):
+    r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places."""
+
     name = "bbox_area_"
     description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places."
     usage = {

@@ -371,6 +409,8 @@ class BboxArea(Tool):
 
 
 class SegArea(Tool):
+    r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places."""
+
     name = "seg_area_"
     description = "'seg_area_' returns the area of the segmentation mask in pixels normalized to 2 decimal places."
     usage = {

@@ -390,6 +430,8 @@ class SegArea(Tool):
 
 
 class Add(Tool):
+    r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "add_"
     description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -407,6 +449,8 @@ class Add(Tool):
 
 
 class Subtract(Tool):
+    r"""Subtract returns the difference of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "subtract_"
     description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -424,6 +468,8 @@ class Subtract(Tool):
 
 
 class Multiply(Tool):
+    r"""Multiply returns the product of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "multiply_"
     description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places."
     usage = {

@@ -441,6 +487,8 @@ class Multiply(Tool):
 
 
 class Divide(Tool):
+    r"""Divide returns the division of all the arguments passed to it, normalized to 2 decimal places."""
+
     name = "divide_"
     description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places."
     usage = {
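The tool __call__ docstrings above spell out the prompt formats and the list-of-dicts return shape, and the last GroundingSAM hunk changes its return value from ret_preds to [ret_pred], a one-element list holding the result for the single input image. A hedged usage sketch following those documented signatures, assuming the hosted tool endpoints these classes call are reachable and that "ct_scan1.jpg" exists locally (the filename is illustrative):

import vision_agent as va

# CLIP takes a list of classes or tags plus one image.
clip = va.tools.CLIP()
print(clip(["red line", "yellow dot"], "ct_scan1.jpg"))
# e.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}]

# Grounding DINO takes a period-separated string of class names.
dino = va.tools.GroundingDINO()
print(dino("red line. yellow dot", "ct_scan1.jpg"))
# a list with one dict of labels, scores and bboxes for the image

# Grounding SAM takes a list of classes; after this change it returns [ret_pred],
# a one-element list with the labels, bboxes and masks for the single input image.
sam = va.tools.GroundingSAM()
preds = sam(["red line", "yellow dot"], "ct_scan1.jpg")
print(preds[0].keys())

Per the new TODO comments, each call still accepts a single image even though the return type is already shaped as a list of per-image results.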