vision-agent 0.2.140__py3-none-any.whl → 0.2.141__py3-none-any.whl
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +64 -32
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/METADATA +60 -12
- vision_agent-0.2.141.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -149,6 +149,7 @@ def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
+    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owl_v2_image' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions on images. The categories in
@@ -160,6 +161,8 @@ def owl_v2_image(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -176,7 +179,38 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
     ]
     """
+
     image_size = image.shape[:2]
+
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+        detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+        bboxes_formatted = [
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
+            for i in range(len(detections["bboxes"]))
+        ]
+        return [bbox.model_dump() for bbox in bboxes_formatted]
+
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
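Taken together, the hunk above routes `owl_v2_image` through a fine-tuned Florence-2 model whenever a job ID is supplied. A minimal usage sketch, with a hypothetical image file and a placeholder job ID:

```python
import vision_agent.tools as T

image = T.load_image("dogs.jpg")  # hypothetical local image

# Default path: the hosted owl_v2 model.
dets = T.owl_v2_image("dog", image)

# Fine-tuned path (placeholder job ID); raises FineTuneModelIsNotReady
# unless the fine-tuning job has reached SUCCEEDED.
dets_ft = T.owl_v2_image(
    "dog", image, fine_tune_id="00000000-0000-0000-0000-000000000000"
)
```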
@@ -206,10 +240,10 @@ def owl_v2_video(
     box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-    objects per frame given a text prompt
-    expression
-
-    detections for that frame.
+    objects independently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in
+    the text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.

     Parameters:
         prompt (str): The prompt to ground to the video.
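Since the rewritten docstring stresses that detections are independent per frame, a counting sketch should avoid summing across frames; the file name below is hypothetical:

```python
import vision_agent.tools as T

frames_and_ts = T.extract_frames_and_timestamps("street.mp4", fps=1)  # hypothetical file
frames = [d["frame"] for d in frames_and_ts]

dets_per_frame = T.owl_v2_video("car", frames)
# No track IDs link objects across frames, so report a per-frame maximum
# rather than a sum when counting.
max_cars = max(len(dets) for dets in dets_per_frame)
```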
@@ -335,7 +369,9 @@ def grounding_sam(
     return return_data


-def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_sam2_image(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_sam2_image' is a tool that can segment multiple objects given a text
     prompt such as category names or referring expressions. The categories in the text
     prompt are separated by commas. It returns a list of bounding boxes, label names,
@@ -344,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
@@ -369,18 +407,52 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
         },
     ]
     """
-
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )

+        req_data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(
+                job_id=UUID(fine_tune_id),
+                postprocessing="sam2",
+            ),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True)
+        detections_ft = send_inference_request(req_data, "tools", v2=False)
+        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        return_data = []
+        all_masks = np.array(detections_ft["masks"])
+        for i in range(len(detections_ft["bboxes"])):
+            return_data.append(
+                {
+                    "score": 1.0,
+                    "label": detections_ft["labels"][i],
+                    "bbox": detections_ft["bboxes"][i],
+                    "mask": all_masks[i, :, :].astype(np.uint8),
+                }
+            )
+        return return_data
+
+    buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_image",
     }
-
+    detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
     return_data = []
-    for _, data_i in
+    for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
         label = data_i["label"]
         bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
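The same fine-tuning hook lands in `florence2_sam2_image`, with `postprocessing="sam2"` asking the service to run SAM2 over the fine-tuned Florence-2 boxes so masks come back alongside them. A usage sketch (image file and job ID are hypothetical placeholders):

```python
import vision_agent.tools as T

image = T.load_image("flowers.jpg")  # hypothetical local image
segs = T.florence2_sam2_image(
    "flower", image, fine_tune_id="00000000-0000-0000-0000-000000000000"
)
# Each detection carries a bounding box plus a binary uint8 mask from the
# SAM2 postprocessing step.
print(segs[0]["bbox"], segs[0]["mask"].shape)
```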
@@ -389,17 +461,19 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]


 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray]
+    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
     expressions. You can optionally separate the categories in the text with commas. It
-
-
+    can find new objects every 'chunk_length' frames and is useful for tracking and
+    counting without duplicating counts and always outputs scores of 1.0.

     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -432,6 +506,8 @@ def florence2_sam2_video_tracking(
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video_tracking",
     }
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length  # type: ignore
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
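A sketch of what `chunk_length` buys you, per the docstring above: with the default, objects are only found by the initial detection pass, while setting it re-runs Florence-2 every N frames so objects that enter mid-video are picked up without double counting. The file name and chunk size are hypothetical:

```python
import vision_agent.tools as T

frames_and_ts = T.extract_frames_and_timestamps("checkout.mp4", fps=5)  # hypothetical
frames = [d["frame"] for d in frames_and_ts]

# Re-run detection every 25 frames (every 5 s at 5 fps); existing tracks keep
# their identities, so newly found objects are only counted once.
tracks = T.florence2_sam2_video_tracking("shopping bag", frames, chunk_length=25)
```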
@@ -1119,13 +1195,13 @@ def florence2_phrase_grounding(
     return_data = []
     for i in range(len(detections["bboxes"])):
         return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
         )
-    return return_data
+    return [bbox.model_dump() for bbox in return_data]


 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
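For context, a minimal sketch of the pattern this hunk adopts: construct pydantic models for validation, then `model_dump` them back to plain dicts for the tool's return value. The field types below are assumptions for illustration, not the package's actual `ODResponseData` definition:

```python
from typing import List

from pydantic import BaseModel


class ODResponseData(BaseModel):  # simplified stand-in; fields assumed
    label: str
    bbox: List[float]
    score: float


boxes = [ODResponseData(label="dog", bbox=[0.1, 0.2, 0.4, 0.5], score=1.0)]
print([b.model_dump() for b in boxes])
# [{'label': 'dog', 'bbox': [0.1, 0.2, 0.4, 0.5], 'score': 1.0}]
```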
@@ -1497,12 +1573,14 @@ def closest_box_distance(
 # Utility and visualization functions


-def extract_frames(
+def extract_frames_and_timestamps(
     video_uri: Union[str, Path], fps: float = 1
-) -> List[Tuple[np.ndarray, float]]:
-    """'
-
-
+) -> List[Dict[str, Union[np.ndarray, float]]]:
+    """'extract_frames_and_timestamps' extracts frames and timestamps from a video
+    which can be a file path, url or youtube link, returns a list of dictionaries
+    with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is
+    the relative time in seconds where the frame was captured. The frame is a numpy
+    array.

     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
@@ -1510,15 +1588,23 @@ def extract_frames(
             to 1.

     Returns:
-        List[
-        as a numpy array and the timestamp in seconds.
+        List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
+            extracted frame as a numpy array and the timestamp in seconds.

     Example
     -------
     >>> extract_frames("path/to/video.mp4")
-    [
+    [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """

+    def reformat(
+        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+    ) -> List[Dict[str, Union[np.ndarray, float]]]:
+        return [
+            {"frame": frame, "timestamp": timestamp}
+            for frame, timestamp in frames_and_timestamps
+        ]
+
     if str(video_uri).startswith(
         (
             "http://www.youtube.com/",
@@ -1540,16 +1626,16 @@ def extract_frames(
             raise Exception("No suitable video stream found")
         video_file_path = video.download(output_path=temp_dir)

-        return extract_frames_from_video(video_file_path, fps)
+        return reformat(extract_frames_from_video(video_file_path, fps))
     elif str(video_uri).startswith(("http", "https")):
         _, image_suffix = os.path.splitext(video_uri)
         with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
             # Download the video and save it to the temporary file
             with urllib.request.urlopen(str(video_uri)) as response:
                 tmp_file.write(response.read())
-        return extract_frames_from_video(tmp_file.name, fps)
+        return reformat(extract_frames_from_video(tmp_file.name, fps))

-    return extract_frames_from_video(str(video_uri), fps)
+    return reformat(extract_frames_from_video(str(video_uri), fps))


 def save_json(data: Any, file_path: str) -> None:
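A usage sketch of the renamed tool and its new return shape (the path is hypothetical):

```python
import vision_agent.tools as T

# Accepts a local path, an http(s) URL, or a YouTube link.
frames_and_ts = T.extract_frames_and_timestamps("path/to/video.mp4", fps=1)
for d in frames_and_ts:
    print(d["timestamp"], d["frame"].shape)  # seconds offset, HxWxC numpy frame
```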
@@ -1953,7 +2039,6 @@ FUNCTION_TOOLS = [
     vit_image_classification,
     vit_nsfw_classification,
     countgd_counting,
-    florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
@@ -1968,7 +2053,7 @@ FUNCTION_TOOLS = [
 ]

 UTIL_TOOLS = [
-    extract_frames,
+    extract_frames_and_timestamps,
     save_json,
     load_image,
     save_image,
vision_agent/tools/tools_types.py
CHANGED
@@ -28,6 +28,7 @@ class FineTuning(BaseModel):
     model_config = ConfigDict(populate_by_name=True)

     job_id: UUID = Field(alias="jobId")
+    postprocessing: Optional[str] = None

     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
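A runnable sketch of how the new `postprocessing` field serializes, using a simplified stand-in for the model above (pydantic v2; the job ID is a placeholder):

```python
from typing import Optional
from uuid import UUID

from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer


class FineTuning(BaseModel):  # simplified stand-in for the model above
    model_config = ConfigDict(populate_by_name=True)

    job_id: UUID = Field(alias="jobId")
    postprocessing: Optional[str] = None

    @field_serializer("job_id")
    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
        return str(job_id)


ft = FineTuning(job_id=UUID(int=0), postprocessing="sam2")
print(ft.model_dump(by_alias=True))
# {'jobId': '00000000-0000-0000-0000-000000000000', 'postprocessing': 'sam2'}
```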
vision_agent/utils/image_utils.py
CHANGED
@@ -5,7 +5,7 @@ import io
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -154,15 +154,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     )


-def encode_image_bytes(image: bytes) -> str:
-
+def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str:
+    if resize is not None:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
+        if image_pil.size[0] > resize or image_pil.size[1] > resize:
+            image_pil.thumbnail((resize, resize))
+    else:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
     buffer = io.BytesIO()
-
+    image_pil.save(buffer, format="PNG")
     encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return encoded_image


-def encode_media(media: Union[str, Path]) -> str:
+def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str:
     if isinstance(media, str) and media.startswith(("http", "https")):
         # for mp4 video url, we assume there is a same url but ends with png
         # vision-agent-ui will upload this png when uploading the video
@@ -192,11 +197,17 @@ def encode_media(media: Union[str, Path]) -> str:
         frames = extract_frames_from_video(str(media), fps=1)
         image = frames[len(frames) // 2]
         buffer = io.BytesIO()
-
+        if resize is not None:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+            if image_pil.size[0] > resize or image_pil.size[1] > resize:
+                image_pil.thumbnail((resize, resize))
+        else:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+        image_pil.save(buffer, format="PNG")
         image_bytes = buffer.getvalue()
     else:
         image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
+    return encode_image_bytes(image_bytes, resize=resize)


 def denormalize_bbox(
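A small sketch of the resize behavior added to `encode_image_bytes` and `encode_media`: `PIL.Image.thumbnail` shrinks in place, preserves aspect ratio, and the guard above only triggers it when a side exceeds the limit:

```python
import io

from PIL import Image

img = Image.new("RGB", (1920, 1080))  # stand-in for a decoded frame
img.thumbnail((768, 768))             # caps the longer side at 768
print(img.size)                       # (768, 432)

buffer = io.BytesIO()
img.save(buffer, format="PNG")        # PNG bytes, ready for base64 encoding
print(len(buffer.getvalue()) > 0)
```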
vision_agent/utils/video.py
CHANGED
@@ -61,6 +61,7 @@ def video_writer(
     stream.height = height - (height % 2)
     stream.width = width - (width % 2)
     stream.pix_fmt = "yuv420p"
+    stream.options = {"crf": "10"}
     for frame in frames:
         # Remove the alpha channel (convert RGBA to RGB)
         frame_rgb = frame[:, :, :3]
@@ -77,7 +78,7 @@ def video_writer(


 def frames_to_bytes(
-    frames: List[np.ndarray], fps: float =
+    frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
 ) -> bytes:
     r"""Convert a list of frames to a video file encoded into a byte string.

{vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.140
+Version: 0.2.141
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -74,10 +74,11 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```

-Ensure you have an OpenAI API key and set
-using Azure OpenAI please see the Azure setup section):
+Ensure you have an Anthropic key and an OpenAI API key and set them in your environment
+variables (if you are using Azure OpenAI please see the Azure setup section):

 ```bash
+export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```

@@ -112,6 +113,9 @@ You can find more details about the streamlit app [here](examples/chat/).
 >>> resp = agent(resp)
 ```

+`VisionAgent` currently utilizes Claude-3.5 as its default LMM and uses OpenAI
+embeddings for tool searching.
+
 ### Vision Agent Coder
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
@@ -173,7 +177,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `media`
     "code": "from vision_agent.tools import ..."
     "test": "calculate_filled_percentage('jar.jpg')",
     "test_result": "...",
-    "
+    "plans": {"plan1": {"thoughts": "..."}, ...},
+    "plan_thoughts": "...",
     "working_memory": ...,
 }
 ```
@@ -210,20 +215,25 @@ result = agent.chat_with_workflow(conv)
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you. You can easily access them yourself, for example if
-you want to run `
+you want to run `owl_v2_image` and visualize the output you can run:

 ```python
 import vision_agent.tools as T
 import matplotlib.pyplot as plt

 image = T.load_image("dogs.jpg")
-dets = T.
+dets = T.owl_v2_image("dogs", image)
 viz = T.overlay_bounding_boxes(image, dets)
 plt.imshow(viz)
 plt.show()
 ```

-You can
+You can find all available tools in `vision_agent/tools/tools.py`, however,
+`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
+the best performance. Those can be found in the same file under the `TOOLS` variable.
+
+If you can't find the tool you are looking for you can also add custom tools to the
+agent:

 ```python
 import vision_agent as va
@@ -258,9 +268,48 @@ Can't find the tool you need and want to add it to `VisionAgent`? Check out our
 we add the source code for all the tools used in `VisionAgent`.

 ## Additional Backends
+### Anthropic
+`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
+Anthropic API key and set it in your environment variables:
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+```
+
+Because Anthropic does not support embedding models, the default embedding model used
+is the OpenAI model so you will also need to set your OpenAI API key:
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AnthropicVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+### OpenAI
+`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API
+key and set it in your environment variables:
+
+```bash
+export OPENAI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OpenAIVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+
 ### Ollama
-
-a few models:
+`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:

 ```bash
 ollama pull llama3.1
@@ -281,9 +330,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
 > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.

 ### Azure OpenAI
-
-
-`VisionAgentCoder`:
+`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
+section below. You can use it just like you would use `VisionAgentCoder`:

 ```python
 >>> import vision_agent as va
vision_agent-0.2.141.dist-info/RECORD
ADDED
@@ -0,0 +1,33 @@
+vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
+vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
+vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
+vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
+vision_agent/agent/vision_agent.py,sha256=k1bUmvoz0KjVEu62PYA9djnq3pqzv2S1UsW6gLnTd7w,17023
+vision_agent/agent/vision_agent_coder.py,sha256=4bbebV1sKE10vsxcZR-R8P54X2HjLeU9lDt7ylIZAT4,38429
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=YWK4C--YRS1Kuab11Gn-AXBzar1j_GNnTnxi_nnaPRY,14901
+vision_agent/agent/vision_agent_prompts.py,sha256=e_ASPeRFU1yZsQhCkK_bIBG-eyIWyWXmN64lFk-r7e0,10897
+vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
+vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
+vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
+vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
+vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
+vision_agent/tools/meta_tools.py,sha256=Df-e_Xak0M1EuRkFPtPJFsC0eRW19JgVqfBv84asJEc,23408
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
+vision_agent/tools/tools.py,sha256=dD_8AmAQb0oKVZHg2w2kSKlvWrG9yaKRbaHTz_kHgjA,73648
+vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
+vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
+vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
+vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
+vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
+vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
+vision_agent-0.2.141.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.141.dist-info/METADATA,sha256=qiqGj2EoTspKwlhXeU6bRsPBYXkyi-mNvgO0JD1NzMw,13758
+vision_agent-0.2.141.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.141.dist-info/RECORD,,
vision_agent-0.2.140.dist-info/RECORD
REMOVED
@@ -1,33 +0,0 @@
-vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
-vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
-vision_agent/agent/vision_agent.py,sha256=Ed10_rWzHu-hejb5jF9lAF7xbmQ_qAGpCxDvByZw6M8,14100
-vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
-vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
-vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
-vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
-vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
-vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
-vision_agent/tools/meta_tools.py,sha256=orYbEPWOENXwmKSmbg52_2eMAoYT9ZbV5GjudUd-f0o,22563
-vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
-vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
-vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
-vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
-vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
-vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
-vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
-vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
-vision_agent-0.2.140.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.140.dist-info/METADATA,sha256=B33v0XI-5ZlEBBu-I8DT7JrbU04PophTYEmRQMVEkBQ,12291
-vision_agent-0.2.140.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.140.dist-info/RECORD,,
{vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/LICENSE
File without changes

{vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/WHEEL
File without changes