vision-agent 0.2.140__py3-none-any.whl → 0.2.142__py3-none-any.whl
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +65 -33
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA +60 -12
- vision_agent-0.2.142.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
```diff
@@ -149,6 +149,7 @@ def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
+    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owl_v2_image' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions on images. The categories in
```
```diff
@@ -160,6 +161,8 @@ def owl_v2_image(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
```
```diff
@@ -176,7 +179,38 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
     ]
     """
+
     image_size = image.shape[:2]
+
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+        detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+        bboxes_formatted = [
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
+            for i in range(len(detections["bboxes"]))
+        ]
+        return [bbox.model_dump() for bbox in bboxes_formatted]
+
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
```
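The hunk above is the heart of the 0.2.142 fine-tuning support: when `fine_tune_id` is given, `owl_v2_image` short-circuits to a fine-tuned Florence-2 phrase-grounding deployment instead of the stock OWLv2 endpoint, and raises `FineTuneModelIsNotReady` while the job is still training. A minimal usage sketch (the UUID is a placeholder, and the exception's import path from `vision_agent.utils.exceptions` is an assumption):

```python
import vision_agent.tools as T
from vision_agent.utils.exceptions import FineTuneModelIsNotReady  # assumed location

image = T.load_image("cereal.jpg")

try:
    # Routed through the fine-tuned Florence-2 model; scores come back fixed
    # at 1.0 because the grounding endpoint does not return confidences.
    dets = T.owl_v2_image(
        "cereal box",
        image,
        fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder job ID
    )
except FineTuneModelIsNotReady:
    dets = T.owl_v2_image("cereal box", image)  # fall back to the base model
```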
```diff
@@ -206,10 +240,10 @@ def owl_v2_video(
     box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-    objects per frame given a text prompt
-    expression
-
-    detections for that frame.
+    objects indepdently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in
+    text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
```
```diff
@@ -335,7 +369,9 @@ def grounding_sam(
     return return_data
 
 
-def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_sam2_image(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_sam2_image' is a tool that can segment multiple objects given a text
     prompt such as category names or referring expressions. The categories in the text
     prompt are separated by commas. It returns a list of bounding boxes, label names,
```
```diff
@@ -344,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
```
```diff
@@ -369,18 +407,52 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
         },
     ]
     """
-
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
 
+        req_data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(
+                job_id=UUID(fine_tune_id),
+                postprocessing="sam2",
+            ),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True)
+        detections_ft = send_inference_request(req_data, "tools", v2=False)
+        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        return_data = []
+        all_masks = np.array(detections_ft["masks"])
+        for i in range(len(detections_ft["bboxes"])):
+            return_data.append(
+                {
+                    "score": 1.0,
+                    "label": detections_ft["labels"][i],
+                    "bbox": detections_ft["bboxes"][i],
+                    "mask": all_masks[i, :, :].astype(np.uint8),
+                }
+            )
+        return return_data
+
+    buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_image",
     }
-
+    detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
     return_data = []
-    for _, data_i in
+    for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
         label = data_i["label"]
         bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
```
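`florence2_sam2_image` gets the same fine-tune gating, plus `postprocessing="sam2"` so the fine-tuned boxes come back with full SAM-2 masks as `np.uint8` arrays. A sketch of consuming that output (placeholder job ID; mask shape per the dict built above):

```python
import vision_agent.tools as T

image = T.load_image("defects.jpg")
segs = T.florence2_sam2_image(
    "scratch",
    image,
    fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder job ID
)

for seg in segs:
    mask = seg["mask"]  # np.uint8 array of 0/1 values, same H x W as the image
    print(seg["label"], seg["bbox"], f"area={int(mask.sum())}px")
```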
```diff
@@ -389,17 +461,19 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray]
+    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
     expressions. You can optionally separate the categories in the text with commas. It
-
-
+    can find new objects every 'chunk_length' frames and is useful for tracking and
+    counting without duplicating counts and always outputs scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
```
```diff
@@ -432,6 +506,8 @@ def florence2_sam2_video_tracking(
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video_tracking",
     }
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length  # type: ignore
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
```
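`chunk_length` is simply forwarded in the payload: the hosted pipeline re-runs Florence-2 detection every `chunk_length` frames and lets SAM-2 track in between, so objects entering mid-video are picked up without double counting. A sketch of counting distinct objects this way; note the assumption that returned label strings carry a stable per-track identity, which the docstring implies but does not spell out:

```python
import vision_agent.tools as T

frames_ts = T.extract_frames_and_timestamps("warehouse.mp4", fps=2)
frames = [f["frame"] for f in frames_ts]

# Re-detect every 25 frames; SAM-2 tracks identities between detections.
tracks = T.florence2_sam2_video_tracking("box", frames, chunk_length=25)

# Assumes each tracked object keeps one label string across frames.
unique = {det["label"] for frame_dets in tracks for det in frame_dets}
print(f"{len(unique)} distinct objects tracked")
```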
```diff
@@ -1119,13 +1195,13 @@ def florence2_phrase_grounding(
     return_data = []
     for i in range(len(detections["bboxes"])):
         return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
         )
-    return return_data
+    return [bbox.model_dump() for bbox in return_data]
 
 
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
```
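This rewrite funnels `florence2_phrase_grounding` results through the same `ODResponseData` pydantic model the new owl_v2 fine-tune path uses, so every detector emits an identical dict shape. Roughly how the model behaves (its import path and exact field types are inferred from usage here, not from its definition):

```python
from vision_agent.tools.tools_types import ODResponseData  # assumed location

det = ODResponseData(
    label="car",
    bbox=[0.2, 0.21, 0.45, 0.5],  # already normalized to [0, 1]
    score=1.0,
)
print(det.model_dump())
# -> {"label": "car", "bbox": [0.2, 0.21, 0.45, 0.5], "score": 1.0}
```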
```diff
@@ -1497,12 +1573,14 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
-def extract_frames(
+def extract_frames_and_timestamps(
     video_uri: Union[str, Path], fps: float = 1
-) -> List[Tuple[np.ndarray, float]]:
-    """'
-
-
+) -> List[Dict[str, Union[np.ndarray, float]]]:
+    """'extract_frames_and_timestamps' extracts frames and timestamps from a video
+    which can be a file path, url or youtube link, returns a list of dictionaries
+    with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is
+    the relative time in seconds where the frame was captured. The frame is a numpy
+    array.
 
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
```
```diff
@@ -1510,15 +1588,23 @@ def extract_frames(
         to 1.
 
     Returns:
-        List[
-        as a numpy array and the timestamp in seconds.
+        List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
+            extracted frame as a numpy array and the timestamp in seconds.
 
     Example
     -------
     >>> extract_frames("path/to/video.mp4")
-    [
+    [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """
 
+    def reformat(
+        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+    ) -> List[Dict[str, Union[np.ndarray, float]]]:
+        return [
+            {"frame": frame, "timestamp": timestamp}
+            for frame, timestamp in frames_and_timestamps
+        ]
+
     if str(video_uri).startswith(
         (
             "http://www.youtube.com/",
```
```diff
@@ -1540,16 +1626,16 @@ def extract_frames(
             raise Exception("No suitable video stream found")
         video_file_path = video.download(output_path=temp_dir)
 
-        return extract_frames_from_video(video_file_path, fps)
+        return reformat(extract_frames_from_video(video_file_path, fps))
     elif str(video_uri).startswith(("http", "https")):
         _, image_suffix = os.path.splitext(video_uri)
         with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
             # Download the video and save it to the temporary file
             with urllib.request.urlopen(str(video_uri)) as response:
                 tmp_file.write(response.read())
-        return extract_frames_from_video(tmp_file.name, fps)
+        return reformat(extract_frames_from_video(tmp_file.name, fps))
 
-    return extract_frames_from_video(str(video_uri), fps)
+    return reformat(extract_frames_from_video(str(video_uri), fps))
 
 
 def save_json(data: Any, file_path: str) -> None:
```
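Besides the rename, the return shape changes from `(frame, timestamp)` tuples to explicit dicts, produced by the nested `reformat` helper regardless of which branch (YouTube, plain URL, or local file) sourced the video. Consuming the new shape:

```python
import vision_agent.tools as T

frames_ts = T.extract_frames_and_timestamps("path/to/video.mp4", fps=1)

for item in frames_ts:
    frame = item["frame"]          # np.ndarray, H x W x C
    timestamp = item["timestamp"]  # float, seconds from the start of the video
    print(f"{timestamp:.1f}s -> frame {frame.shape}")
```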
```diff
@@ -1953,7 +2039,6 @@ FUNCTION_TOOLS = [
     vit_image_classification,
     vit_nsfw_classification,
     countgd_counting,
-    florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
```
```diff
@@ -1968,7 +2053,7 @@ FUNCTION_TOOLS = [
 ]
 
 UTIL_TOOLS = [
-    extract_frames,
+    extract_frames_and_timestamps,
     save_json,
     load_image,
     save_image,
```
vision_agent/tools/tools_types.py
CHANGED
```diff
@@ -28,6 +28,7 @@ class FineTuning(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     job_id: UUID = Field(alias="jobId")
+    postprocessing: Optional[str] = None
 
     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
```
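The new optional `postprocessing` field is what `florence2_sam2_image` sets to `"sam2"` above. Since requests dump with `by_alias=True`, the UUID serializes to a string under `jobId` (per the `Field(alias=...)` and `field_serializer` shown) while `postprocessing` keeps its name. A quick sketch of the expected wire shape:

```python
from uuid import UUID
from vision_agent.tools.tools_types import FineTuning  # assumed location

ft = FineTuning(
    job_id=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder job ID
    postprocessing="sam2",  # None (the default) skips SAM-2 mask generation
)
print(ft.model_dump(by_alias=True))
# -> {"jobId": "00000000-0000-0000-0000-000000000000", "postprocessing": "sam2"}
```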
vision_agent/utils/image_utils.py
CHANGED
```diff
@@ -5,7 +5,7 @@ import io
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
```
```diff
@@ -154,15 +154,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     )
 
 
-def encode_image_bytes(image: bytes) -> str:
-
+def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str:
+    if resize is not None:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
+        if image_pil.size[0] > resize or image_pil.size[1] > resize:
+            image_pil.thumbnail((resize, resize))
+    else:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
     buffer = io.BytesIO()
-
+    image_pil.save(buffer, format="PNG")
     encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return encoded_image
 
 
-def encode_media(media: Union[str, Path]) -> str:
+def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str:
     if isinstance(media, str) and media.startswith(("http", "https")):
         # for mp4 video url, we assume there is a same url but ends with png
         # vision-agent-ui will upload this png when uploading the video
```
```diff
@@ -192,11 +197,17 @@ def encode_media(media: Union[str, Path]) -> str:
         frames = extract_frames_from_video(str(media), fps=1)
         image = frames[len(frames) // 2]
         buffer = io.BytesIO()
-
+        if resize is not None:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+            if image_pil.size[0] > resize or image_pil.size[1] > resize:
+                image_pil.thumbnail((resize, resize))
+        else:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+        image_pil.save(buffer, format="PNG")
         image_bytes = buffer.getvalue()
     else:
         image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
+    return encode_image_bytes(image_bytes, resize=resize)
 
 
 def denormalize_bbox(
```
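Both branches rely on `Image.thumbnail`, which resizes in place, never upscales, and preserves aspect ratio, so `resize` caps the longest side rather than forcing a square output. A standalone check of that Pillow behavior:

```python
from PIL import Image

large = Image.new("RGB", (1920, 1080))
large.thumbnail((768, 768))  # in-place; keeps aspect ratio
print(large.size)            # (768, 432): longest side capped at 768

small = Image.new("RGB", (512, 256))
small.thumbnail((768, 768))  # thumbnail never enlarges
print(small.size)            # (512, 256): unchanged
```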
vision_agent/utils/video.py
CHANGED
```diff
@@ -61,6 +61,7 @@ def video_writer(
     stream.height = height - (height % 2)
     stream.width = width - (width % 2)
     stream.pix_fmt = "yuv420p"
+    stream.options = {"crf": "10"}
    for frame in frames:
         # Remove the alpha channel (convert RGBA to RGB)
         frame_rgb = frame[:, :, :3]
```
```diff
@@ -77,7 +78,7 @@ def video_writer(
 
 
 def frames_to_bytes(
-    frames: List[np.ndarray], fps: float =
+    frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
 ) -> bytes:
     r"""Convert a list of frames to a video file encoded into a byte string.
 
```
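The `crf` option (constant rate factor) sets x264's quality/size trade-off; 10 is near-lossless versus the encoder default of 23, which keeps overlaid boxes and text crisp after encoding. A hedged sketch of the PyAV pattern these two functions wrap, not the library's exact code:

```python
import io
import av
import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

buf = io.BytesIO()
container = av.open(buf, mode="w", format="mp4")
stream = container.add_stream("h264", rate=1)  # matches the new 1.0 fps default
stream.height, stream.width = 480, 640
stream.pix_fmt = "yuv420p"
stream.options = {"crf": "10"}  # lower CRF = higher quality, larger output

for arr in frames:
    frame = av.VideoFrame.from_ndarray(arr, format="rgb24")
    for packet in stream.encode(frame):
        container.mux(packet)
for packet in stream.encode():  # flush buffered packets
    container.mux(packet)
container.close()

video_bytes = buf.getvalue()
```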
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.140
+Version: 0.2.142
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
```
````diff
@@ -74,10 +74,11 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```
 
-Ensure you have an OpenAI API key and set it in your environment variables (if you are
-using Azure OpenAI please see the Azure setup section):
+Ensure you have an Anthropic key and an OpenAI API key and set in your environment
+variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
+export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
````
````diff
@@ -112,6 +113,9 @@ You can find more details about the streamlit app [here](examples/chat/).
 >>> resp = agent(resp)
 ```
 
+`VisionAgent` currently utilizes Claude-3.5 as it's default LMM and uses OpenAI for
+embeddings for tool searching.
+
 ### Vision Agent Coder
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
````
````diff
@@ -173,7 +177,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
     "code": "from vision_agent.tools import ..."
     "test": "calculate_filled_percentage('jar.jpg')",
     "test_result": "...",
-    "
+    "plans": {"plan1": {"thoughts": "..."}, ...},
+    "plan_thoughts": "...",
     "working_memory": ...,
 }
 ```
````
````diff
@@ -210,20 +215,25 @@ result = agent.chat_with_workflow(conv)
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you. You can easily access them yourself, for example if
-you want to run `
+you want to run `owl_v2_image` and visualize the output you can run:
 
 ```python
 import vision_agent.tools as T
 import matplotlib.pyplot as plt
 
 image = T.load_image("dogs.jpg")
-dets = T.
+dets = T.owl_v2_image("dogs", image)
 viz = T.overlay_bounding_boxes(image, dets)
 plt.imshow(viz)
 plt.show()
 ```
 
-You can
+You can find all available tools in `vision_agent/tools/tools.py`, however,
+`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
+the best performance. Those can be found in the same file under the `TOOLS` variable.
+
+If you can't find the tool you are looking for you can also add custom tools to the
+agent:
 
 ```python
 import vision_agent as va
````
````diff
@@ -258,9 +268,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our
 we add the source code for all the tools used in `VisionAgent`.
 
 ## Additional Backends
+### Anthropic
+`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
+Anthropic API key and set it in your environment variables:
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+```
+
+Because Anthropic does not support embedding models, the default embedding model used
+is the OpenAI model so you will also need to set your OpenAI API key:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AnthropicVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+### OpenAI
+`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API
+key and set it in your environment variables:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OpenAIVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+
 ### Ollama
-
-a few models:
+`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
 ollama pull llama3.1
````
````diff
@@ -281,9 +330,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
 > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
 
 ### Azure OpenAI
-
-
-`VisionAgentCoder`:
+`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
+section below. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
````
vision_agent-0.2.142.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,33 @@
+vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
+vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
+vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
+vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
+vision_agent/agent/vision_agent.py,sha256=k1bUmvoz0KjVEu62PYA9djnq3pqzv2S1UsW6gLnTd7w,17023
+vision_agent/agent/vision_agent_coder.py,sha256=4bbebV1sKE10vsxcZR-R8P54X2HjLeU9lDt7ylIZAT4,38429
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=YWK4C--YRS1Kuab11Gn-AXBzar1j_GNnTnxi_nnaPRY,14901
+vision_agent/agent/vision_agent_prompts.py,sha256=e_ASPeRFU1yZsQhCkK_bIBG-eyIWyWXmN64lFk-r7e0,10897
+vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
+vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
+vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
+vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
+vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
+vision_agent/tools/meta_tools.py,sha256=XO5Ahe5ZauomynxgDcBuzmm0ocXwTnmZ0wjfgvOzDWc,23426
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
+vision_agent/tools/tools.py,sha256=dD_8AmAQb0oKVZHg2w2kSKlvWrG9yaKRbaHTz_kHgjA,73648
+vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
+vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
+vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
+vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
+vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
+vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
+vision_agent-0.2.142.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.142.dist-info/METADATA,sha256=yP7ShheLQ_a50CME1rbSUifRlc4ylqmM6PeIKflW9Ig,13758
+vision_agent-0.2.142.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.142.dist-info/RECORD,,
```
vision_agent-0.2.140.dist-info/RECORD
REMOVED
```diff
@@ -1,33 +0,0 @@
-vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
-vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
-vision_agent/agent/vision_agent.py,sha256=Ed10_rWzHu-hejb5jF9lAF7xbmQ_qAGpCxDvByZw6M8,14100
-vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
-vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
-vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
-vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
-vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
-vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
-vision_agent/tools/meta_tools.py,sha256=orYbEPWOENXwmKSmbg52_2eMAoYT9ZbV5GjudUd-f0o,22563
-vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
-vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
-vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
-vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
-vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
-vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
-vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
-vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
-vision_agent-0.2.140.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.140.dist-info/METADATA,sha256=B33v0XI-5ZlEBBu-I8DT7JrbU04PophTYEmRQMVEkBQ,12291
-vision_agent-0.2.140.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.140.dist-info/RECORD,,
```
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE
File without changes
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL
File without changes