vision-agent 0.2.117__tar.gz → 0.2.118__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.117 → vision_agent-0.2.118}/PKG-INFO +1 -1
- {vision_agent-0.2.117 → vision_agent-0.2.118}/pyproject.toml +1 -1
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent_coder.py +45 -22
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/lmm/lmm.py +11 -4
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/__init__.py +1 -1
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/tools.py +35 -13
- {vision_agent-0.2.117 → vision_agent-0.2.118}/LICENSE +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/README.md +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/utils/video.py +0 -0
@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
|
|
718
718
|
for chat_i in chat:
|
719
719
|
if "media" in chat_i:
|
720
720
|
for media in chat_i["media"]:
|
721
|
-
media = code_interpreter.upload_file(media)
|
721
|
+
media = (
|
722
|
+
media
|
723
|
+
if type(media) is str
|
724
|
+
and media.startswith(("http", "https"))
|
725
|
+
else code_interpreter.upload_file(media)
|
726
|
+
)
|
722
727
|
chat_i["content"] += f" Media name {media}" # type: ignore
|
723
728
|
media_list.append(media)
|
724
729
|
|
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
|
|
744
749
|
results = {"code": "", "test": "", "plan": []}
|
745
750
|
plan = []
|
746
751
|
success = False
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
"log_content": "Creating plans",
|
751
|
-
"status": "started",
|
752
|
-
}
|
753
|
-
)
|
754
|
-
plans = write_plans(
|
755
|
-
int_chat,
|
756
|
-
T.get_tool_descriptions_by_names(
|
757
|
-
customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
758
|
-
),
|
759
|
-
format_memory(working_memory),
|
760
|
-
self.planner,
|
752
|
+
|
753
|
+
plans = self._create_plans(
|
754
|
+
int_chat, customized_tool_names, working_memory, self.planner
|
761
755
|
)
|
762
756
|
|
763
|
-
if
|
764
|
-
|
765
|
-
|
766
|
-
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
767
|
-
_LOGGER.info(
|
768
|
-
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
769
|
-
)
|
757
|
+
if test_multi_plan:
|
758
|
+
self._log_plans(plans, self.verbosity)
|
759
|
+
|
770
760
|
tool_infos = retrieve_tools(
|
771
761
|
plans,
|
772
762
|
self.tool_recommender,
|
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
|
|
860
850
|
if self.report_progress_callback is not None:
|
861
851
|
self.report_progress_callback(data)
|
862
852
|
|
853
|
+
def _create_plans(
|
854
|
+
self,
|
855
|
+
int_chat: List[Message],
|
856
|
+
customized_tool_names: Optional[List[str]],
|
857
|
+
working_memory: List[Dict[str, str]],
|
858
|
+
planner: LMM,
|
859
|
+
) -> Dict[str, Any]:
|
860
|
+
self.log_progress(
|
861
|
+
{
|
862
|
+
"type": "log",
|
863
|
+
"log_content": "Creating plans",
|
864
|
+
"status": "started",
|
865
|
+
}
|
866
|
+
)
|
867
|
+
plans = write_plans(
|
868
|
+
int_chat,
|
869
|
+
T.get_tool_descriptions_by_names(
|
870
|
+
customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
871
|
+
),
|
872
|
+
format_memory(working_memory),
|
873
|
+
planner,
|
874
|
+
)
|
875
|
+
return plans
|
876
|
+
|
877
|
+
def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
|
878
|
+
if verbosity >= 1:
|
879
|
+
for p in plans:
|
880
|
+
# tabulate will fail if the keys are not the same for all elements
|
881
|
+
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
882
|
+
_LOGGER.info(
|
883
|
+
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
884
|
+
)
|
885
|
+
|
863
886
|
|
864
887
|
class OllamaVisionAgentCoder(VisionAgentCoder):
|
865
888
|
"""VisionAgentCoder that uses Ollama models for planning, coding, testing.
|
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
|
|
30
30
|
|
31
31
|
|
32
32
|
def encode_media(media: Union[str, Path]) -> str:
|
33
|
+
if type(media) is str and media.startswith(("http", "https")):
|
34
|
+
# for mp4 video url, we assume there is a same url but ends with png
|
35
|
+
# vision-agent-ui will upload this png when uploading the video
|
36
|
+
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
|
37
|
+
return media[:-4] + ".png"
|
38
|
+
return media
|
33
39
|
extension = "png"
|
34
40
|
extension = Path(media).suffix
|
35
41
|
if extension.lower() not in {
|
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
|
|
138
144
|
{
|
139
145
|
"type": "image_url",
|
140
146
|
"image_url": {
|
141
|
-
"url": f"data:image/png;base64,{encoded_media}",
|
147
|
+
"url": (
|
148
|
+
encoded_media
|
149
|
+
if encoded_media.startswith(("http", "https"))
|
150
|
+
else f"data:image/png;base64,{encoded_media}"
|
151
|
+
),
|
142
152
|
"detail": "low",
|
143
153
|
},
|
144
154
|
},
|
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
|
|
390
400
|
tmp_kwargs = self.kwargs | kwargs
|
391
401
|
data.update(tmp_kwargs)
|
392
402
|
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
393
|
-
|
394
403
|
json_data = json.dumps(data)
|
395
404
|
|
396
405
|
def f() -> Iterator[Optional[str]]:
|
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
|
|
424
433
|
media: Optional[List[Union[str, Path]]] = None,
|
425
434
|
**kwargs: Any,
|
426
435
|
) -> Union[str, Iterator[Optional[str]]]:
|
427
|
-
|
428
436
|
url = f"{self.url}/generate"
|
429
437
|
data: Dict[str, Any] = {
|
430
438
|
"model": self.model_name,
|
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
|
|
439
447
|
tmp_kwargs = self.kwargs | kwargs
|
440
448
|
data.update(tmp_kwargs)
|
441
449
|
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
442
|
-
|
443
450
|
json_data = json.dumps(data)
|
444
451
|
|
445
452
|
def f() -> Iterator[Optional[str]]:
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import os
|
1
2
|
import io
|
2
3
|
import json
|
3
4
|
import logging
|
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
|
|
14
15
|
from PIL import Image, ImageDraw, ImageFont
|
15
16
|
from pillow_heif import register_heif_opener # type: ignore
|
16
17
|
from pytube import YouTube # type: ignore
|
18
|
+
import urllib.request
|
17
19
|
|
18
20
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
19
21
|
from vision_agent.tools.tool_utils import (
|
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
|
|
760
762
|
return answer[task] # type: ignore
|
761
763
|
|
762
764
|
|
763
|
-
def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
|
764
|
-
"""'
|
765
|
-
objects given a text prompt
|
766
|
-
can optionally separate the
|
765
|
+
def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
|
766
|
+
"""'florence2_phrase_grounding' is a tool that can detect multiple
|
767
|
+
objects given a text prompt which can be object names or caption. You
|
768
|
+
can optionally separate the object names in the text with commas. It returns a list
|
767
769
|
of bounding boxes with normalized coordinates, label names and associated
|
768
770
|
probability scores of 1.0.
|
769
771
|
|
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
|
|
780
782
|
|
781
783
|
Example
|
782
784
|
-------
|
783
|
-
>>> florence2_object_detection('person looking at a coyote', image)
|
785
|
+
>>> florence2_phrase_grounding('person looking at a coyote', image)
|
784
786
|
[
|
785
787
|
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
786
788
|
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
|
|
792
794
|
"image": image_b64,
|
793
795
|
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
794
796
|
"prompt": prompt,
|
795
|
-
"function_name": "florence2_object_detection",
|
797
|
+
"function_name": "florence2_phrase_grounding",
|
796
798
|
}
|
797
799
|
|
798
800
|
detections = send_inference_request(data, "florence2", v2=True)
|
@@ -1220,6 +1222,13 @@ def extract_frames(
|
|
1220
1222
|
video_file_path = video.download(output_path=temp_dir)
|
1221
1223
|
|
1222
1224
|
return extract_frames_from_video(video_file_path, fps)
|
1225
|
+
elif str(video_uri).startswith(("http", "https")):
|
1226
|
+
_, image_suffix = os.path.splitext(video_uri)
|
1227
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
|
1228
|
+
# Download the video and save it to the temporary file
|
1229
|
+
with urllib.request.urlopen(str(video_uri)) as response:
|
1230
|
+
tmp_file.write(response.read())
|
1231
|
+
return extract_frames_from_video(tmp_file.name, fps)
|
1223
1232
|
|
1224
1233
|
return extract_frames_from_video(str(video_uri), fps)
|
1225
1234
|
|
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
|
|
1250
1259
|
|
1251
1260
|
|
1252
1261
|
def load_image(image_path: str) -> np.ndarray:
|
1253
|
-
"""'load_image' is a utility function that loads an image from the given file path string.
|
1262
|
+
"""'load_image' is a utility function that loads an image from the given file path string or an URL.
|
1254
1263
|
|
1255
1264
|
Parameters:
|
1256
|
-
image_path (str): The path to the image.
|
1265
|
+
image_path (str): The path or URL to the image.
|
1257
1266
|
|
1258
1267
|
Returns:
|
1259
1268
|
np.ndarray: The image as a NumPy array.
|
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
|
|
1265
1274
|
# NOTE: sometimes the generated code pass in a NumPy array
|
1266
1275
|
if isinstance(image_path, np.ndarray):
|
1267
1276
|
return image_path
|
1277
|
+
if image_path.startswith(("http", "https")):
|
1278
|
+
_, image_suffix = os.path.splitext(image_path)
|
1279
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
|
1280
|
+
# Download the image and save it to the temporary file
|
1281
|
+
with urllib.request.urlopen(image_path) as response:
|
1282
|
+
tmp_file.write(response.read())
|
1283
|
+
image_path = tmp_file.name
|
1268
1284
|
image = Image.open(image_path).convert("RGB")
|
1269
1285
|
return np.array(image)
|
1270
1286
|
|
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
|
|
1418
1434
|
medias: Union[np.ndarray, List[np.ndarray]],
|
1419
1435
|
masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
|
1420
1436
|
draw_label: bool = True,
|
1437
|
+
secondary_label_key: str = "tracking_label",
|
1421
1438
|
) -> Union[np.ndarray, List[np.ndarray]]:
|
1422
1439
|
"""'overlay_segmentation_masks' is a utility function that displays segmentation
|
1423
1440
|
masks.
|
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
|
|
1426
1443
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
|
1427
1444
|
the masks on.
|
1428
1445
|
masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
|
1429
|
-
dictionaries containing the masks.
|
1446
|
+
dictionaries containing the masks, labels and scores.
|
1447
|
+
draw_label (bool, optional): If True, the labels will be displayed on the image.
|
1448
|
+
secondary_label_key (str, optional): The key to use for the secondary
|
1449
|
+
tracking label which is needed in videos to display tracking information.
|
1430
1450
|
|
1431
1451
|
Returns:
|
1432
1452
|
np.ndarray: The image with the masks displayed.
|
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
|
|
1471
1491
|
for elt in masks_int[i]:
|
1472
1492
|
mask = elt["mask"]
|
1473
1493
|
label = elt["label"]
|
1494
|
+
tracking_lbl = elt.get(secondary_label_key, None)
|
1474
1495
|
np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
|
1475
1496
|
np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
|
1476
1497
|
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
|
|
1478
1499
|
|
1479
1500
|
if draw_label:
|
1480
1501
|
draw = ImageDraw.Draw(pil_image)
|
1481
|
-
|
1502
|
+
text = tracking_lbl if tracking_lbl else label
|
1503
|
+
text_box = draw.textbbox((0, 0), text=text, font=font)
|
1482
1504
|
x, y = _get_text_coords_from_mask(
|
1483
1505
|
mask,
|
1484
1506
|
v_gap=(text_box[3] - text_box[1]) + 10,
|
1485
1507
|
h_gap=(text_box[2] - text_box[0]) // 2,
|
1486
1508
|
)
|
1487
1509
|
if x != 0 and y != 0:
|
1488
|
-
text_box = draw.textbbox((x, y), text=label, font=font)
|
1510
|
+
text_box = draw.textbbox((x, y), text=text, font=font)
|
1489
1511
|
draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
|
1490
|
-
draw.text((x, y), label, fill="black", font=font)
|
1512
|
+
draw.text((x, y), text, fill="black", font=font)
|
1491
1513
|
frame_out.append(np.array(pil_image))
|
1492
1514
|
return frame_out[0] if len(frame_out) == 1 else frame_out
|
1493
1515
|
|
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
|
|
1663
1685
|
florence2_ocr,
|
1664
1686
|
florence2_sam2_image,
|
1665
1687
|
florence2_sam2_video,
|
1666
|
-
|
1688
|
+
florence2_phrase_grounding,
|
1667
1689
|
ixc25_image_vqa,
|
1668
1690
|
ixc25_video_vqa,
|
1669
1691
|
detr_segmentation,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.117 → vision_agent-0.2.118}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|