vision-agent 0.2.116__py3-none-any.whl → 0.2.118__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder.py +45 -22
- vision_agent/lmm/lmm.py +11 -4
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tools.py +35 -13
- {vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/METADATA +2 -2
- {vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/RECORD +8 -8
- {vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.116.dist-info → vision_agent-0.2.118.dist-info}/WHEEL +0 -0
@@ -718,7 +718,12 @@ class VisionAgentCoder(Agent):
|
|
718
718
|
for chat_i in chat:
|
719
719
|
if "media" in chat_i:
|
720
720
|
for media in chat_i["media"]:
|
721
|
-
media =
|
721
|
+
media = (
|
722
|
+
media
|
723
|
+
if type(media) is str
|
724
|
+
and media.startswith(("http", "https"))
|
725
|
+
else code_interpreter.upload_file(media)
|
726
|
+
)
|
722
727
|
chat_i["content"] += f" Media name {media}" # type: ignore
|
723
728
|
media_list.append(media)
|
724
729
|
|
@@ -744,29 +749,14 @@ class VisionAgentCoder(Agent):
|
|
744
749
|
results = {"code": "", "test": "", "plan": []}
|
745
750
|
plan = []
|
746
751
|
success = False
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
"log_content": "Creating plans",
|
751
|
-
"status": "started",
|
752
|
-
}
|
753
|
-
)
|
754
|
-
plans = write_plans(
|
755
|
-
int_chat,
|
756
|
-
T.get_tool_descriptions_by_names(
|
757
|
-
customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
758
|
-
),
|
759
|
-
format_memory(working_memory),
|
760
|
-
self.planner,
|
752
|
+
|
753
|
+
plans = self._create_plans(
|
754
|
+
int_chat, customized_tool_names, working_memory, self.planner
|
761
755
|
)
|
762
756
|
|
763
|
-
if
|
764
|
-
|
765
|
-
|
766
|
-
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
767
|
-
_LOGGER.info(
|
768
|
-
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
769
|
-
)
|
757
|
+
if test_multi_plan:
|
758
|
+
self._log_plans(plans, self.verbosity)
|
759
|
+
|
770
760
|
tool_infos = retrieve_tools(
|
771
761
|
plans,
|
772
762
|
self.tool_recommender,
|
@@ -860,6 +850,39 @@ class VisionAgentCoder(Agent):
|
|
860
850
|
if self.report_progress_callback is not None:
|
861
851
|
self.report_progress_callback(data)
|
862
852
|
|
853
|
+
def _create_plans(
|
854
|
+
self,
|
855
|
+
int_chat: List[Message],
|
856
|
+
customized_tool_names: Optional[List[str]],
|
857
|
+
working_memory: List[Dict[str, str]],
|
858
|
+
planner: LMM,
|
859
|
+
) -> Dict[str, Any]:
|
860
|
+
self.log_progress(
|
861
|
+
{
|
862
|
+
"type": "log",
|
863
|
+
"log_content": "Creating plans",
|
864
|
+
"status": "started",
|
865
|
+
}
|
866
|
+
)
|
867
|
+
plans = write_plans(
|
868
|
+
int_chat,
|
869
|
+
T.get_tool_descriptions_by_names(
|
870
|
+
customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore
|
871
|
+
),
|
872
|
+
format_memory(working_memory),
|
873
|
+
planner,
|
874
|
+
)
|
875
|
+
return plans
|
876
|
+
|
877
|
+
def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None:
|
878
|
+
if verbosity >= 1:
|
879
|
+
for p in plans:
|
880
|
+
# tabulate will fail if the keys are not the same for all elements
|
881
|
+
p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
|
882
|
+
_LOGGER.info(
|
883
|
+
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
884
|
+
)
|
885
|
+
|
863
886
|
|
864
887
|
class OllamaVisionAgentCoder(VisionAgentCoder):
|
865
888
|
"""VisionAgentCoder that uses Ollama models for planning, coding, testing.
|
vision_agent/lmm/lmm.py
CHANGED
@@ -30,6 +30,12 @@ def encode_image_bytes(image: bytes) -> str:
|
|
30
30
|
|
31
31
|
|
32
32
|
def encode_media(media: Union[str, Path]) -> str:
|
33
|
+
if type(media) is str and media.startswith(("http", "https")):
|
34
|
+
# for mp4 video url, we assume there is a same url but ends with png
|
35
|
+
# vision-agent-ui will upload this png when uploading the video
|
36
|
+
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
|
37
|
+
return media[:-4] + ".png"
|
38
|
+
return media
|
33
39
|
extension = "png"
|
34
40
|
extension = Path(media).suffix
|
35
41
|
if extension.lower() not in {
|
@@ -138,7 +144,11 @@ class OpenAILMM(LMM):
|
|
138
144
|
{
|
139
145
|
"type": "image_url",
|
140
146
|
"image_url": {
|
141
|
-
"url":
|
147
|
+
"url": (
|
148
|
+
encoded_media
|
149
|
+
if encoded_media.startswith(("http", "https"))
|
150
|
+
else f"data:image/png;base64,{encoded_media}"
|
151
|
+
),
|
142
152
|
"detail": "low",
|
143
153
|
},
|
144
154
|
},
|
@@ -390,7 +400,6 @@ class OllamaLMM(LMM):
|
|
390
400
|
tmp_kwargs = self.kwargs | kwargs
|
391
401
|
data.update(tmp_kwargs)
|
392
402
|
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
393
|
-
|
394
403
|
json_data = json.dumps(data)
|
395
404
|
|
396
405
|
def f() -> Iterator[Optional[str]]:
|
@@ -424,7 +433,6 @@ class OllamaLMM(LMM):
|
|
424
433
|
media: Optional[List[Union[str, Path]]] = None,
|
425
434
|
**kwargs: Any,
|
426
435
|
) -> Union[str, Iterator[Optional[str]]]:
|
427
|
-
|
428
436
|
url = f"{self.url}/generate"
|
429
437
|
data: Dict[str, Any] = {
|
430
438
|
"model": self.model_name,
|
@@ -439,7 +447,6 @@ class OllamaLMM(LMM):
|
|
439
447
|
tmp_kwargs = self.kwargs | kwargs
|
440
448
|
data.update(tmp_kwargs)
|
441
449
|
if "stream" in tmp_kwargs and tmp_kwargs["stream"]:
|
442
|
-
|
443
450
|
json_data = json.dumps(data)
|
444
451
|
|
445
452
|
def f() -> Iterator[Optional[str]]:
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import os
|
1
2
|
import io
|
2
3
|
import json
|
3
4
|
import logging
|
@@ -14,6 +15,7 @@ from moviepy.editor import ImageSequenceClip
|
|
14
15
|
from PIL import Image, ImageDraw, ImageFont
|
15
16
|
from pillow_heif import register_heif_opener # type: ignore
|
16
17
|
from pytube import YouTube # type: ignore
|
18
|
+
import urllib.request
|
17
19
|
|
18
20
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
19
21
|
from vision_agent.tools.tool_utils import (
|
@@ -760,10 +762,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
|
|
760
762
|
return answer[task] # type: ignore
|
761
763
|
|
762
764
|
|
763
|
-
def
|
764
|
-
"""'
|
765
|
-
objects given a text prompt
|
766
|
-
can optionally separate the
|
765
|
+
def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
|
766
|
+
"""'florence2_phrase_grounding' is a tool that can detect multiple
|
767
|
+
objects given a text prompt which can be object names or caption. You
|
768
|
+
can optionally separate the object names in the text with commas. It returns a list
|
767
769
|
of bounding boxes with normalized coordinates, label names and associated
|
768
770
|
probability scores of 1.0.
|
769
771
|
|
@@ -780,7 +782,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
|
|
780
782
|
|
781
783
|
Example
|
782
784
|
-------
|
783
|
-
>>>
|
785
|
+
>>> florence2_phrase_grounding('person looking at a coyote', image)
|
784
786
|
[
|
785
787
|
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
|
786
788
|
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
|
@@ -792,7 +794,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
|
|
792
794
|
"image": image_b64,
|
793
795
|
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
794
796
|
"prompt": prompt,
|
795
|
-
"function_name": "
|
797
|
+
"function_name": "florence2_phrase_grounding",
|
796
798
|
}
|
797
799
|
|
798
800
|
detections = send_inference_request(data, "florence2", v2=True)
|
@@ -1220,6 +1222,13 @@ def extract_frames(
|
|
1220
1222
|
video_file_path = video.download(output_path=temp_dir)
|
1221
1223
|
|
1222
1224
|
return extract_frames_from_video(video_file_path, fps)
|
1225
|
+
elif str(video_uri).startswith(("http", "https")):
|
1226
|
+
_, image_suffix = os.path.splitext(video_uri)
|
1227
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
|
1228
|
+
# Download the video and save it to the temporary file
|
1229
|
+
with urllib.request.urlopen(str(video_uri)) as response:
|
1230
|
+
tmp_file.write(response.read())
|
1231
|
+
return extract_frames_from_video(tmp_file.name, fps)
|
1223
1232
|
|
1224
1233
|
return extract_frames_from_video(str(video_uri), fps)
|
1225
1234
|
|
@@ -1250,10 +1259,10 @@ def save_json(data: Any, file_path: str) -> None:
|
|
1250
1259
|
|
1251
1260
|
|
1252
1261
|
def load_image(image_path: str) -> np.ndarray:
|
1253
|
-
"""'load_image' is a utility function that loads an image from the given file path string.
|
1262
|
+
"""'load_image' is a utility function that loads an image from the given file path string or an URL.
|
1254
1263
|
|
1255
1264
|
Parameters:
|
1256
|
-
image_path (str): The path to the image.
|
1265
|
+
image_path (str): The path or URL to the image.
|
1257
1266
|
|
1258
1267
|
Returns:
|
1259
1268
|
np.ndarray: The image as a NumPy array.
|
@@ -1265,6 +1274,13 @@ def load_image(image_path: str) -> np.ndarray:
|
|
1265
1274
|
# NOTE: sometimes the generated code pass in a NumPy array
|
1266
1275
|
if isinstance(image_path, np.ndarray):
|
1267
1276
|
return image_path
|
1277
|
+
if image_path.startswith(("http", "https")):
|
1278
|
+
_, image_suffix = os.path.splitext(image_path)
|
1279
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
|
1280
|
+
# Download the image and save it to the temporary file
|
1281
|
+
with urllib.request.urlopen(image_path) as response:
|
1282
|
+
tmp_file.write(response.read())
|
1283
|
+
image_path = tmp_file.name
|
1268
1284
|
image = Image.open(image_path).convert("RGB")
|
1269
1285
|
return np.array(image)
|
1270
1286
|
|
@@ -1418,6 +1434,7 @@ def overlay_segmentation_masks(
|
|
1418
1434
|
medias: Union[np.ndarray, List[np.ndarray]],
|
1419
1435
|
masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
|
1420
1436
|
draw_label: bool = True,
|
1437
|
+
secondary_label_key: str = "tracking_label",
|
1421
1438
|
) -> Union[np.ndarray, List[np.ndarray]]:
|
1422
1439
|
"""'overlay_segmentation_masks' is a utility function that displays segmentation
|
1423
1440
|
masks.
|
@@ -1426,7 +1443,10 @@ def overlay_segmentation_masks(
|
|
1426
1443
|
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
|
1427
1444
|
the masks on.
|
1428
1445
|
masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
|
1429
|
-
dictionaries containing the masks.
|
1446
|
+
dictionaries containing the masks, labels and scores.
|
1447
|
+
draw_label (bool, optional): If True, the labels will be displayed on the image.
|
1448
|
+
secondary_label_key (str, optional): The key to use for the secondary
|
1449
|
+
tracking label which is needed in videos to display tracking information.
|
1430
1450
|
|
1431
1451
|
Returns:
|
1432
1452
|
np.ndarray: The image with the masks displayed.
|
@@ -1471,6 +1491,7 @@ def overlay_segmentation_masks(
|
|
1471
1491
|
for elt in masks_int[i]:
|
1472
1492
|
mask = elt["mask"]
|
1473
1493
|
label = elt["label"]
|
1494
|
+
tracking_lbl = elt.get(secondary_label_key, None)
|
1474
1495
|
np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
|
1475
1496
|
np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
|
1476
1497
|
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
@@ -1478,16 +1499,17 @@ def overlay_segmentation_masks(
|
|
1478
1499
|
|
1479
1500
|
if draw_label:
|
1480
1501
|
draw = ImageDraw.Draw(pil_image)
|
1481
|
-
|
1502
|
+
text = tracking_lbl if tracking_lbl else label
|
1503
|
+
text_box = draw.textbbox((0, 0), text=text, font=font)
|
1482
1504
|
x, y = _get_text_coords_from_mask(
|
1483
1505
|
mask,
|
1484
1506
|
v_gap=(text_box[3] - text_box[1]) + 10,
|
1485
1507
|
h_gap=(text_box[2] - text_box[0]) // 2,
|
1486
1508
|
)
|
1487
1509
|
if x != 0 and y != 0:
|
1488
|
-
text_box = draw.textbbox((x, y), text=
|
1510
|
+
text_box = draw.textbbox((x, y), text=text, font=font)
|
1489
1511
|
draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
|
1490
|
-
draw.text((x, y),
|
1512
|
+
draw.text((x, y), text, fill="black", font=font)
|
1491
1513
|
frame_out.append(np.array(pil_image))
|
1492
1514
|
return frame_out[0] if len(frame_out) == 1 else frame_out
|
1493
1515
|
|
@@ -1663,7 +1685,7 @@ FUNCTION_TOOLS = [
|
|
1663
1685
|
florence2_ocr,
|
1664
1686
|
florence2_sam2_image,
|
1665
1687
|
florence2_sam2_video,
|
1666
|
-
|
1688
|
+
florence2_phrase_grounding,
|
1667
1689
|
ixc25_image_vqa,
|
1668
1690
|
ixc25_video_vqa,
|
1669
1691
|
detr_segmentation,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.118
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -252,7 +252,7 @@ function. Make sure the documentation is in the same format above with descripti
|
|
252
252
|
`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
|
253
253
|
[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
|
254
254
|
|
255
|
-
## Additional
|
255
|
+
## Additional Backends
|
256
256
|
### Ollama
|
257
257
|
We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
|
258
258
|
a few models:
|
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepov
|
|
3
3
|
vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
|
5
5
|
vision_agent/agent/vision_agent.py,sha256=5rgO-pScVOS3t4sWnLBnGYYkGftGgF4U0FpZzFVrDAY,8447
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=tE-15ttnDxUsEdB0XJP4AVNyOU89KS8ZvXZDPcNKA-8,34380
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=xIya1txRZM8qoQHAWTEkEFCL8L3iZD7QD09t3ZtdxSE,11305
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=ydUU_Wvw-jqdL_vObSUr-VCQvjSwA5Fd74TbbhUzyxk,6112
|
9
9
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -12,13 +12,13 @@ vision_agent/clients/landing_public_api.py,sha256=6L15zh5lP5JHCpGnYpHMREgrrKiJin
|
|
12
12
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
14
14
|
vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
|
15
|
-
vision_agent/lmm/lmm.py,sha256=
|
15
|
+
vision_agent/lmm/lmm.py,sha256=xkAxunToISzo5rCcjekqQBvm5SRW-98htieLuztKNbk,20802
|
16
16
|
vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
|
17
|
-
vision_agent/tools/__init__.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=lUUc2HV13eSxg5KPZop1D-mB4ecmiQ5fYlBTQLNSbYg,2190
|
18
18
|
vision_agent/tools/meta_tools.py,sha256=q6h7hZarZrsWRloVE6PbTZwW8J2N1uUM9Ac-XxsT6hk,13365
|
19
19
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
20
|
vision_agent/tools/tool_utils.py,sha256=qMsb9d8QtpXGgF9rpPO2dA390BewKdYO68oWKDu-TGg,6504
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
21
|
+
vision_agent/tools/tools.py,sha256=gAW6G9k1vzy8jwRACNnw2Vihsajm_oSlVJqd6E4JSRA,59957
|
22
22
|
vision_agent/tools/tools_types.py,sha256=z6_XtUhWgh201yM7Z0CYtiLBEGdHPc_QUydMDHZ84EA,2216
|
23
23
|
vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
|
24
24
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=c1LrmaHD331za8DbA1myJpgUmWoDzePaOK6-dsd
|
|
27
27
|
vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.118.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.118.dist-info/METADATA,sha256=4ilO7j9MOLCtaNekUUVlhMNdDKMk02ecx7ipnXT9RC8,11997
|
32
|
+
vision_agent-0.2.118.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.118.dist-info/RECORD,,
|
File without changes
|
File without changes
|