vision-agent 0.2.95__py3-none-any.whl → 0.2.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tools.py +29 -14
- {vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/METADATA +2 -2
- {vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/RECORD +5 -5
- {vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import cv2
 import numpy as np
 import requests
+from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
 from pytube import YouTube # type: ignore
@@ -106,6 +107,7 @@ def grounding_dino(
             "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
         ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_dino",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
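Every tool hunk in this diff makes the same one-line change: a "function_name" key naming the calling tool is appended to the payload passed to send_inference_request. A minimal sketch of the resulting request shape, modeled on the grounding_dino hunk above (the prompt, thresholds, and image value are illustrative placeholders; send_inference_request and the base64 encoding live elsewhere in tools.py):

from typing import Any, Dict

image_b64 = "<base64-encoded image>"  # placeholder; tools.py encodes the real image earlier

request_data: Dict[str, Any] = {
    "prompt": "person . car",  # illustrative prompt
    "image": image_b64,
    "tool": "visual_grounding",
    "kwargs": {"box_threshold": 0.20, "iou_threshold": 0.20},
    "function_name": "grounding_dino",  # the key added to every tool payload in this release
}
# data: Dict[str, Any] = send_inference_request(request_data, "tools")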
@@ -161,6 +163,7 @@ def owl_v2(
         "image": image_b64,
         "tool": "open_vocab_detection",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -225,6 +228,7 @@ def grounding_sam(
         "image": image_b64,
         "tool": "visual_grounding_segment",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_sam",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -364,6 +368,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "zero_shot_counting",
+        "function_name": "loca_zero_shot_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -399,6 +404,7 @@ def loca_visual_prompt_counting(
         "image": image_b64,
         "prompt": bbox_str,
         "tool": "few_shot_counting",
+        "function_name": "loca_visual_prompt_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -428,6 +434,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering_with_context",
+        "function_name": "florencev2_roberta_vqa",
     }

     answer = send_inference_request(data, "tools")
@@ -457,6 +464,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering",
+        "function_name": "git_vqa_v2",
     }

     answer = send_inference_request(data, "tools")
@@ -487,6 +495,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         "prompt": ",".join(classes),
         "image": image_b64,
         "tool": "closed_set_image_classification",
+        "function_name": "clip",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -514,6 +523,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -541,6 +551,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "nsfw_image_classification",
+        "function_name": "vit_nsfw_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = round(resp_data["scores"], 4)
@@ -567,6 +578,7 @@ def blip_image_caption(image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "tool": "image_captioning",
+        "function_name": "blip_image_caption",
     }

     answer = send_inference_request(data, "tools")
@@ -595,6 +607,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
         "image": image_b64,
         "tool": "florence2_image_captioning",
         "detail_caption": detail_caption,
+        "function_name": "florencev2_image_caption",
     }

     answer = send_inference_request(data, "tools")
@@ -630,6 +643,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "object_detection",
+        "function_name": "florencev2_object_detection",
     }

     answer = send_inference_request(data, "tools")
@@ -686,6 +700,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
+        "function_name": "detr_segmentation",
     }

     answer = send_inference_request(data, "tools")
@@ -728,6 +743,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
+        "function_name": "depth_anything_v2",
     }

     answer = send_inference_request(data, "tools")
@@ -759,6 +775,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
+        "function_name": "generate_soft_edge_image",
     }

     answer = send_inference_request(data, "tools")
@@ -791,6 +808,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
+        "function_name": "dpt_hybrid_midas",
     }

     answer = send_inference_request(data, "tools")
@@ -822,6 +840,7 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_pose",
+        "function_name": "generate_pose_image",
     }

     answer = send_inference_request(data, "tools")
@@ -862,6 +881,7 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
+        "function_name": "template_match",
     }

     answer = send_inference_request(data, "tools")
@@ -1044,20 +1064,15 @@ def save_video(
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4

-
-output_video_path
-
-
-
-
-
-
-video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-video.release()
-
-_save_video_to_result(output_video_path)
-return output_video_path
+    with ImageSequenceClip(frames, fps=fps) as video:
+        if output_video_path:
+            f = open(output_video_path, "wb")
+        else:
+            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) # type: ignore
+        video.write_videofile(f.name, codec="libx264")
+        f.close()
+        _save_video_to_result(f.name)
+        return f.name


 def _save_video_to_result(video_uri: str) -> None:
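The save_video rewrite above drops the cv2.VideoWriter loop in favor of moviepy's ImageSequenceClip, which is why the new import appears at the top of the file. A standalone sketch of the same write path, with dummy frames and a temporary output file standing in for the function's real arguments (the real function additionally validates fps and publishes the result through _save_video_to_result):

import tempfile

import numpy as np
from moviepy.editor import ImageSequenceClip

# Dummy RGB frames; save_video receives a List[np.ndarray] like this.
frames = [np.full((240, 320, 3), fill_value=i * 20, dtype=np.uint8) for i in range(12)]
fps = 4

# Encode the frames to .mp4 the way the new save_video does: build an
# ImageSequenceClip and let moviepy/ffmpeg write it out with libx264.
with ImageSequenceClip(frames, fps=fps) as video:
    f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    video.write_videofile(f.name, codec="libx264")
    f.close()

print(f.name)  # path to the written video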
{vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.95
+Version: 0.2.97
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (==0.0.
+Requires-Dist: e2b-code-interpreter (==0.0.11a27)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
{vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/RECORD
CHANGED
@@ -15,7 +15,7 @@ vision_agent/tools/__init__.py,sha256=UNiaJAOt1C709gaJ-a9h9BzKnY5JmoEUpgKftsOnyP
 vision_agent/tools/meta_tools.py,sha256=rmxgVzj-vJKeewHbue3qHru4sYsFLxlSZV-YH-eyH5w,13366
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=fHD4qhn7cGG1O77J_BHfaRfW6LMQuj1OIu9xqYu6AG8,43220
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=s43aUtuq7ZNjil2mxrddiz8EvvqlJwttkYlIiZouXqM,25125
@@ -23,7 +23,7 @@ vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
 vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.97.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.97.dist-info/METADATA,sha256=00md0PT29fBJuyXl2LeWcrC3l5T6FXn85YE6Kmat60Q,10728
+vision_agent-0.2.97.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.97.dist-info/RECORD,,
{vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/LICENSE
File without changes
{vision_agent-0.2.95.dist-info → vision_agent-0.2.97.dist-info}/WHEEL
File without changes