vision-agent 0.2.95__py3-none-any.whl → 0.2.97__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
9
9
  import cv2
10
10
  import numpy as np
11
11
  import requests
12
+ from moviepy.editor import ImageSequenceClip
12
13
  from PIL import Image, ImageDraw, ImageFont
13
14
  from pillow_heif import register_heif_opener # type: ignore
14
15
  from pytube import YouTube # type: ignore
@@ -106,6 +107,7 @@ def grounding_dino(
106
107
  "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
107
108
  ),
108
109
  "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
110
+ "function_name": "grounding_dino",
109
111
  }
110
112
  data: Dict[str, Any] = send_inference_request(request_data, "tools")
111
113
  return_data = []
@@ -161,6 +163,7 @@ def owl_v2(
161
163
  "image": image_b64,
162
164
  "tool": "open_vocab_detection",
163
165
  "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
166
+ "function_name": "owl_v2",
164
167
  }
165
168
  data: Dict[str, Any] = send_inference_request(request_data, "tools")
166
169
  return_data = []
@@ -225,6 +228,7 @@ def grounding_sam(
225
228
  "image": image_b64,
226
229
  "tool": "visual_grounding_segment",
227
230
  "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
231
+ "function_name": "grounding_sam",
228
232
  }
229
233
  data: Dict[str, Any] = send_inference_request(request_data, "tools")
230
234
  return_data = []
@@ -364,6 +368,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
364
368
  data = {
365
369
  "image": image_b64,
366
370
  "tool": "zero_shot_counting",
371
+ "function_name": "loca_zero_shot_counting",
367
372
  }
368
373
  resp_data = send_inference_request(data, "tools")
369
374
  resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -399,6 +404,7 @@ def loca_visual_prompt_counting(
399
404
  "image": image_b64,
400
405
  "prompt": bbox_str,
401
406
  "tool": "few_shot_counting",
407
+ "function_name": "loca_visual_prompt_counting",
402
408
  }
403
409
  resp_data = send_inference_request(data, "tools")
404
410
  resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -428,6 +434,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
428
434
  "image": image_b64,
429
435
  "prompt": prompt,
430
436
  "tool": "image_question_answering_with_context",
437
+ "function_name": "florencev2_roberta_vqa",
431
438
  }
432
439
 
433
440
  answer = send_inference_request(data, "tools")
@@ -457,6 +464,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
457
464
  "image": image_b64,
458
465
  "prompt": prompt,
459
466
  "tool": "image_question_answering",
467
+ "function_name": "git_vqa_v2",
460
468
  }
461
469
 
462
470
  answer = send_inference_request(data, "tools")
@@ -487,6 +495,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
487
495
  "prompt": ",".join(classes),
488
496
  "image": image_b64,
489
497
  "tool": "closed_set_image_classification",
498
+ "function_name": "clip",
490
499
  }
491
500
  resp_data = send_inference_request(data, "tools")
492
501
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -514,6 +523,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
514
523
  data = {
515
524
  "image": image_b64,
516
525
  "tool": "image_classification",
526
+ "function_name": "vit_image_classification",
517
527
  }
518
528
  resp_data = send_inference_request(data, "tools")
519
529
  resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -541,6 +551,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
541
551
  data = {
542
552
  "image": image_b64,
543
553
  "tool": "nsfw_image_classification",
554
+ "function_name": "vit_nsfw_classification",
544
555
  }
545
556
  resp_data = send_inference_request(data, "tools")
546
557
  resp_data["scores"] = round(resp_data["scores"], 4)
@@ -567,6 +578,7 @@ def blip_image_caption(image: np.ndarray) -> str:
567
578
  data = {
568
579
  "image": image_b64,
569
580
  "tool": "image_captioning",
581
+ "function_name": "blip_image_caption",
570
582
  }
571
583
 
572
584
  answer = send_inference_request(data, "tools")
@@ -595,6 +607,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
595
607
  "image": image_b64,
596
608
  "tool": "florence2_image_captioning",
597
609
  "detail_caption": detail_caption,
610
+ "function_name": "florencev2_image_caption",
598
611
  }
599
612
 
600
613
  answer = send_inference_request(data, "tools")
@@ -630,6 +643,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
630
643
  data = {
631
644
  "image": image_b64,
632
645
  "tool": "object_detection",
646
+ "function_name": "florencev2_object_detection",
633
647
  }
634
648
 
635
649
  answer = send_inference_request(data, "tools")
@@ -686,6 +700,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
686
700
  data = {
687
701
  "image": image_b64,
688
702
  "tool": "panoptic_segmentation",
703
+ "function_name": "detr_segmentation",
689
704
  }
690
705
 
691
706
  answer = send_inference_request(data, "tools")
@@ -728,6 +743,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
728
743
  data = {
729
744
  "image": image_b64,
730
745
  "tool": "generate_depth",
746
+ "function_name": "depth_anything_v2",
731
747
  }
732
748
 
733
749
  answer = send_inference_request(data, "tools")
@@ -759,6 +775,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
759
775
  data = {
760
776
  "image": image_b64,
761
777
  "tool": "generate_hed",
778
+ "function_name": "generate_soft_edge_image",
762
779
  }
763
780
 
764
781
  answer = send_inference_request(data, "tools")
@@ -791,6 +808,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
791
808
  data = {
792
809
  "image": image_b64,
793
810
  "tool": "generate_normal",
811
+ "function_name": "dpt_hybrid_midas",
794
812
  }
795
813
 
796
814
  answer = send_inference_request(data, "tools")
@@ -822,6 +840,7 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
822
840
  data = {
823
841
  "image": image_b64,
824
842
  "tool": "generate_pose",
843
+ "function_name": "generate_pose_image",
825
844
  }
826
845
 
827
846
  answer = send_inference_request(data, "tools")
@@ -862,6 +881,7 @@ def template_match(
862
881
  "image": image_b64,
863
882
  "template": template_image_b64,
864
883
  "tool": "template_match",
884
+ "function_name": "template_match",
865
885
  }
866
886
 
867
887
  answer = send_inference_request(data, "tools")
@@ -1044,20 +1064,15 @@ def save_video(
1044
1064
  _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
1045
1065
  fps = 4
1046
1066
 
1047
- if not output_video_path:
1048
- output_video_path = tempfile.NamedTemporaryFile(
1049
- suffix=".mp4", delete=False
1050
- ).name
1051
-
1052
- height, width, layers = frames[0].shape if frames else (0, 0, 0)
1053
- fourcc = cv2.VideoWriter_fourcc(*"mp4v") # type: ignore
1054
- video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
1055
- for frame in frames:
1056
- video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
1057
- video.release()
1058
-
1059
- _save_video_to_result(output_video_path)
1060
- return output_video_path
1067
+ with ImageSequenceClip(frames, fps=fps) as video:
1068
+ if output_video_path:
1069
+ f = open(output_video_path, "wb")
1070
+ else:
1071
+ f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) # type: ignore
1072
+ video.write_videofile(f.name, codec="libx264")
1073
+ f.close()
1074
+ _save_video_to_result(f.name)
1075
+ return f.name
1061
1076
 
1062
1077
 
1063
1078
  def _save_video_to_result(video_uri: str) -> None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.95
3
+ Version: 0.2.97
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
11
11
  Classifier: Programming Language :: Python :: 3.11
12
12
  Requires-Dist: anthropic (>=0.31.0,<0.32.0)
13
13
  Requires-Dist: e2b (>=0.17.1,<0.18.0)
14
- Requires-Dist: e2b-code-interpreter (==0.0.11a17)
14
+ Requires-Dist: e2b-code-interpreter (==0.0.11a27)
15
15
  Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
16
16
  Requires-Dist: langsmith (>=0.1.58,<0.2.0)
17
17
  Requires-Dist: moviepy (>=1.0.0,<2.0.0)
@@ -15,7 +15,7 @@ vision_agent/tools/__init__.py,sha256=UNiaJAOt1C709gaJ-a9h9BzKnY5JmoEUpgKftsOnyP
15
15
  vision_agent/tools/meta_tools.py,sha256=rmxgVzj-vJKeewHbue3qHru4sYsFLxlSZV-YH-eyH5w,13366
16
16
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
17
17
  vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
18
- vision_agent/tools/tools.py,sha256=CWQY1sD-xtWchPrg_AJNAGH-k7UxrKIkiog8r0sx1Do,42446
18
+ vision_agent/tools/tools.py,sha256=fHD4qhn7cGG1O77J_BHfaRfW6LMQuj1OIu9xqYu6AG8,43220
19
19
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
20
20
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
21
21
  vision_agent/utils/execute.py,sha256=s43aUtuq7ZNjil2mxrddiz8EvvqlJwttkYlIiZouXqM,25125
@@ -23,7 +23,7 @@ vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn
23
23
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
24
24
  vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
25
25
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
26
- vision_agent-0.2.95.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
- vision_agent-0.2.95.dist-info/METADATA,sha256=-OCOFe_UAKyI5sjDr6nYklJq5jwKZbLjwFkFMO-wrV8,10728
28
- vision_agent-0.2.95.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
- vision_agent-0.2.95.dist-info/RECORD,,
26
+ vision_agent-0.2.97.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
27
+ vision_agent-0.2.97.dist-info/METADATA,sha256=00md0PT29fBJuyXl2LeWcrC3l5T6FXn85YE6Kmat60Q,10728
28
+ vision_agent-0.2.97.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
29
+ vision_agent-0.2.97.dist-info/RECORD,,