vision-agent 0.2.95.tar.gz → 0.2.97.tar.gz

Files changed (29):
  1. {vision_agent-0.2.95 → vision_agent-0.2.97}/PKG-INFO +2 -2
  2. {vision_agent-0.2.95 → vision_agent-0.2.97}/pyproject.toml +2 -2
  3. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/tools.py +29 -14
  4. {vision_agent-0.2.95 → vision_agent-0.2.97}/LICENSE +0 -0
  5. {vision_agent-0.2.95 → vision_agent-0.2.97}/README.md +0 -0
  6. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/__init__.py +0 -0
  7. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/__init__.py +0 -0
  8. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/agent.py +0 -0
  9. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/agent_utils.py +0 -0
  10. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent.py +0 -0
  11. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_coder.py +0 -0
  12. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  13. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/agent/vision_agent_prompts.py +0 -0
  14. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/fonts/__init__.py +0 -0
  15. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  16. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/__init__.py +0 -0
  17. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/lmm.py +0 -0
  18. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/lmm/types.py +0 -0
  19. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/__init__.py +0 -0
  20. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/meta_tools.py +0 -0
  21. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/prompts.py +0 -0
  22. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/tools/tool_utils.py +0 -0
  23. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/__init__.py +0 -0
  24. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/exceptions.py +0 -0
  25. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/execute.py +0 -0
  26. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/image_utils.py +0 -0
  27. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/sim.py +0 -0
  28. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/type_defs.py +0 -0
  29. {vision_agent-0.2.95 → vision_agent-0.2.97}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.95
+Version: 0.2.97
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
 Requires-Dist: e2b (>=0.17.1,<0.18.0)
-Requires-Dist: e2b-code-interpreter (==0.0.11a17)
+Requires-Dist: e2b-code-interpreter (==0.0.11a27)
 Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
 Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.95"
+version = "0.2.97"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -35,7 +35,7 @@ rich = "^13.7.1"
 langsmith = "^0.1.58"
 ipykernel = "^6.29.4"
 e2b = "^0.17.1"
-e2b-code-interpreter = "0.0.11a17"
+e2b-code-interpreter = "0.0.11a27"
 tenacity = "^8.3.0"
 pillow-heif = "^0.16.0"
 pytube = "15.0.0"
vision_agent/tools/tools.py
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
 import cv2
 import numpy as np
 import requests
+from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
@@ -106,6 +107,7 @@ def grounding_dino(
             "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
         ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_dino",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
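
Note: every tool wrapper in tools.py receives the same one-line change shown above; the payload sent to the "tools" endpoint now carries a "function_name" key naming the public tool that made the call. A minimal self-contained sketch of the resulting payload shape (the helper make_grounding_dino_payload and all literal values are illustrative, not part of the package):

import base64

def make_grounding_dino_payload(prompt: str, image_bytes: bytes) -> dict:
    # Payload shape after this change; "function_name" is the new key.
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    return {
        "prompt": prompt,
        "image": image_b64,
        "tool": "visual_grounding",
        "kwargs": {"box_threshold": 0.20, "iou_threshold": 0.20},
        "function_name": "grounding_dino",  # new in 0.2.97
    }

payload = make_grounding_dino_payload("dog", b"\x89PNG...")  # bytes illustrative
assert payload["function_name"] == "grounding_dino"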
@@ -161,6 +163,7 @@ def owl_v2(
         "image": image_b64,
         "tool": "open_vocab_detection",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -225,6 +228,7 @@ def grounding_sam(
         "image": image_b64,
         "tool": "visual_grounding_segment",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_sam",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -364,6 +368,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "zero_shot_counting",
+        "function_name": "loca_zero_shot_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -399,6 +404,7 @@ def loca_visual_prompt_counting(
         "image": image_b64,
         "prompt": bbox_str,
         "tool": "few_shot_counting",
+        "function_name": "loca_visual_prompt_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -428,6 +434,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering_with_context",
+        "function_name": "florencev2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "tools")
@@ -457,6 +464,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering",
+        "function_name": "git_vqa_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -487,6 +495,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         "prompt": ",".join(classes),
         "image": image_b64,
         "tool": "closed_set_image_classification",
+        "function_name": "clip",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -514,6 +523,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -541,6 +551,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "nsfw_image_classification",
+        "function_name": "vit_nsfw_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = round(resp_data["scores"], 4)
@@ -567,6 +578,7 @@ def blip_image_caption(image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "tool": "image_captioning",
+        "function_name": "blip_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -595,6 +607,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
         "image": image_b64,
         "tool": "florence2_image_captioning",
         "detail_caption": detail_caption,
+        "function_name": "florencev2_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -630,6 +643,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "object_detection",
+        "function_name": "florencev2_object_detection",
     }
 
     answer = send_inference_request(data, "tools")
@@ -686,6 +700,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
+        "function_name": "detr_segmentation",
     }
 
     answer = send_inference_request(data, "tools")
@@ -728,6 +743,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
+        "function_name": "depth_anything_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -759,6 +775,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
+        "function_name": "generate_soft_edge_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -791,6 +808,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
+        "function_name": "dpt_hybrid_midas",
     }
 
     answer = send_inference_request(data, "tools")
@@ -822,6 +840,7 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_pose",
+        "function_name": "generate_pose_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -862,6 +881,7 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
+        "function_name": "template_match",
     }
 
     answer = send_inference_request(data, "tools")
@@ -1044,20 +1064,15 @@ def save_video(
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
 
-    if not output_video_path:
-        output_video_path = tempfile.NamedTemporaryFile(
-            suffix=".mp4", delete=False
-        ).name
-
-    height, width, layers = frames[0].shape if frames else (0, 0, 0)
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
-    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-    for frame in frames:
-        video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-    video.release()
-
-    _save_video_to_result(output_video_path)
-    return output_video_path
+    with ImageSequenceClip(frames, fps=fps) as video:
+        if output_video_path:
+            f = open(output_video_path, "wb")
+        else:
+            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)  # type: ignore
+        video.write_videofile(f.name, codec="libx264")
+        f.close()
+        _save_video_to_result(f.name)
+        return f.name
 
 
 def _save_video_to_result(video_uri: str) -> None:
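
Note: the save_video rewrite above swaps the cv2.VideoWriter pipeline for moviepy's ImageSequenceClip (imported at the top of the file), which accepts RGB frames directly, so the per-frame COLOR_RGB2BGR conversion goes away, and encodes with libx264. A standalone sketch of the same write path, assuming moviepy and an ffmpeg binary are installed; the dummy frames are illustrative:

import tempfile
import numpy as np
from moviepy.editor import ImageSequenceClip

# 24 dummy 64x64 RGB frames (uint8), stand-ins for real video frames.
frames = [np.full((64, 64, 3), i * 10, dtype=np.uint8) for i in range(24)]

with ImageSequenceClip(frames, fps=4) as clip:  # fps is required for raw frames
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    clip.write_videofile(out.name, codec="libx264")  # RGB frames in, H.264 MP4 out
    out.close()
print(out.name)  # path to the written MP4

One practical difference: write_videofile shells out to ffmpeg at runtime, whereas the old cv2.VideoWriter path relied on OpenCV's bundled codecs.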