vision-agent 0.2.177__py3-none-any.whl → 0.2.179__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -65,6 +65,8 @@ from .tools import (
65
65
  template_match,
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
+ qwen2_vl_images_vqa,
69
+ video_temporal_localization,
68
70
  )
69
71
 
70
72
  __new_tools__ = [
@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
852
852
  return cast(str, data["answer"])
853
853
 
854
854
 
855
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
    including regular images or images of documents or presentations. It returns text
    as an answer to the question.

    Parameters:
        prompt (str): The question about the images
        images (List[np.ndarray]): The reference images used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Raises:
        ValueError: If any image in `images` has a zero-sized height or width.

    Example
    -------
        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
        'The document talks about the history of the United States of America and its...'
    """
    # Reject degenerate inputs up front so the backend never receives an empty image.
    for image in images:
        if image.shape[0] < 1 or image.shape[1] < 1:
            raise ValueError(f"Image is empty, image shape: {image.shape}")

    files = [("images", numpy_to_bytes(image)) for image in images]
    payload = {
        "prompt": prompt,
        "model": "qwen2vl",
        "function_name": "qwen2_vl_images_vqa",
    }
    # NOTE(review): the previous `Dict[str, Any]` annotation on `data` contradicted
    # the `cast(str, ...)` below and the declared `-> str` return type — the
    # endpoint's response is treated as the answer text itself, so leave the
    # name unannotated and let the cast document the expected type.
    data = send_inference_request(
        payload, "image-to-text", files=files, v2=True
    )
    return cast(str, data)
855
888
  def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
856
889
  """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
857
890
  including regular videos or videos of documents or presentations. It returns text
@@ -975,6 +1008,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
975
1008
  return answer["text"][0] # type: ignore
976
1009
 
977
1010
 
1011
def video_temporal_localization(
    prompt: str,
    frames: List[np.ndarray],
    model: str = "qwen2vl",
    chunk_length: Optional[float] = None,
    chunk_length_seconds: Optional[float] = None,
    chunk_length_frames: Optional[int] = 2,
) -> List[float]:
    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question
        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length (Optional[float]): length of each chunk in seconds
        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
        chunk_length_frames (Optional[int]): length of each chunk in frames

    Returns:
        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video

    Example
    -------
        >>> video_temporal_localization('Did a goal happened?', frames)
        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
    """
    # Encode the frames once and ship them as the request's video attachment.
    video_bytes = frames_to_bytes(frames)
    request_files = [("video", video_bytes)]

    request_body: Dict[str, Any] = {
        "prompt": prompt,
        "model": model,
        "function_name": "video_temporal_localization",
    }
    # Forward only the chunking options the caller actually supplied
    # (chunk_length_frames defaults to 2, so it is normally included).
    chunk_options = {
        "chunk_length": chunk_length,
        "chunk_length_seconds": chunk_length_seconds,
        "chunk_length_frames": chunk_length_frames,
    }
    for option_name, option_value in chunk_options.items():
        if option_value is not None:
            request_body[option_name] = option_value

    response = send_inference_request(
        request_body, "video-temporal-localization", files=request_files, v2=True
    )
    # The endpoint returns one presence score per analyzed chunk.
    return [cast(float, score) for score in response]
978
1059
  def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
979
1060
  """'clip' is a tool that can classify an image or a cropped detection given a list
980
1061
  of input classes or tags. It returns the same list of the input classes along with
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.177
3
+ Version: 0.2.179
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=q9lKBq1reyQzwsx2b5tS_zGDvIHdxz1Q6VEtguZagfo,2689
19
+ vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
23
+ vision_agent/tools/tools.py,sha256=KwqEHlmTUkR2YglQMYPQQstPkDopfvyLoI0-RivzH3c,81063
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.177.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.177.dist-info/METADATA,sha256=Xz1KUOjaiAEFCT0SVKuWERZZMpsB_F14GLMdL9NDkB0,18330
34
- vision_agent-0.2.177.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.177.dist-info/RECORD,,
32
+ vision_agent-0.2.179.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.179.dist-info/METADATA,sha256=Gy3GmMvxmQnNWG1LMvAlSHOENQRzxgYu6S8yH0feCUk,18330
34
+ vision_agent-0.2.179.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.179.dist-info/RECORD,,