vision-agent 0.2.177__py3-none-any.whl → 0.2.179__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,6 +65,8 @@ from .tools import (
65
65
  template_match,
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
+ qwen2_vl_images_vqa,
69
+ video_temporal_localization,
68
70
  )
69
71
 
70
72
  __new_tools__ = [
@@ -852,6 +852,39 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
852
852
  return cast(str, data["answer"])
853
853
 
854
854
 
855
def qwen2_vl_images_vqa(prompt: str, images: List[np.ndarray]) -> str:
    """'qwen2_vl_images_vqa' is a tool that can answer any questions about arbitrary images
    including regular images or images of documents or presentations. It returns text
    as an answer to the question.

    Parameters:
        prompt (str): The question about the document image
        images (List[np.ndarray]): The reference images used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Raises:
        ValueError: If no images are provided, or if any image has a zero-sized
            dimension.

    Example
    -------
        >>> qwen2_vl_images_vqa('Give a summary of the document', images)
        'The document talks about the history of the United States of America and its...'
    """
    # Fail fast on inputs the inference service cannot process: an empty list
    # previously slipped past the per-image check and posted a request with no
    # files attached.
    if not images:
        raise ValueError("images is empty; at least one image is required")
    for image in images:
        if image.shape[0] < 1 or image.shape[1] < 1:
            raise ValueError(f"Image is empty, image shape: {image.shape}")

    files = [("images", numpy_to_bytes(image)) for image in images]
    payload = {
        "prompt": prompt,
        "model": "qwen2vl",
        "function_name": "qwen2_vl_images_vqa",
    }
    # NOTE(review): the "image-to-text" endpoint appears to return the answer
    # string directly (the whole response is cast to str below), unlike the
    # dict-shaped responses of sibling VQA tools — confirm against the service.
    data = send_inference_request(
        payload, "image-to-text", files=files, v2=True
    )
    return cast(str, data)
886
+
887
+
855
888
  def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
856
889
  """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
857
890
  including regular videos or videos of documents or presentations. It returns text
@@ -975,6 +1008,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
975
1008
  return answer["text"][0] # type: ignore
976
1009
 
977
1010
 
1011
def video_temporal_localization(
    prompt: str,
    frames: List[np.ndarray],
    model: str = "qwen2vl",
    chunk_length: Optional[float] = None,
    chunk_length_seconds: Optional[float] = None,
    chunk_length_frames: Optional[int] = 2,
) -> List[float]:
    """'video_temporal_localization' is a tool that can find objects in a video given a
    question about it. It returns a list of floats, one per analyzed chunk of the
    video, with a value of 1.0 when the object asked about is present in that chunk.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question
        model (str): The model to use for the inference. Valid values are
            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length (Optional[float]): length of each chunk in seconds
        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
        chunk_length_frames (Optional[int]): length of each chunk in frames

    Returns:
        List[float]: A list of floats with a value of 1.0 if the object to be found
            is present in the corresponding chunk of the video

    Example
    -------
        >>> video_temporal_localization('Did a goal happen?', frames)
        [0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
    """
    payload: Dict[str, Any] = {
        "prompt": prompt,
        "model": model,
        "function_name": "video_temporal_localization",
    }
    # Only forward the chunking options the caller actually set; unset ones are
    # omitted so the service applies its own defaults.
    chunk_options = {
        "chunk_length": chunk_length,
        "chunk_length_seconds": chunk_length_seconds,
        "chunk_length_frames": chunk_length_frames,
    }
    payload.update({key: val for key, val in chunk_options.items() if val is not None})

    # The frames are packed into a single video buffer and uploaded alongside
    # the JSON payload.
    files = [("video", frames_to_bytes(frames))]
    response = send_inference_request(
        payload, "video-temporal-localization", files=files, v2=True
    )
    return [cast(float, score) for score in response]
1057
+
1058
+
978
1059
  def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
979
1060
  """'clip' is a tool that can classify an image or a cropped detection given a list
980
1061
  of input classes or tags. It returns the same list of the input classes along with
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.177
3
+ Version: 0.2.179
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=q9lKBq1reyQzwsx2b5tS_zGDvIHdxz1Q6VEtguZagfo,2689
19
+ vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=iKsBZxJ5--xWK-mqgZ1jbX_bfGS5HmAp-VRZ69m9yPg,77921
23
+ vision_agent/tools/tools.py,sha256=KwqEHlmTUkR2YglQMYPQQstPkDopfvyLoI0-RivzH3c,81063
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.177.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.177.dist-info/METADATA,sha256=Xz1KUOjaiAEFCT0SVKuWERZZMpsB_F14GLMdL9NDkB0,18330
34
- vision_agent-0.2.177.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.177.dist-info/RECORD,,
32
+ vision_agent-0.2.179.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.179.dist-info/METADATA,sha256=Gy3GmMvxmQnNWG1LMvAlSHOENQRzxgYu6S8yH0feCUk,18330
34
+ vision_agent-0.2.179.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.179.dist-info/RECORD,,