vision-agent 0.2.182__py3-none-any.whl → 0.2.184__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the public registry. It is provided for informational purposes only.
vision_agent/tools/__init__.py
@@ -66,7 +66,9 @@ from .tools import (
     vit_image_classification,
     vit_nsfw_classification,
     qwen2_vl_images_vqa,
+    qwen2_vl_video_vqa,
     video_temporal_localization,
+    flux_image_inpainting,
 )
 
 __new_tools__ = [
vision_agent/tools/tools.py
@@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
+def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "qwen2vl",
+        "function_name": "qwen2_vl_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "image-to-text", files=files, v2=True
+    )
+    return cast(str, data)
+
+
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
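The new `qwen2_vl_video_vqa` tool is exported from `vision_agent.tools` (per the import hunk above). A minimal usage sketch, under stated assumptions: the OpenCV frame loading and the `video.mp4` path are illustrative and not part of the package, and the BGR-to-RGB conversion assumes the tool expects RGB frames like the other tools in this module.

```python
import cv2
import numpy as np
from vision_agent.tools import qwen2_vl_video_vqa

# Read a handful of frames from a local video (assumption: OpenCV is installed,
# and "video.mp4" is a placeholder path).
cap = cv2.VideoCapture("video.mp4")
frames = []
while len(frames) < 10:
    ok, frame = cap.read()
    if not ok:
        break
    # OpenCV decodes to BGR; convert assuming the tool expects RGB arrays.
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

answer = qwen2_vl_video_vqa("Which football player made the goal?", frames)
print(answer)
```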
@@ -1742,6 +1773,82 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
+def flux_image_inpainting(
+    prompt: str,
+    image: np.ndarray,
+    mask: np.ndarray,
+) -> np.ndarray:
+    """'flux_image_inpainting' performs image inpainting to fill the masked regions,
+    given by mask, in the image, given image based on the text prompt and surrounding image context.
+    It can be used to edit regions of an image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the masked area. More detailed and specific prompts typically yield better results.
+        image (np.ndarray): The source image to be inpainted.
+            The image will serve as the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's,
+            where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+
+    Returns:
+        np.ndarray:
+            The generated image(s) as a numpy array in RGB format
+            with values ranging from 0 to 255.
+
+    -------
+    Example:
+        >>> # Generate inpainting
+        >>> result = flux_image_inpainting(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ...     mask=mask,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    if (
+        image.shape[0] < 8
+        or image.shape[1] < 8
+        or mask.shape[0] < 8
+        or mask.shape[1] < 8
+    ):
+        raise ValueError("The image or mask does not have enough size for inpainting")
+
+    if np.array_equal(mask, mask.astype(bool).astype(int)):
+        mask = np.where(mask > 0, 255, 0).astype(np.uint8)
+    else:
+        raise ValueError("The mask should be a binary mask with 0's and 1's")
+
+    image_file = numpy_to_bytes(image)
+    mask_file = numpy_to_bytes(mask)
+
+    files = [
+        ("image", image_file),
+        ("mask_image", mask_file),
+    ]
+
+    payload = {
+        "prompt": prompt,
+        "task": "inpainting",
+        "height": image.shape[0],
+        "width": image.shape[1],
+        "strength": 0.99,
+        "guidance_scale": 18,
+        "num_inference_steps": 20,
+        "seed": None,
+    }
+
+    response = send_inference_request(
+        payload=payload,
+        endpoint_name="flux1",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "flux_image_inpainting"},
+    )
+
+    output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    return output_image
+
+
 # Utility and visualization functions
 
 
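A hedged usage sketch following the contract visible in this hunk: the mask must contain only 0s and 1s (anything else raises `ValueError`), both inputs must be at least 8×8, and the result is an RGB uint8 array. `room.png` and the rectangular mask geometry are placeholders; `save_image` is the package's own utility, as used in the docstring example above.

```python
import numpy as np
from PIL import Image
from vision_agent.tools import flux_image_inpainting, save_image

# Load the source image as an RGB numpy array ("room.png" is a placeholder).
image = np.array(Image.open("room.png").convert("RGB"))

# Build a strict 0/1 mask marking the region to regenerate (here: a central box).
mask = np.zeros(image.shape[:2], dtype=np.uint8)
h, w = mask.shape
mask[h // 4 : 3 * h // 4, w // 4 : 3 * w // 4] = 1

result = flux_image_inpainting(
    prompt="a modern black leather sofa with white pillows",
    image=image,
    mask=mask,
)
save_image(result, "inpainted_room.png")
```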
@@ -2238,13 +2345,13 @@ FUNCTION_TOOLS = [
     florence2_sam2_image,
     florence2_sam2_video_tracking,
     florence2_phrase_grounding,
-    ixc25_image_vqa,
-    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
+    qwen2_vl_images_vqa,
+    qwen2_vl_video_vqa,
 ]
 
 UTIL_TOOLS = [
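The default `FUNCTION_TOOLS` registry swaps the ixc25 VQA tools for the Qwen2-VL ones. The diff shows only the registry change, so the migration sketch below is an assumption: it infers a list-of-images signature for `qwen2_vl_images_vqa` from its plural name, and the zero array stands in for a real image.

```python
import numpy as np
from vision_agent.tools import qwen2_vl_images_vqa

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

# 0.2.182 default tooling: ixc25_image_vqa("What is shown?", image)
# 0.2.184 equivalent (list-of-images signature inferred from the name):
answer = qwen2_vl_images_vqa("What is shown?", [image])
print(answer)
```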
{vision_agent-0.2.182 → vision_agent-0.2.184}.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.182
+Version: 0.2.184
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.2.182 → vision_agent-0.2.184}.dist-info/RECORD
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
+vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=p0MBQnwA10NF48ZhTIRWzHaarkezjvDazk7VuvjH1-k,80142
+vision_agent/tools/tools.py,sha256=kHeBjiVvncQJeL_Gni84bgHOCgxko4XO7otpt8IyWU4,83610
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.182.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.182.dist-info/METADATA,sha256=eLwHRDYfkonJsLN0ug1Sc2bqZv7SAHiDzVeYeTGCmj8,18330
-vision_agent-0.2.182.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.182.dist-info/RECORD,,
+vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.184.dist-info/METADATA,sha256=n8BeCLsPCBXDsr0FCmRBtScseMyJ8TuR68MWlqeO9Is,18330
+vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.184.dist-info/RECORD,,