vision-agent 0.2.182__py3-none-any.whl → 0.2.184__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -66,7 +66,9 @@ from .tools import (
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
68
  qwen2_vl_images_vqa,
69
+ qwen2_vl_video_vqa,
69
70
  video_temporal_localization,
71
+ flux_image_inpainting,
70
72
  )
71
73
 
72
74
  __new_tools__ = [
@@ -930,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
930
930
  return cast(str, data["answer"])
931
931
 
932
932
 
933
def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
    """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary
    videos, including regular videos or videos of documents or presentations. It
    returns text as an answer to the question.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Example
    -------
        >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
        'Lionel Messi'
    """
    request_body = {
        "prompt": prompt,
        "model": "qwen2vl",
        "function_name": "qwen2_vl_video_vqa",
    }
    # All frames are serialized into one upload, sent under the "video" field.
    video_upload = [("video", frames_to_bytes(frames))]

    answer = send_inference_request(
        request_body, "image-to-text", files=video_upload, v2=True
    )
    # `cast` only informs the type checker; the response object is returned
    # exactly as received (presumably already the answer text — verify
    # against send_inference_request's v2 response handling).
    return cast(str, answer)
962
+
963
+
933
964
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
934
965
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
935
966
  including regular images or images of documents or presentations. It returns text
@@ -1742,6 +1773,82 @@ def closest_box_distance(
1742
1773
  return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
1743
1774
 
1744
1775
 
1776
def flux_image_inpainting(
    prompt: str,
    image: np.ndarray,
    mask: np.ndarray,
) -> np.ndarray:
    """'flux_image_inpainting' performs image inpainting: it fills the regions of
    the image selected by the mask, guided by the text prompt and the surrounding
    image context. It can be used to edit regions of an image according to the
    prompt given.

    Parameters:
        prompt (str): A detailed text description guiding what should be generated
            in the masked area. More detailed and specific prompts typically yield
            better results.
        image (np.ndarray): The source image to be inpainted. It serves as the
            base context for the inpainting process.
        mask (np.ndarray): A binary mask with 0's and 1's, where 1 marks areas to
            be inpainted and 0 marks areas to be preserved.

    Returns:
        np.ndarray: The generated image as a numpy array in RGB format with
            values ranging from 0 to 255.

    Raises:
        ValueError: If either of the first two dimensions of the image or the mask
            is smaller than 8, or if the mask holds values other than 0 and 1.

    Example
    -------
        >>> # Generate inpainting
        >>> result = flux_image_inpainting(
        ...     prompt="a modern black leather sofa with white pillows",
        ...     image=image,
        ...     mask=mask,
        ... )
        >>> save_image(result, "inpainted_room.png")
    """
    # Check image rows, image cols, mask rows, mask cols - in that order - and
    # stop at the first side that is too small.
    for array in (image, mask):
        for axis in (0, 1):
            if array.shape[axis] < 8:
                raise ValueError(
                    "The image or mask does not have enough size for inpainting"
                )

    # A mask is binary when a round trip through bool leaves every value intact.
    if not np.array_equal(mask, mask.astype(bool).astype(int)):
        raise ValueError("The mask should be a binary mask with 0's and 1's")

    # Stretch the 0/1 mask to 0/255 uint8 before it is encoded for upload.
    mask = np.where(mask > 0, 255, 0).astype(np.uint8)

    encoded_image = numpy_to_bytes(image)
    encoded_mask = numpy_to_bytes(mask)
    uploads = [
        ("image", encoded_image),
        ("mask_image", encoded_mask),
    ]

    # The requested output size mirrors the input image's height and width.
    rows, cols = image.shape[0], image.shape[1]
    request_body = {
        "prompt": prompt,
        "task": "inpainting",
        "height": rows,
        "width": cols,
        "strength": 0.99,
        "guidance_scale": 18,
        "num_inference_steps": 20,
        "seed": None,
    }

    response = send_inference_request(
        payload=request_body,
        endpoint_name="flux1",
        files=uploads,
        v2=True,
        metadata_payload={"function_name": "flux_image_inpainting"},
    )

    # Only the first entry of the response is decoded and returned.
    decoded = b64_to_pil(response[0])
    return np.array(decoded.convert("RGB"))
1850
+
1851
+
1745
1852
  # Utility and visualization functions
1746
1853
 
1747
1854
 
@@ -2238,13 +2345,13 @@ FUNCTION_TOOLS = [
2238
2345
  florence2_sam2_image,
2239
2346
  florence2_sam2_video_tracking,
2240
2347
  florence2_phrase_grounding,
2241
- ixc25_image_vqa,
2242
- ixc25_video_vqa,
2243
2348
  detr_segmentation,
2244
2349
  depth_anything_v2,
2245
2350
  generate_pose_image,
2246
2351
  closest_mask_distance,
2247
2352
  closest_box_distance,
2353
+ qwen2_vl_images_vqa,
2354
+ qwen2_vl_video_vqa,
2248
2355
  ]
2249
2356
 
2250
2357
  UTIL_TOOLS = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.182
3
+ Version: 0.2.184
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
16
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
17
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
18
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
19
- vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdVA,2747
19
+ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=p0MBQnwA10NF48ZhTIRWzHaarkezjvDazk7VuvjH1-k,80142
23
+ vision_agent/tools/tools.py,sha256=kHeBjiVvncQJeL_Gni84bgHOCgxko4XO7otpt8IyWU4,83610
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.182.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.182.dist-info/METADATA,sha256=eLwHRDYfkonJsLN0ug1Sc2bqZv7SAHiDzVeYeTGCmj8,18330
34
- vision_agent-0.2.182.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.182.dist-info/RECORD,,
32
+ vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.184.dist-info/METADATA,sha256=n8BeCLsPCBXDsr0FCmRBtScseMyJ8TuR68MWlqeO9Is,18330
34
+ vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.184.dist-info/RECORD,,