vision_agent-0.2.183-py3-none-any.whl → vision_agent-0.2.185-py3-none-any.whl

vision_agent/tools/__init__.py
@@ -68,6 +68,7 @@ from .tools import (
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     video_temporal_localization,
+    flux_image_inpainting,
 )
 
 __new_tools__ = [
vision_agent/tools/tools.py
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )
 
-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id
 
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_image",
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
 
+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for _, data_i in detections["0"].items():
-        mask = rle_decode_array(data_i["mask"])
-        label = data_i["label"]
-        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
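
The florence2_sam2_image refactor above routes both the base and fine-tuned paths through send_task_inference_request against the "text-to-instance-segmentation" task, sharing one payload instead of building a separate Florence2FtRequest. The caller-facing API is unchanged; a minimal usage sketch, not part of the diff (the image path is a placeholder):

import numpy as np
from PIL import Image

from vision_agent.tools import florence2_sam2_image

# Load an RGB image as a numpy array ("room.jpg" is a placeholder path).
image = np.array(Image.open("room.jpg").convert("RGB"))

# One dict per detected instance: label, normalized bbox, decoded
# segmentation mask, and a fixed score of 1.0.
detections = florence2_sam2_image("sofa, lamp", image)
for det in detections:
    print(det["label"], det["bbox"], det["mask"].shape)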
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_video_tracking",
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length  # type: ignore
-    data: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for frame_i in data.keys():
+    for frame in detections:
         return_frame_data = []
-        for obj_id, data_j in data[frame_i].items():
-            mask = rle_decode_array(data_j["mask"])
-            label = obj_id + ": " + data_j["label"]
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with nornmalized coordinates, and confidence score.
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -1773,6 +1772,82 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
+def flux_image_inpainting(
+    prompt: str,
+    image: np.ndarray,
+    mask: np.ndarray,
+) -> np.ndarray:
+    """'flux_image_inpainting' performs image inpainting to fill the masked regions,
+    given by mask, of the image based on the text prompt and surrounding image context.
+    It can be used to edit regions of an image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the masked area. More detailed and specific prompts typically yield better results.
+        image (np.ndarray): The source image to be inpainted.
+            The image will serve as the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's,
+            where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+
+    Returns:
+        np.ndarray:
+            The generated image as a numpy array in RGB format
+            with values ranging from 0 to 255.
+
+    Example
+    -------
+        >>> # Generate inpainting
+        >>> result = flux_image_inpainting(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ...     mask=mask,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    if (
+        image.shape[0] < 8
+        or image.shape[1] < 8
+        or mask.shape[0] < 8
+        or mask.shape[1] < 8
+    ):
+        raise ValueError("The image or mask does not have enough size for inpainting")
+
+    if np.array_equal(mask, mask.astype(bool).astype(int)):
+        mask = np.where(mask > 0, 255, 0).astype(np.uint8)
+    else:
+        raise ValueError("The mask should be a binary mask with 0's and 1's")
+
+    image_file = numpy_to_bytes(image)
+    mask_file = numpy_to_bytes(mask)
+
+    files = [
+        ("image", image_file),
+        ("mask_image", mask_file),
+    ]
+
+    payload = {
+        "prompt": prompt,
+        "task": "inpainting",
+        "height": image.shape[0],
+        "width": image.shape[1],
+        "strength": 0.99,
+        "guidance_scale": 18,
+        "num_inference_steps": 20,
+        "seed": None,
+    }
+
+    response = send_inference_request(
+        payload=payload,
+        endpoint_name="flux1",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "flux_image_inpainting"},
+    )
+
+    output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    return output_image
+
+
 # Utility and visualization functions
 
 
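For the new flux_image_inpainting tool, a minimal end-to-end sketch, not part of the diff (paths are placeholders, and the rectangular mask is hand-built for illustration where a segmentation tool's mask would normally be used):

import numpy as np
from PIL import Image

from vision_agent.tools import flux_image_inpainting, save_image

image = np.array(Image.open("room.jpg").convert("RGB"))

# Binary mask of 0's and 1's, as the function requires; here the central
# quarter of the image is marked for inpainting.
mask = np.zeros(image.shape[:2], dtype=np.uint8)
h, w = mask.shape
mask[h // 4 : 3 * h // 4, w // 4 : 3 * w // 4] = 1

result = flux_image_inpainting(
    prompt="a modern black leather sofa with white pillows",
    image=image,
    mask=mask,
)
save_image(result, "inpainted_room.png")
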
{vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.183
+Version: 0.2.185
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
{vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/RECORD
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=17wZ4ZsoSTZZaiqBTi6pqAKUr-qf58_T_zH2GXOi1KU,2771
+vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=vc0T940b-rRiGAOJttn7BsuCpVh9rJaivOmorpE41AA,81134
+vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.183.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.183.dist-info/METADATA,sha256=9V38VymRic0fe2uqCIjl3nhuVJYx49ZQox69izWD8k8,18330
-vision_agent-0.2.183.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.183.dist-info/RECORD,,
+vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
+vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.185.dist-info/RECORD,,