vision-agent 0.2.183__py3-none-any.whl → 0.2.185__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

vision_agent/tools/__init__.py
@@ -68,6 +68,7 @@ from .tools import (
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
     video_temporal_localization,
+    flux_image_inpainting,
 )
 
 __new_tools__ = [
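
With this export, the new tool is reachable from the package's public surface. A minimal sketch (assuming vision-agent 0.2.185 is installed):

    from vision_agent.tools import flux_image_inpainting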

vision_agent/tools/tools.py
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )
 
-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id
 
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_image",
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
 
+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for _, data_i in detections["0"].items():
-        mask = rle_decode_array(data_i["mask"])
-        label = data_i["label"]
-        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
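
The net effect on florence2_sam2_image: the base and fine-tuned paths now share one payload and both go through send_task_inference_request against the "text-to-instance-segmentation" task, and the response is indexed as a list of frames rather than a dict keyed by frame number. A hedged sketch of calling the updated tool; the image path and fine-tune job ID are illustrative assumptions, not part of this diff:

    import numpy as np
    from PIL import Image

    from vision_agent.tools import florence2_sam2_image

    image = np.array(Image.open("room.jpg").convert("RGB"))  # illustrative input

    # Base model: comma-separated categories in a single prompt string
    detections = florence2_sam2_image("couch, pillow", image)

    # Fine-tuned model: the job must have SUCCEEDED, else FineTuneModelIsNotReady is raised
    detections = florence2_sam2_image(
        "couch, pillow",
        image,
        fine_tune_id="00000000-0000-0000-0000-000000000000",  # hypothetical job ID
    )

    for det in detections:
        print(det["label"], det["score"], det["bbox"])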
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_video_tracking",
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length  # type: ignore
-    data: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for frame_i in data.keys():
+    for frame in detections:
         return_frame_data = []
-        for obj_id, data_j in data[frame_i].items():
-            mask = rle_decode_array(data_j["mask"])
-            label = obj_id + ": " + data_j["label"]
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
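
florence2_sam2_video_tracking picks up the same changes, plus an explicit guard against empty frame lists, a renamed chunk_length_frames payload key, and labels prefixed with the tracker's object ID. A hedged usage sketch under the same assumptions (placeholder frames stand in for real video frames):

    import numpy as np

    from vision_agent.tools import florence2_sam2_video_tracking

    # A non-empty list of RGB frames is required; an empty list now raises ValueError
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]  # placeholders

    tracks = florence2_sam2_video_tracking(
        "person",
        frames,
        chunk_length=10,    # florence2 re-detects new objects every 10 frames
        fine_tune_id=None,  # or the ID of a SUCCEEDED fine-tuning job
    )

    for frame_detections in tracks:
        for det in frame_detections:
            print(det["label"])  # e.g. "0: person", the object ID prefixes the label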
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with nornmalized coordinates, and confidence score.
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -1773,6 +1772,82 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
+def flux_image_inpainting(
+    prompt: str,
+    image: np.ndarray,
+    mask: np.ndarray,
+) -> np.ndarray:
+    """'flux_image_inpainting' performs image inpainting to fill the masked regions
+    of the image, given by mask, based on the text prompt and surrounding image
+    context. It can be used to edit regions of an image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the masked area. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray): The source image to be inpainted. The image will serve
+            as the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+            areas to be inpainted and 0 indicates areas to be preserved.
+
+    Returns:
+        np.ndarray: The generated image as a numpy array in RGB format with values
+            ranging from 0 to 255.
+
+    Example
+    -------
+        >>> # Generate inpainting
+        >>> result = flux_image_inpainting(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ...     mask=mask,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    if (
+        image.shape[0] < 8
+        or image.shape[1] < 8
+        or mask.shape[0] < 8
+        or mask.shape[1] < 8
+    ):
+        raise ValueError("The image or mask does not have enough size for inpainting")
+
+    if np.array_equal(mask, mask.astype(bool).astype(int)):
+        mask = np.where(mask > 0, 255, 0).astype(np.uint8)
+    else:
+        raise ValueError("The mask should be a binary mask with 0's and 1's")
+
+    image_file = numpy_to_bytes(image)
+    mask_file = numpy_to_bytes(mask)
+
+    files = [
+        ("image", image_file),
+        ("mask_image", mask_file),
+    ]
+
+    payload = {
+        "prompt": prompt,
+        "task": "inpainting",
+        "height": image.shape[0],
+        "width": image.shape[1],
+        "strength": 0.99,
+        "guidance_scale": 18,
+        "num_inference_steps": 20,
+        "seed": None,
+    }
+
+    response = send_inference_request(
+        payload=payload,
+        endpoint_name="flux1",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "flux_image_inpainting"},
+    )
+
+    output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    return output_image
+
+
 # Utility and visualization functions
 
 
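flux_image_inpainting is the substantive addition in 0.2.185: it validates a strictly binary (0/1) mask, requires both inputs to be at least 8 pixels per side, and posts fixed FLUX.1 generation settings (strength 0.99, guidance_scale 18, 20 inference steps) to the "flux1" endpoint. A minimal sketch of preparing a mask and calling it; the file path and rectangle are illustrative assumptions:

    import numpy as np
    from PIL import Image

    from vision_agent.tools import flux_image_inpainting, save_image

    image = np.array(Image.open("room.jpg").convert("RGB"))  # illustrative input

    # Binary 0/1 mask: 1 marks the region to regenerate, 0 preserves the original
    mask = np.zeros(image.shape[:2], dtype=np.uint8)
    mask[100:300, 200:500] = 1  # illustrative region to inpaint

    result = flux_image_inpainting(
        prompt="a modern black leather sofa with white pillows",
        image=image,
        mask=mask,
    )
    save_image(result, "inpainted_room.png")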

vision_agent-0.2.185.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.183
+Version: 0.2.185
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

vision_agent-0.2.185.dist-info/RECORD
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=17wZ4ZsoSTZZaiqBTi6pqAKUr-qf58_T_zH2GXOi1KU,2771
+vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=vc0T940b-rRiGAOJttn7BsuCpVh9rJaivOmorpE41AA,81134
+vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.183.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.183.dist-info/METADATA,sha256=9V38VymRic0fe2uqCIjl3nhuVJYx49ZQox69izWD8k8,18330
-vision_agent-0.2.183.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.183.dist-info/RECORD,,
+vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
+vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.185.dist-info/RECORD,,