vision-agent 0.2.183__py3-none-any.whl → 0.2.185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +133 -58
- {vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/METADATA +1 -1
- {vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/RECORD +6 -6
- {vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/WHEEL +0 -0
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )
 
-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id
 
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "function_name": "florence2_sam2_image",
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
 
+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for detection in detections["data"]:
-        mask = rle_decode_array(detection["mask"])
-        label = detection["label"]
-        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
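The rewrite above gives `florence2_sam2_image` a single call path: one payload is built up front, a fine-tuned model is selected by setting `payload["jobId"]`, and both cases go through `send_task_inference_request` against the `text-to-instance-segmentation` task. A minimal usage sketch (the image file and the fine-tune job ID are placeholders, and loading via PIL is an assumption, not part of this diff):

    import numpy as np
    from PIL import Image

    from vision_agent.tools import florence2_sam2_image

    image = np.array(Image.open("image.jpg").convert("RGB"))

    # Base model: one request against the text-to-instance-segmentation task
    detections = florence2_sam2_image("person, car", image)

    # Fine-tuned model: same call path; the ID is forwarded as payload["jobId"]
    # (must parse as a UUID, since the code calls UUID(fine_tune_id))
    detections_ft = florence2_sam2_image(
        "person, car",
        image,
        fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder UUID
    )

    # Each entry carries "label", a normalized "bbox", a decoded "mask", and "score" 1.0
    print([d["label"] for d in detections])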
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompt": prompt,
-        "function_name": "florence2_sam2_video_tracking",
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length  # type: ignore
-    data: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2-video", files=files, v2=True
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for frame_i in data["data"]:
+    for frame in detections:
         return_frame_data = []
-        for obj in frame_i:
-            mask = rle_decode_array(obj["mask"])
-            label = obj["id"] + ": " + obj["label"]
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
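`florence2_sam2_video_tracking` gains the same fine-tuning hook, an explicit empty-input check, and a renamed chunking field (`chunk_length_frames`) in the payload. A short sketch of calling it on frames decoded with OpenCV (the video path is a placeholder and decoding via cv2 is an assumption, not part of this diff):

    import cv2

    from vision_agent.tools import florence2_sam2_video_tracking

    # Collect RGB frames; the function now raises ValueError on an empty list
    cap = cv2.VideoCapture("video.mp4")
    frames = []
    while len(frames) < 30:
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()

    # Re-detect new objects every 3 frames (the default chunk_length)
    tracks = florence2_sam2_video_tracking("car", frames, chunk_length=3)

    # Labels are now prefixed with a per-object track ID, e.g. "0: car"
    print([d["label"] for d in tracks[0]])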
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with nornmalized coordinates, and confidence score.
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visualizaiton purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visualizaiton purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -1773,6 +1772,82 @@ def closest_box_distance(
     return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
 
 
+def flux_image_inpainting(
+    prompt: str,
+    image: np.ndarray,
+    mask: np.ndarray,
+) -> np.ndarray:
+    """'flux_image_inpainting' performs image inpainting to fill the masked regions,
+    given by mask, in the image based on the text prompt and surrounding image context.
+    It can be used to edit regions of an image according to the prompt given.
+
+    Parameters:
+        prompt (str): A detailed text description guiding what should be generated
+            in the masked area. More detailed and specific prompts typically yield better results.
+        image (np.ndarray): The source image to be inpainted.
+            The image will serve as the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's,
+            where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+
+    Returns:
+        np.ndarray:
+            The generated image(s) as a numpy array in RGB format
+            with values ranging from 0 to 255.
+
+    Example
+    -------
+        >>> # Generate inpainting
+        >>> result = flux_image_inpainting(
+        ...     prompt="a modern black leather sofa with white pillows",
+        ...     image=image,
+        ...     mask=mask,
+        ... )
+        >>> save_image(result, "inpainted_room.png")
+    """
+    if (
+        image.shape[0] < 8
+        or image.shape[1] < 8
+        or mask.shape[0] < 8
+        or mask.shape[1] < 8
+    ):
+        raise ValueError("The image or mask does not have enough size for inpainting")
+
+    if np.array_equal(mask, mask.astype(bool).astype(int)):
+        mask = np.where(mask > 0, 255, 0).astype(np.uint8)
+    else:
+        raise ValueError("The mask should be a binary mask with 0's and 1's")
+
+    image_file = numpy_to_bytes(image)
+    mask_file = numpy_to_bytes(mask)
+
+    files = [
+        ("image", image_file),
+        ("mask_image", mask_file),
+    ]
+
+    payload = {
+        "prompt": prompt,
+        "task": "inpainting",
+        "height": image.shape[0],
+        "width": image.shape[1],
+        "strength": 0.99,
+        "guidance_scale": 18,
+        "num_inference_steps": 20,
+        "seed": None,
+    }
+
+    response = send_inference_request(
+        payload=payload,
+        endpoint_name="flux1",
+        files=files,
+        v2=True,
+        metadata_payload={"function_name": "flux_image_inpainting"},
+    )
+
+    output_image = np.array(b64_to_pil(response[0]).convert("RGB"))
+    return output_image
+
+
 # Utility and visualization functions
 
 
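The validation in the new `flux_image_inpainting` is strict: image and mask must be at least 8 pixels on each side, and the mask must contain only 0's and 1's (it is rescaled to 0/255 internally before upload). A sketch of building a valid rectangular mask (the blank test image and region coordinates are arbitrary; importing the tool from `vision_agent.tools` assumes the one-line `__init__.py` change exports it, and `save_image` is the helper the docstring example already uses):

    import numpy as np

    from vision_agent.tools import flux_image_inpainting, save_image

    image = np.zeros((512, 512, 3), dtype=np.uint8)  # stand-in for a real photo

    # Binary mask: 1 marks the region to repaint, 0 is preserved
    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[100:300, 150:350] = 1

    result = flux_image_inpainting(
        prompt="a modern black leather sofa with white pillows",
        image=image,
        mask=mask,
    )
    save_image(result, "inpainted_room.png")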
{vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/RECORD
CHANGED
@@ -16,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-xo,2798
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.183.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.183.dist-info/METADATA,sha256=
-vision_agent-0.2.183.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.183.dist-info/RECORD,,
+vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
+vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.185.dist-info/RECORD,,
{vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/LICENSE
File without changes
{vision_agent-0.2.183.dist-info → vision_agent-0.2.185.dist-info}/WHEEL
File without changes