vision-agent 0.2.184__py3-none-any.whl → 0.2.185__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tools.py +57 -58
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.185.dist-info}/METADATA +1 -1
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.185.dist-info}/RECORD +5 -5
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.185.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.185.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
|
|
28
28
|
send_task_inference_request,
|
29
29
|
)
|
30
30
|
from vision_agent.tools.tools_types import (
|
31
|
-
Florence2FtRequest,
|
32
31
|
JobStatus,
|
33
32
|
ODResponseData,
|
34
|
-
PromptTask,
|
35
33
|
)
|
36
34
|
from vision_agent.utils.exceptions import FineTuneModelIsNotReady
|
37
35
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
@@ -421,8 +419,15 @@ def florence2_sam2_image(
|
|
421
419
|
if image.shape[0] < 1 or image.shape[1] < 1:
|
422
420
|
return []
|
423
421
|
|
422
|
+
buffer_bytes = numpy_to_bytes(image)
|
423
|
+
files = [("image", buffer_bytes)]
|
424
|
+
payload = {
|
425
|
+
"prompt": prompt,
|
426
|
+
"model": "florence2sam2",
|
427
|
+
}
|
428
|
+
metadata = {"function_name": "florence2_sam2_image"}
|
429
|
+
|
424
430
|
if fine_tune_id is not None:
|
425
|
-
image_b64 = convert_to_b64(image)
|
426
431
|
landing_api = LandingPublicAPI()
|
427
432
|
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
428
433
|
if status is not JobStatus.SUCCEEDED:
|
@@ -430,58 +435,31 @@ def florence2_sam2_image(
|
|
430
435
|
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
431
436
|
)
|
432
437
|
|
433
|
-
|
434
|
-
image=image_b64,
|
435
|
-
task=PromptTask.PHRASE_GROUNDING,
|
436
|
-
prompt=prompt,
|
437
|
-
postprocessing="sam2",
|
438
|
-
job_id=UUID(fine_tune_id),
|
439
|
-
)
|
440
|
-
req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
|
441
|
-
detections_ft = send_inference_request(
|
442
|
-
req_data,
|
443
|
-
"florence2-ft",
|
444
|
-
v2=True,
|
445
|
-
is_form=True,
|
446
|
-
metadata_payload={"function_name": "florence2_sam2_image"},
|
447
|
-
)
|
448
|
-
# get the first frame
|
449
|
-
detection = detections_ft[0]
|
450
|
-
return_data = []
|
451
|
-
for i in range(len(detection["bboxes"])):
|
452
|
-
return_data.append(
|
453
|
-
{
|
454
|
-
"score": 1.0,
|
455
|
-
"label": detection["labels"][i],
|
456
|
-
"bbox": normalize_bbox(
|
457
|
-
detection["bboxes"][i], detection["masks"][i]["size"]
|
458
|
-
),
|
459
|
-
"mask": rle_decode_array(detection["masks"][i]),
|
460
|
-
}
|
461
|
-
)
|
462
|
-
return return_data
|
438
|
+
payload["jobId"] = fine_tune_id
|
463
439
|
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
}
|
470
|
-
detections: Dict[str, Any] = send_inference_request(
|
471
|
-
payload, "florence2-sam2", files=files, v2=True
|
440
|
+
detections = send_task_inference_request(
|
441
|
+
payload,
|
442
|
+
"text-to-instance-segmentation",
|
443
|
+
files=files,
|
444
|
+
metadata=metadata,
|
472
445
|
)
|
473
446
|
|
447
|
+
# get the first frame
|
448
|
+
frame = detections[0]
|
474
449
|
return_data = []
|
475
|
-
for
|
476
|
-
mask = rle_decode_array(
|
477
|
-
label =
|
478
|
-
bbox = normalize_bbox(
|
450
|
+
for detection in frame:
|
451
|
+
mask = rle_decode_array(detection["mask"])
|
452
|
+
label = detection["label"]
|
453
|
+
bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
|
479
454
|
return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
|
480
455
|
return return_data
|
481
456
|
|
482
457
|
|
483
458
|
def florence2_sam2_video_tracking(
|
484
|
-
prompt: str,
|
459
|
+
prompt: str,
|
460
|
+
frames: List[np.ndarray],
|
461
|
+
chunk_length: Optional[int] = 3,
|
462
|
+
fine_tune_id: Optional[str] = None,
|
485
463
|
) -> List[List[Dict[str, Any]]]:
|
486
464
|
"""'florence2_sam2_video_tracking' is a tool that can segment and track multiple
|
487
465
|
entities in a video given a text prompt such as category names or referring
|
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
|
|
494
472
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
495
473
|
chunk_length (Optional[int]): The number of frames to re-run florence2 to find
|
496
474
|
new objects.
|
475
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
476
|
+
fine-tuned model ID here to use it.
|
497
477
|
|
498
478
|
Returns:
|
499
479
|
List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
|
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
|
|
519
499
|
...
|
520
500
|
]
|
521
501
|
"""
|
502
|
+
if len(frames) == 0:
|
503
|
+
raise ValueError("No frames provided")
|
522
504
|
|
523
505
|
buffer_bytes = frames_to_bytes(frames)
|
524
506
|
files = [("video", buffer_bytes)]
|
525
507
|
payload = {
|
526
|
-
"
|
527
|
-
"
|
508
|
+
"prompt": prompt,
|
509
|
+
"model": "florence2sam2",
|
528
510
|
}
|
511
|
+
metadata = {"function_name": "florence2_sam2_video_tracking"}
|
512
|
+
|
529
513
|
if chunk_length is not None:
|
530
|
-
payload["
|
531
|
-
|
532
|
-
|
514
|
+
payload["chunk_length_frames"] = chunk_length # type: ignore
|
515
|
+
|
516
|
+
if fine_tune_id is not None:
|
517
|
+
landing_api = LandingPublicAPI()
|
518
|
+
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
519
|
+
if status is not JobStatus.SUCCEEDED:
|
520
|
+
raise FineTuneModelIsNotReady(
|
521
|
+
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
522
|
+
)
|
523
|
+
|
524
|
+
payload["jobId"] = fine_tune_id
|
525
|
+
|
526
|
+
detections = send_task_inference_request(
|
527
|
+
payload,
|
528
|
+
"text-to-instance-segmentation",
|
529
|
+
files=files,
|
530
|
+
metadata=metadata,
|
533
531
|
)
|
532
|
+
|
534
533
|
return_data = []
|
535
|
-
for
|
534
|
+
for frame in detections:
|
536
535
|
return_frame_data = []
|
537
|
-
for
|
538
|
-
mask = rle_decode_array(
|
539
|
-
label =
|
536
|
+
for detection in frame:
|
537
|
+
mask = rle_decode_array(detection["mask"])
|
538
|
+
label = str(detection["id"]) + ": " + detection["label"]
|
540
539
|
return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
|
541
540
|
return_data.append(return_frame_data)
|
542
541
|
return return_data
|
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
552
551
|
|
553
552
|
Returns:
|
554
553
|
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
555
|
-
with
|
554
|
+
with normalized coordinates, and confidence score.
|
556
555
|
|
557
556
|
Example
|
558
557
|
-------
|
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
|
|
608
607
|
|
609
608
|
Returns:
|
610
609
|
Dict[str, Any]: A dictionary containing the key 'count' and the count as a
|
611
|
-
value, e.g. {count: 12} and a heat map for
|
610
|
+
value, e.g. {count: 12} and a heat map for visualization purposes.
|
612
611
|
|
613
612
|
Example
|
614
613
|
-------
|
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
|
|
647
646
|
|
648
647
|
Returns:
|
649
648
|
Dict[str, Any]: A dictionary containing the key 'count' and the count as a
|
650
|
-
value, e.g. {count: 12} and a heat map for
|
649
|
+
value, e.g. {count: 12} and a heat map for visualization purposes.
|
651
650
|
|
652
651
|
Example
|
653
652
|
-------
|
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
|
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
31
|
vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
|
34
|
+
vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.185.dist-info/RECORD,,
|
File without changes
|
File without changes
|