vision-agent 0.2.184__py3-none-any.whl → 0.2.186__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tools.py +70 -73
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/METADATA +3 -3
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/RECORD +5 -5
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )
 
-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id
 
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        …
-        …
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
 
+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for …
-        mask = rle_decode_array(…
-        label = …
-        bbox = normalize_bbox(…
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
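Note: both the stock and fine-tuned paths of florence2_sam2_image now share one payload and go through send_task_inference_request; a fine-tuned model only adds a jobId key. A minimal usage sketch (image contents and prompt are hypothetical):

    import numpy as np
    from vision_agent.tools import florence2_sam2_image

    # hypothetical RGB input image
    image = np.zeros((480, 640, 3), dtype=np.uint8)

    # each detection carries "label", "bbox", "mask" and "score"; bboxes are
    # normalized to the image size and masks come back RLE-decoded
    detections = florence2_sam2_image("person, car", image)

    # with a fine-tuned model, the job must have SUCCEEDED first, otherwise
    # FineTuneModelIsNotReady is raised:
    # detections = florence2_sam2_image("person", image, fine_tune_id="<job-uuid>")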
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "…
-        "…
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["…
-    …
-    …
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for …
+    for frame in detections:
         return_frame_data = []
-        for …
-            mask = rle_decode_array(…
-            label = …
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
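Note: the video variant now mirrors the image path, including the fine_tune_id gate, and rejects empty frame lists up front; labels come back prefixed with a track id. A sketch with synthetic frames (shapes and prompt are assumptions):

    import numpy as np
    from vision_agent.tools import florence2_sam2_video_tracking

    # synthetic stand-ins for decoded video frames
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(9)]

    # florence2 re-detects new objects every chunk_length frames and
    # sam2 propagates the masks in between
    tracks = florence2_sam2_video_tracking("person", frames, chunk_length=3)

    for frame_detections in tracks:
        for det in frame_detections:
            print(det["label"])  # id-prefixed, e.g. "0: person"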
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with …
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for …
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
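Note: the repaired sentence describes the return shape shared by both counting tools; a minimal sketch, assuming the heat map is returned under a 'heat_map' key:

    import numpy as np
    from vision_agent.tools import loca_zero_shot_counting

    image = np.zeros((512, 512, 3), dtype=np.uint8)  # hypothetical input
    result = loca_zero_shot_counting(image)
    print(result["count"])         # e.g. 12
    heat_map = result["heat_map"]  # heat map for visualization purposes (assumed key)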
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for …
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -1058,23 +1057,25 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length: Optional[float] = None,
-    chunk_length_seconds: Optional[float] = None,
     chunk_length_frames: Optional[int] = 2,
 ) -> List[float]:
-    """'video_temporal_localization' …
-    …
+    """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.
 
     Parameters:
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are …
-            …
-        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
-        List[float]: A list of floats with a value of 1.0 if the …
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+            are present in the chunk_length_frames of the video.
 
     Example
     -------
@@ -1089,10 +1090,6 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
-    if chunk_length_seconds is not None:
-        payload["chunk_length_seconds"] = chunk_length_seconds
     if chunk_length_frames is not None:
         payload["chunk_length_frames"] = chunk_length_frames
 
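Note: with chunk_length and chunk_length_seconds removed, chunk_length_frames is the only remaining chunking control. A usage sketch (frames and prompt are synthetic):

    import numpy as np
    from vision_agent.tools import video_temporal_localization

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]

    # one float per chunk: 1.0 when the prompt is found in that chunk
    scores = video_temporal_localization(
        "Does a person enter the room?",
        frames,
        model="qwen2vl",
        chunk_length_frames=2,
    )
    print(scores)  # e.g. [0.0, 0.0, 1.0, 1.0]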
@@ -1791,9 +1788,8 @@ def flux_image_inpainting(
             where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
 
     Returns:
-        np.ndarray: …
-            …
-            with values ranging from 0 to 255.
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
 
     -------
     Example:
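Note: the clarified return type is easiest to see in use; a sketch that assumes the (prompt, image, mask) positional order implied by the docstring:

    import numpy as np
    from vision_agent.tools import flux_image_inpainting

    image = np.zeros((512, 512, 3), dtype=np.uint8)  # hypothetical source image
    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[128:256, 128:256] = 1  # 1 marks areas to inpaint, 0 areas to preserve

    result = flux_image_inpainting("a red sports car", image, mask)
    print(result.shape, result.dtype)  # RGB output with values from 0 to 255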
@@ -2352,6 +2348,7 @@ FUNCTION_TOOLS = [
     closest_box_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    video_temporal_localization,
 ]
 
 UTIL_TOOLS = [
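Note: registering video_temporal_localization in FUNCTION_TOOLS is what exposes it to agents that enumerate this list. A quick check, assuming FUNCTION_TOOLS stays importable from the module this diff touches:

    from vision_agent.tools.tools import FUNCTION_TOOLS

    # the newly registered tool should now be discoverable by name
    assert any(t.__name__ == "video_temporal_localization" for t in FUNCTION_TOOLS)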
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.184
+Version: 0.2.186
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 ## Table of Contents
 - [🚀Quick Start](#quick-start)
 - [📚Documentation](#documentation)
-- [🔍🤖VisionAgent](#…)
+- [🔍🤖VisionAgent](#visionagent-basic-usage)
 - [🛠️Tools](#tools)
 - [🤖LMMs](#lmms)
-- [💻🤖VisionAgent Coder](#…)
+- [💻🤖VisionAgent Coder](#visionagent-coder)
 - [🏗️Additional Backends](#additional-backends)
 
 ## Quick Start
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/RECORD
CHANGED
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tools.py,sha256=-oq8jzITi-yVYJ3ut5MuGJ65jd3ESRtHfw4SCAruMps,83059
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.184.dist-info/METADATA,sha256=…
-vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.184.dist-info/RECORD,,
+vision_agent-0.2.186.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.186.dist-info/METADATA,sha256=NQfESIRsq9-QWyPzNkyv6dSuRS6TGe5D2tZH4iJpeBU,18328
+vision_agent-0.2.186.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.186.dist-info/RECORD,,

{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/LICENSE
File without changes

{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/WHEEL
File without changes