vision-agent 0.2.184__py3-none-any.whl → 0.2.186__py3-none-any.whl
- vision_agent/tools/tools.py +70 -73
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/METADATA +3 -3
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/RECORD +5 -5
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []
 
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )
 
-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id
 
-
-
-
-
-
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
 
+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for
-        mask = rle_decode_array(
-        label =
-        bbox = normalize_bbox(
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
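Taken together, the hunks above collapse the separate fine-tuned and base code paths of `florence2_sam2_image` into a single `send_task_inference_request` call, with a fine-tuned model selected via `payload["jobId"]` instead of a dedicated `florence2-ft` request. A minimal usage sketch of the updated function, assuming the usual top-level re-export from `vision_agent.tools` and a placeholder fine-tuning job ID:

```python
import numpy as np

from vision_agent.tools import florence2_sam2_image

# Any RGB image as a numpy array; a blank canvas stands in for a real photo here.
image = np.zeros((480, 640, 3), dtype=np.uint8)

# Base model: returns [{"score", "label", "bbox", "mask"}, ...] for the image.
detections = florence2_sam2_image("dog", image)

# Fine-tuned model: the UUID below is a placeholder for a real job ID; per the
# diff, the call raises FineTuneModelIsNotReady until that job is SUCCEEDED.
detections_ft = florence2_sam2_image(
    "dog",
    image,
    fine_tune_id="00000000-0000-0000-0000-000000000000",
)

for det in detections:
    print(det["label"], det["score"], det["bbox"])  # bbox is normalized to the image size
```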
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "
-        "
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length  # type: ignore
-    detections = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for
+    for frame in detections:
         return_frame_data = []
-        for
-            mask = rle_decode_array(
-            label =
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
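`florence2_sam2_video_tracking` gets the same treatment: the new `fine_tune_id` parameter, the renamed `chunk_length_frames` payload key, and an explicit guard against empty input. A sketch against the new signature, with placeholder frames (a real caller would decode them from a video file):

```python
import numpy as np

from vision_agent.tools import florence2_sam2_video_tracking

# Ten dummy frames; florence2 is re-run every chunk_length frames to find new objects.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

tracks = florence2_sam2_video_tracking("player", frames, chunk_length=3)

for frame_detections in tracks:
    for det in frame_detections:
        # Per the diff, labels are now prefixed with a track id, e.g. "0: player".
        print(det["label"], det["score"])

# Passing an empty list now fails fast with ValueError("No frames provided").
```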
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for
+            value, e.g. {count: 12} and a heat map for visualization purposes.
 
     Example
     -------
@@ -1058,23 +1057,25 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length: Optional[float] = None,
-    chunk_length_seconds: Optional[float] = None,
     chunk_length_frames: Optional[int] = 2,
 ) -> List[float]:
-    """'video_temporal_localization'
-
+    """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.
 
     Parameters:
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are
-
-        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
         chunk_length_frames (Optional[int]): length of each chunk in frames
 
     Returns:
-        List[float]: A list of floats with a value of 1.0 if the
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+            are present in the chunk_length_frames of the video.
 
     Example
     -------
@@ -1089,10 +1090,6 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
-    if chunk_length_seconds is not None:
-        payload["chunk_length_seconds"] = chunk_length_seconds
     if chunk_length_frames is not None:
         payload["chunk_length_frames"] = chunk_length_frames
 
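With `chunk_length` and `chunk_length_seconds` removed, chunking is controlled solely by `chunk_length_frames`; callers still passing the old keyword arguments will now get a `TypeError`. A sketch of a call against the new signature, again with placeholder frames:

```python
import numpy as np

from vision_agent.tools import video_temporal_localization

frames = [np.zeros((360, 640, 3), dtype=np.uint8) for _ in range(8)]

# One float per 2-frame chunk: 1.0 when the prompted object appears in that chunk.
hits = video_temporal_localization(
    "is there a person holding a cup?",
    frames,
    model="qwen2vl",
    chunk_length_frames=2,
)
print(hits)  # e.g. [0.0, 1.0, 1.0, 0.0] for 8 frames split into 4 chunks
```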
@@ -1791,9 +1788,8 @@ def flux_image_inpainting(
         where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
 
     Returns:
-        np.ndarray:
-
-            with values ranging from 0 to 255.
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.
 
     -------
     Example:
@@ -2352,6 +2348,7 @@ FUNCTION_TOOLS = [
     closest_box_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    video_temporal_localization,
 ]
 
 UTIL_TOOLS = [
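Appending `video_temporal_localization` to `FUNCTION_TOOLS` registers it alongside the other callable tools, with `UTIL_TOOLS` (visible at the end of the hunk) kept as a separate list. A quick sanity check, assuming both names are importable from `vision_agent.tools.tools` where this diff defines them:

```python
from vision_agent.tools.tools import FUNCTION_TOOLS, video_temporal_localization

# The tool is now part of the default function-tool set.
assert video_temporal_localization in FUNCTION_TOOLS
print(sorted(t.__name__ for t in FUNCTION_TOOLS))
```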
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.184
+Version: 0.2.186
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 ## Table of Contents
 - [🚀Quick Start](#quick-start)
 - [📚Documentation](#documentation)
-- [🔍🤖VisionAgent](#
+- [🔍🤖VisionAgent](#visionagent-basic-usage)
 - [🛠️Tools](#tools)
 - [🤖LMMs](#lmms)
-- [💻🤖VisionAgent Coder](#
+- [💻🤖VisionAgent Coder](#visionagent-coder)
 - [🏗️Additional Backends](#additional-backends)
 
 ## Quick Start
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/RECORD
CHANGED
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256
+vision_agent/tools/tools.py,sha256=-oq8jzITi-yVYJ3ut5MuGJ65jd3ESRtHfw4SCAruMps,83059
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.184.dist-info/METADATA,sha256
-vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.184.dist-info/RECORD,,
+vision_agent-0.2.186.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.186.dist-info/METADATA,sha256=NQfESIRsq9-QWyPzNkyv6dSuRS6TGe5D2tZH4iJpeBU,18328
+vision_agent-0.2.186.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.186.dist-info/RECORD,,
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/LICENSE
File without changes
{vision_agent-0.2.184.dist-info → vision_agent-0.2.186.dist-info}/WHEEL
File without changes