vision-agent 0.2.152__tar.gz → 0.2.153__tar.gz
- {vision_agent-0.2.152 → vision_agent-0.2.153}/PKG-INFO +1 -1
- {vision_agent-0.2.152 → vision_agent-0.2.153}/pyproject.toml +1 -1
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent_prompts.py +7 -7
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/__init__.py +1 -1
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/meta_tools.py +6 -2
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tool_utils.py +9 -4
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tools.py +152 -51
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tools_types.py +8 -13
- {vision_agent-0.2.152 → vision_agent-0.2.153}/LICENSE +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/README.md +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -101,7 +101,7 @@ plan1:
     - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-    - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+    - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
     - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:
 
 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
 
 # plan2
-florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
 
 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding": florence2_out,
+    "florence2_phrase_grounding_image": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}
 
 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding": florence2_counts,
+    "florence2_phrase_grounding_image": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}
 
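
The hunks above only swap `florence2_phrase_grounding` for the new `florence2_phrase_grounding_image` in the planning examples. For orientation, a minimal sketch of what plan2 now does end to end; it mirrors the prompt code above, assumes a local 'video.mp4' and API access, and is illustrative rather than verified usage.

```python
# Minimal sketch of plan2 from the prompt above, using the renamed per-image tool.
from vision_agent.tools import extract_frames_and_timestamps, florence2_phrase_grounding_image

# sample the video at 1 FPS, as in the prompt example
frames = extract_frames_and_timestamps("video.mp4", 1)

# run the image grounding tool on every sampled frame (plan2)
florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]

# count people per frame
counts = [len(detections) for detections in florence2_out]
print(counts)
```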

{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/agent/vision_agent_prompts.py
RENAMED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding("worker, helmet", image)
+3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
     image = load_image(image_path)
--    detections = florence2_phrase_grounding("worker, helmet", image)
-+    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding_image("worker, helmet", image)
++    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
     workers = [d for d in detections if d['label'] == 'worker']
     helmets = [d for d in detections if d['label'] == 'helmet']
     count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """

{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/meta_tools.py
RENAMED
@@ -668,8 +668,12 @@ def use_object_detection_fine_tuning(
 
     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+        ),
+        (
+            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',

{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tool_utils.py
RENAMED
@@ -1,6 +1,6 @@
+import os
 import inspect
 import logging
-import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
@@ -37,8 +37,9 @@ def send_inference_request(
     files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
-    # TODO: runtime_tag and function_name should be metadata_payload and
+    # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
     if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
         payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
     elif metadata_payload is not None and "function_name" in metadata_payload:
         function_name = metadata_payload["function_name"]
 
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
 
     # TODO: consider making the response schema the same between below two sources
     return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
     task_name: str,
     files: Optional[List[Tuple[Any, ...]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
     function_name = "unknown"
     if metadata is not None and "function_name" in metadata:
         function_name = metadata["function_name"]
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
     return response["data"]
 
 
@@ -203,6 +205,7 @@ def _call_post(
     session: Session,
     files: Optional[List[Tuple[Any, ...]]] = None,
     function_name: str = "unknown",
+    is_form: bool = False,
 ) -> Any:
     files_in_b64 = None
     if files:
@@ -210,6 +213,8 @@ def _call_post(
     try:
         if files is not None:
             response = session.post(url, data=payload, files=files)
+        elif is_form:
+            response = session.post(url, data=payload)
         else:
             response = session.post(url, json=payload)
 
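
The new `is_form` flag threads from `send_inference_request` and `send_task_inference_request` down to `_call_post`, which now has three posting modes. A simplified sketch of that branching with a hypothetical helper name, not the library's own function:

```python
# Sketch of the three posting modes after this change: multipart when files are
# given, form-encoded when is_form=True, JSON otherwise.
from typing import Any, Dict, List, Optional, Tuple
from requests import Session

def post_payload(
    url: str,
    payload: Dict[str, Any],
    session: Session,
    files: Optional[List[Tuple[Any, ...]]] = None,
    is_form: bool = False,
) -> Any:
    if files is not None:
        response = session.post(url, data=payload, files=files)  # multipart upload
    elif is_form:
        response = session.post(url, data=payload)  # form-encoded body
    else:
        response = session.post(url, json=payload)  # JSON body
    response.raise_for_status()
    return response.json()
```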

{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tools.py
RENAMED
@@ -1,3 +1,4 @@
+import base64
 import io
 import json
 import logging
@@ -28,7 +29,6 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    FineTuning,
     Florence2FtRequest,
     JobStatus,
     ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
-        detections = send_inference_request(
-
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "owl_v2_image"},
+        )
+        # get the first frame
+        detection = detections[0]
     bboxes_formatted = [
         ODResponseData(
-            label=
-            bbox=normalize_bbox(
+            label=detection["labels"][i],
+            bbox=normalize_bbox(detection["bboxes"][i], image_size),
             score=1.0,
         )
-        for i in range(len(
+        for i in range(len(detection["bboxes"]))
     ]
     return [bbox.model_dump() for bbox in bboxes_formatted]
 
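
The fine-tuned path above converts the returned pixel boxes with `normalize_bbox(detection["bboxes"][i], image_size)`. That helper is not part of this diff; the sketch below is only an assumption about the usual pixel-to-normalized conversion and is named differently to make that clear:

```python
# Hypothetical sketch of what a helper like normalize_bbox does: convert pixel
# [x1, y1, x2, y2] into 0-1 coordinates given an (height, width) image size.
from typing import List, Tuple

def normalize_bbox_sketch(bbox: List[float], image_size: Tuple[int, int]) -> List[float]:
    height, width = image_size
    x1, y1, x2, y2 = bbox
    return [round(x1 / width, 2), round(y1 / height, 2), round(x2 / width, 2), round(y2 / height, 2)]

print(normalize_bbox_sketch([64, 48, 320, 240], (480, 640)))  # [0.1, 0.1, 0.5, 0.5]
```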
@@ -419,25 +425,30 @@ def florence2_sam2_image(
         req_data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
-
-
-
+            postprocessing="sam2",
+            job_id=UUID(fine_tune_id),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections_ft = send_inference_request(
+            req_data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_sam2_image"},
         )
-
-
-        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        # get the first frame
+        detection = detections_ft[0]
         return_data = []
-
-        for i in range(len(detections_ft["bboxes"])):
+        for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
-                    "label":
-                    "bbox":
-
+                    "label": detection["labels"][i],
+                    "bbox": normalize_bbox(
+                        detection["bboxes"][i], detection["masks"][i]["size"]
+                    ),
+                    "mask": rle_decode_array(detection["masks"][i]),
                 }
             )
         return return_data
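
The `mask` field above comes from `rle_decode_array(detection["masks"][i])`. The decoder itself is not shown in this diff; the sketch below illustrates one common run-length convention (alternating background/foreground runs) and is an assumption, not the library's implementation:

```python
# Hypothetical sketch of run-length decoding for a {"counts": [...], "size": [h, w]}
# mask payload. The run order and starting value are assumptions, not taken from
# vision_agent; real decoding should use the library's rle_decode_array.
import numpy as np

def rle_decode_sketch(counts, size):
    h, w = size
    flat = np.zeros(h * w, dtype=np.uint8)
    value, pos = 0, 0
    for run in counts:  # runs alternate between 0s and 1s
        flat[pos:pos + run] = value
        pos += run
        value = 1 - value
    return flat.reshape((h, w))

mask = rle_decode_sketch([6, 1, 40, 4, 5], (4, 14))
print(mask.shape, int(mask.sum()))  # (4, 14) 5
```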
@@ -451,6 +462,7 @@ def florence2_sam2_image(
     detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
+
     return_data = []
     for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
@@ -688,22 +700,18 @@ def countgd_counting(
         {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
     ]
     """
-
-    files = [("image", buffer_bytes)]
+    image_b64 = convert_to_b64(image)
     prompt = prompt.replace(", ", " .")
-    payload = {"
+    payload = {"prompt": prompt, "image": image_b64}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(
-        payload, "text-to-object-detection", files=files, metadata=metadata
-    )
-    bboxes_per_frame = resp_data[0]
+    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["
+            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in
+        for bbox in resp_data
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
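
countgd_counting still finishes by thresholding the formatted boxes. A hedged sketch of what that filtering step amounts to (the real `filter_bboxes_by_threshold` lives in the library and is not reproduced here):

```python
# Standalone illustration of score thresholding on detection dictionaries.
from typing import Any, Dict, List

def filter_by_threshold_sketch(bboxes: List[Dict[str, Any]], box_threshold: float) -> List[Dict[str, Any]]:
    return [b for b in bboxes if b["score"] >= box_threshold]

detections = [
    {"score": 0.98, "label": "flower", "bbox": [0.44, 0.24, 0.49, 0.58]},
    {"score": 0.21, "label": "flower", "bbox": [0.10, 0.12, 0.14, 0.20]},
]
print(filter_by_threshold_sketch(detections, 0.23))  # only the 0.98 detection remains
```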
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
         "function_name": "ixc25_temporal_localization",
     }
     data: List[int] = send_inference_request(
-        payload,
+        payload,
+        "video-temporal-localization?model=internlm-xcomposer",
+        files=files,
+        v2=True,
     )
     chunk_size = round(len(frames) / len(data))
     data_explode = [[elt] * chunk_size for elt in data]
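
The two context lines at the end of this hunk expand one prediction per chunk of frames into per-frame values. A small worked example of that expansion; the final pad/trim line is an assumption about how a length mismatch would be handled, not code from the diff:

```python
# Worked example: the service returns one value per chunk, and each value is
# repeated chunk_size times to produce a per-frame list.
num_frames = 10
data = [0, 1, 1]  # one prediction per chunk from the service

chunk_size = round(num_frames / len(data))           # 10 / 3 -> 3
data_explode = [[elt] * chunk_size for elt in data]  # [[0,0,0],[1,1,1],[1,1,1]]
per_frame = [bool(e) for chunk in data_explode for e in chunk]

# pad with the last value (or trim) so the list lines up with the frame count
per_frame = (per_frame + [per_frame[-1]] * num_frames)[:num_frames]
print(per_frame)
```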
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(
+def florence2_phrase_grounding_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'
-    objects given a text prompt which can be object names or caption.
-    can optionally separate the object names in the text with commas. It returns
-    of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    detect multiple objects given a text prompt which can be object names or caption.
+    You can optionally separate the object names in the text with commas. It returns
+    a list of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(
 
     Example
     -------
-    >>> florence2_phrase_grounding('person looking at a coyote', image)
+    >>> florence2_phrase_grounding_image('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
         detections = send_inference_request(
             data,
-            "
-            v2=
-
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_phrase_grounding_image"},
         )
+        # get the first frame
+        detection = detections[0]
     else:
         data = {
             "image": image_b64,
             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
             "prompt": prompt,
-            "function_name": "
+            "function_name": "florence2_phrase_grounding_image",
         }
         detections = send_inference_request(data, "florence2", v2=True)
+        detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
 
-    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(
+    for i in range(len(detection["bboxes"])):
         return_data.append(
             ODResponseData(
-                label=
-                bbox=normalize_bbox(
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
         )
     return [bbox.model_dump() for bbox in return_data]
 
 
+def florence2_phrase_grounding_video(
+    prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+    It can detect multiple objects given a text prompt which can be object names or
+    caption. You can optionally separate the object names in the text with commas.
+    It returns a list of lists where each inner list contains bounding boxes with
+    normalized coordinates, label names and associated probability scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to detect objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+            label, and bounding box of the detected objects with normalized coordinates
+            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+            of the top-left and xmax and ymax are the coordinates of the bottom-right of
+            the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+    >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+    [
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ],
+        ...
+    ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            task=PromptTask.PHRASE_GROUNDING,
+            prompt=prompt,
+            job_id=UUID(fine_tune_id),
+        )
+
+        data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            files=files,
+            metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+        )
+    else:
+        data = {
+            "prompt": prompt,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "function_name": "florence2_phrase_grounding_video",
+            "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+        detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+
+    bboxes_formatted = []
+    for frame_data in detections:
+        bboxes_formatted_per_frame = []
+        for idx in range(len(frame_data["bboxes"])):
+            bboxes_formatted_per_frame.append(
+                ODResponseData(
+                    label=frame_data["labels"][idx],
+                    bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+                    score=1.0,
+                )
+            )
+        bboxes_formatted.append(bboxes_formatted_per_frame)
+    return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
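
With the new `florence2_phrase_grounding_video` in place, a minimal usage sketch follows; the frames are placeholders and real calls need API credentials and meaningful video frames:

```python
# Minimal usage sketch for the new per-video grounding tool: run it over a short
# list of RGB numpy frames and print per-frame labels.
import numpy as np
from vision_agent.tools import florence2_phrase_grounding_video

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # placeholder frames
detections_per_frame = florence2_phrase_grounding_video("person, helmet", frames)

for idx, detections in enumerate(detections_per_frame):
    print(idx, [d["label"] for d in detections])
```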
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -2064,7 +2164,8 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding,
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,

{vision_agent-0.2.152 → vision_agent-0.2.153}/vision_agent/tools/tools_types.py
RENAMED
@@ -1,6 +1,6 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
 from uuid import UUID
+from typing import List, Optional, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 
@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
     PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
-class FineTuning(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
-
+    image: Optional[str] = None
+    video: Optional[bytes] = None
+    task: PromptTask
+    prompt: Optional[str] = ""
+    chunk_length_frames: Optional[int] = None
     postprocessing: Optional[str] = None
+    job_id: Optional[UUID] = Field(None, alias="jobId")
 
     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
         return str(job_id)
 
 
-class Florence2FtRequest(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
-
-    image: str
-    task: PromptTask
-    tool: str
-    prompt: Optional[str] = ""
-    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
 class JobStatus(str, Enum):
     """The status of a fine-tuning job.
 
|