vision-agent 0.2.151__py3-none-any.whl → 0.2.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- vision_agent/agent/vision_agent_prompts.py +7 -7
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +6 -2
- vision_agent/tools/tool_utils.py +9 -4
- vision_agent/tools/tools.py +154 -53
- vision_agent/tools/tools_types.py +8 -13
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/METADATA +1 -1
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/RECORD +11 -11
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -101,7 +101,7 @@ plan1:
     - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-    - Use the '
+    - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
     - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video,
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)

 # plan2
-florence2_out = [
+florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)

 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "
+    "florence2_phrase_grounding_image": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}

 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "
+    "florence2_phrase_grounding_image": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}

vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image,
+0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2| image = load_image(image_path)
-3| detections =
+3| detections = florence2_phrase_grounding_image("worker, helmet", image)
 4| workers = [d for d in detections if d['label'] == 'worker']
 5| helmets = [d for d in detections if d['label'] == 'helmet']
 6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
 image = load_image(image_path)
-- detections =
-+ detections =
+- detections = florence2_phrase_grounding_image("worker, helmet", image)
++ detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
 workers = [d for d in detections if d['label'] == 'worker']
 helmets = [d for d in detections if d['label'] == 'helmet']
 count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3

-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -668,8 +668,12 @@ def use_object_detection_fine_tuning(

     patterns_with_fine_tune_id = [
         (
-            r'
-            lambda match: f'
+            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+        ),
+        (
+            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
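The two new (pattern, replacement) pairs above let `use_object_detection_fine_tuning` rewrite existing `florence2_phrase_grounding_image` and `florence2_phrase_grounding_video` calls in an artifact so they carry the fine-tune id. Below is a minimal, standalone sketch of how one such pair behaves under `re.sub`; the sample input line and fine-tune id are illustrative only.

```python
import re

# Placeholder fine-tune id, matching the example id used in the prompts above.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"

# One (pattern, replacement) pair from the diff, unchanged.
pattern = r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
replacement = lambda match: (
    f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")'
)

# A hypothetical line of artifact code that the meta tool would rewrite in place.
code = 'detections = florence2_phrase_grounding_image("worker, helmet", image)'
print(re.sub(pattern, replacement, code))
# detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```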
vision_agent/tools/tool_utils.py
CHANGED
@@ -1,6 +1,6 @@
+import os
 import inspect
 import logging
-import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

@@ -37,8 +37,9 @@ def send_inference_request(
     files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
-    # TODO: runtime_tag and function_name should be metadata_payload and
+    # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
     if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
         payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
     elif metadata_payload is not None and "function_name" in metadata_payload:
         function_name = metadata_payload["function_name"]

-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)

     # TODO: consider making the response schema the same between below two sources
     return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
     task_name: str,
     files: Optional[List[Tuple[Any, ...]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
     function_name = "unknown"
     if metadata is not None and "function_name" in metadata:
         function_name = metadata["function_name"]
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
     return response["data"]


@@ -203,6 +205,7 @@ def _call_post(
     session: Session,
     files: Optional[List[Tuple[Any, ...]]] = None,
     function_name: str = "unknown",
+    is_form: bool = False,
 ) -> Any:
     files_in_b64 = None
     if files:
@@ -210,6 +213,8 @@ def _call_post(
     try:
         if files is not None:
             response = session.post(url, data=payload, files=files)
+        elif is_form:
+            response = session.post(url, data=payload)
         else:
             response = session.post(url, json=payload)

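The `is_form` flag added above is threaded from `send_inference_request` and `send_task_inference_request` down to `_call_post`, where it switches the POST body from JSON to form-encoded data when no files are attached. A standalone sketch of that branch, assuming a plain `requests.Session` and a placeholder endpoint; the helper name `post_payload` is not part of the package.

```python
import requests


def post_payload(url: str, payload: dict, is_form: bool = False) -> requests.Response:
    """Sketch of the branching added in _call_post (no file-upload case)."""
    session = requests.Session()
    if is_form:
        # Form-encoded body, mirroring `session.post(url, data=payload)` in the diff.
        return session.post(url, data=payload)
    # JSON body, mirroring `session.post(url, json=payload)` in the diff.
    return session.post(url, json=payload)


# Hypothetical usage against a placeholder endpoint:
# post_payload("https://example.com/v2/florence2-ft", {"prompt": "person"}, is_form=True)
```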
vision_agent/tools/tools.py
CHANGED
@@ -1,3 +1,4 @@
+import base64
 import io
 import json
 import logging
@@ -28,7 +29,6 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    FineTuning,
     Florence2FtRequest,
     JobStatus,
     ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
-        detections = send_inference_request(
-
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "owl_v2_image"},
+        )
+        # get the first frame
+        detection = detections[0]
         bboxes_formatted = [
             ODResponseData(
-                label=
-                bbox=normalize_bbox(
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
-            for i in range(len(
+            for i in range(len(detection["bboxes"]))
         ]
         return [bbox.model_dump() for bbox in bboxes_formatted]

@@ -419,25 +425,30 @@ def florence2_sam2_image(
         req_data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
-
-
-
+            postprocessing="sam2",
+            job_id=UUID(fine_tune_id),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections_ft = send_inference_request(
+            req_data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_sam2_image"},
         )
-
-
-        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        # get the first frame
+        detection = detections_ft[0]
         return_data = []
-
-        for i in range(len(detections_ft["bboxes"])):
+        for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
-                    "label":
-                    "bbox":
-
+                    "label": detection["labels"][i],
+                    "bbox": normalize_bbox(
+                        detection["bboxes"][i], detection["masks"][i]["size"]
+                    ),
+                    "mask": rle_decode_array(detection["masks"][i]),
                 }
             )
         return return_data
@@ -451,6 +462,7 @@ def florence2_sam2_image(
     detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
+
     return_data = []
     for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
@@ -688,22 +700,18 @@ def countgd_counting(
         {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
     ]
     """
-
-    files = [("image", buffer_bytes)]
+    image_b64 = convert_to_b64(image)
     prompt = prompt.replace(", ", " .")
-    payload = {"
+    payload = {"prompt": prompt, "image": image_b64}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(
-        payload, "text-to-object-detection", files=files, metadata=metadata
-    )
-    bboxes_per_frame = resp_data[0]
+    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["
+            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in
+        for bbox in resp_data
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
    return [bbox.model_dump() for bbox in filtered_bboxes]
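`countgd_counting` now sends the image as a base64 string inside the JSON payload to the `countgd` task instead of uploading raw bytes as a multipart file. A rough sketch of building such a payload; `to_b64` is a hypothetical stand-in for the package's `convert_to_b64` helper and the image is a placeholder.

```python
import base64
import io

import numpy as np
from PIL import Image


def to_b64(image: np.ndarray) -> str:
    """Hypothetical stand-in for the package's convert_to_b64 helper."""
    buf = io.BytesIO()
    Image.fromarray(image).save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


image = np.zeros((32, 32, 3), dtype=np.uint8)  # placeholder image
prompt = "flower, person".replace(", ", " .")  # same separator rewrite as the tool
payload = {"prompt": prompt, "image": to_b64(image)}
# payload is what countgd_counting now hands to
# send_task_inference_request(payload, "countgd", metadata={"function_name": "countgd_counting"})
```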
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
         "function_name": "ixc25_temporal_localization",
     }
     data: List[int] = send_inference_request(
-        payload,
+        payload,
+        "video-temporal-localization?model=internlm-xcomposer",
+        files=files,
+        v2=True,
     )
     chunk_size = round(len(frames) / len(data))
     data_explode = [[elt] * chunk_size for elt in data]
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task] # type: ignore


-def
+def florence2_phrase_grounding_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'
-    objects given a text prompt which can be object names or caption.
-    can optionally separate the object names in the text with commas. It returns
-    of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    detect multiple objects given a text prompt which can be object names or caption.
+    You can optionally separate the object names in the text with commas. It returns
+    a list of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.

     Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(

     Example
     -------
-    >>>
+    >>> florence2_phrase_grounding_image('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
         detections = send_inference_request(
             data,
-            "
-            v2=
-
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_phrase_grounding_image"},
         )
+        # get the first frame
+        detection = detections[0]
     else:
         data = {
             "image": image_b64,
             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
             "prompt": prompt,
-            "function_name": "
+            "function_name": "florence2_phrase_grounding_image",
         }
         detections = send_inference_request(data, "florence2", v2=True)
+        detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]

-    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(
+    for i in range(len(detection["bboxes"])):
         return_data.append(
             ODResponseData(
-                label=
-                bbox=normalize_bbox(
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
         )
     return [bbox.model_dump() for bbox in return_data]


+def florence2_phrase_grounding_video(
+    prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+    It can detect multiple objects given a text prompt which can be object names or
+    caption. You can optionally separate the object names in the text with commas.
+    It returns a list of lists where each inner list contains bounding boxes with
+    normalized coordinates, label names and associated probability scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to detect objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+        label, and bounding box of the detected objects with normalized coordinates
+        between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+        of the top-left and xmax and ymax are the coordinates of the bottom-right of
+        the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+    >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+    [
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ],
+        ...
+    ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            task=PromptTask.PHRASE_GROUNDING,
+            prompt=prompt,
+            job_id=UUID(fine_tune_id),
+        )
+
+        data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            files=files,
+            metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+        )
+    else:
+        data = {
+            "prompt": prompt,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "function_name": "florence2_phrase_grounding_video",
+            "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+        detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+
+    bboxes_formatted = []
+    for frame_data in detections:
+        bboxes_formatted_per_frame = []
+        for idx in range(len(frame_data["bboxes"])):
+            bboxes_formatted_per_frame.append(
+                ODResponseData(
+                    label=frame_data["labels"][idx],
+                    bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+                    score=1.0,
+                )
+            )
+        bboxes_formatted.append(bboxes_formatted_per_frame)
+    return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
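A usage sketch for the new `florence2_phrase_grounding_video`, assuming it is exported from `vision_agent.tools` (the updated `__init__.py` is listed as changed but its hunk is not shown) and that the environment is configured for the package's inference endpoints; the video path and fine-tune id are placeholders.

```python
from vision_agent.tools import extract_frames_and_timestamps, florence2_phrase_grounding_video

# Sample frames from a local video; "video.mp4" is a placeholder path.
frames_and_ts = extract_frames_and_timestamps("video.mp4", 1)
frames = [f["frame"] for f in frames_and_ts]

# Returns one inner list of {'score', 'label', 'bbox'} dicts per frame.
detections_per_frame = florence2_phrase_grounding_video("person", frames)

# Optionally pass a fine-tuned model id (placeholder shown) once the job has SUCCEEDED:
# detections_per_frame = florence2_phrase_grounding_video(
#     "person", frames, "23b3b022-5ebf-4798-9373-20ef36429abf"
# )

counts = [len(dets) for dets in detections_per_frame]
print(counts)
```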
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-        with
+        with normalized coordinates, and confidence score.

     Example
     -------
@@ -1603,7 +1703,7 @@ def extract_frames_and_timestamps(
     """

     def reformat(
-        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+        frames_and_timestamps: List[Tuple[np.ndarray, float]],
     ) -> List[Dict[str, Union[np.ndarray, float]]]:
         return [
             {"frame": frame, "timestamp": timestamp}
@@ -2017,7 +2117,7 @@ def overlay_counting_results(
         fontsize,
     )

-    for i, elt in enumerate(instances):
+    for i, elt in enumerate(instances, 1):
         label = f"{i}"
         box = elt["bbox"]

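The `enumerate(instances, 1)` change only shifts the overlaid labels so numbering starts at 1 rather than 0, e.g.:

```python
# Two placeholder instances with normalized bounding boxes.
instances = [{"bbox": [0.1, 0.1, 0.2, 0.2]}, {"bbox": [0.3, 0.3, 0.4, 0.4]}]

# Before: labels "0", "1", ...; after: labels "1", "2", ...
labels = [f"{i}" for i, _ in enumerate(instances, 1)]
print(labels)  # ['1', '2']
```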
@@ -2064,7 +2164,8 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,
vision_agent/tools/tools_types.py
CHANGED
@@ -1,6 +1,6 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
 from uuid import UUID
+from typing import List, Optional, Tuple, Union

 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer

@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
     PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


-class
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)

-
+    image: Optional[str] = None
+    video: Optional[bytes] = None
+    task: PromptTask
+    prompt: Optional[str] = ""
+    chunk_length_frames: Optional[int] = None
     postprocessing: Optional[str] = None
+    job_id: Optional[UUID] = Field(None, alias="jobId")

     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
         return str(job_id)


-class Florence2FtRequest(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
-
-    image: str
-    task: PromptTask
-    tool: str
-    prompt: Optional[str] = ""
-    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
 class JobStatus(str, Enum):
     """The status of a fine-tuning job.

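With `FineTuning` removed, a single `Florence2FtRequest` now carries the optional `image`/`video`/`job_id` fields and is dumped with `by_alias=True, exclude_none=True`, so unset fields are dropped and `job_id` is emitted as `jobId`. A minimal pydantic sketch of that serialization, with the model trimmed to the fields shown in this diff and a placeholder job id:

```python
from enum import Enum
from typing import Optional
from uuid import UUID

from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer


class PromptTask(str, Enum):
    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


class Florence2FtRequest(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    image: Optional[str] = None
    video: Optional[bytes] = None
    task: PromptTask
    prompt: Optional[str] = ""
    chunk_length_frames: Optional[int] = None
    postprocessing: Optional[str] = None
    job_id: Optional[UUID] = Field(None, alias="jobId")

    @field_serializer("job_id")
    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
        return str(job_id)


req = Florence2FtRequest(
    task=PromptTask.PHRASE_GROUNDING,
    prompt="person",
    job_id=UUID("23b3b022-5ebf-4798-9373-20ef36429abf"),
)
# Unset optional fields are dropped and job_id is serialized as the "jobId" string.
print(req.model_dump(by_alias=True, exclude_none=True))
# {'task': <PromptTask.PHRASE_GROUNDING: ...>, 'prompt': 'person', 'jobId': '23b3b022-...'}
```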
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/RECORD
CHANGED
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
 vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
 vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
+vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,12 +14,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
+vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
+vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
+vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
 vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
+vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.153.dist-info/RECORD,,
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/LICENSE
File without changes
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/WHEEL
File without changes