vision-agent 0.2.181__tar.gz → 0.2.182__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.181 → vision_agent-0.2.182}/PKG-INFO +1 -1
- {vision_agent-0.2.181 → vision_agent-0.2.182}/pyproject.toml +1 -1
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/tools.py +128 -132
- {vision_agent-0.2.181 → vision_agent-0.2.182}/LICENSE +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/README.md +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/utils/video.py +0 -0
@@ -1,4 +1,3 @@
|
|
1
|
-
import base64
|
2
1
|
import io
|
3
2
|
import json
|
4
3
|
import logging
|
@@ -184,8 +183,16 @@ def owl_v2_image(
|
|
184
183
|
if image_size[0] < 1 or image_size[1] < 1:
|
185
184
|
return []
|
186
185
|
|
186
|
+
buffer_bytes = numpy_to_bytes(image)
|
187
|
+
files = [("image", buffer_bytes)]
|
188
|
+
payload = {
|
189
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
190
|
+
"confidence": box_threshold,
|
191
|
+
"model": "owlv2",
|
192
|
+
}
|
193
|
+
metadata = {"function_name": "owl_v2_image"}
|
194
|
+
|
187
195
|
if fine_tune_id is not None:
|
188
|
-
image_b64 = convert_to_b64(image)
|
189
196
|
landing_api = LandingPublicAPI()
|
190
197
|
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
191
198
|
if status is not JobStatus.SUCCEEDED:
|
@@ -193,43 +200,22 @@ def owl_v2_image(
|
|
193
200
|
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
194
201
|
)
|
195
202
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
data = data_obj.model_dump(by_alias=True, exclude_none=True)
|
203
|
-
detections = send_inference_request(
|
204
|
-
data,
|
205
|
-
"florence2-ft",
|
206
|
-
v2=True,
|
207
|
-
is_form=True,
|
208
|
-
metadata_payload={"function_name": "owl_v2_image"},
|
209
|
-
)
|
210
|
-
# get the first frame
|
211
|
-
detection = detections[0]
|
212
|
-
bboxes_formatted = [
|
213
|
-
ODResponseData(
|
214
|
-
label=detection["labels"][i],
|
215
|
-
bbox=normalize_bbox(detection["bboxes"][i], image_size),
|
216
|
-
score=1.0,
|
217
|
-
)
|
218
|
-
for i in range(len(detection["bboxes"]))
|
219
|
-
]
|
220
|
-
return [bbox.model_dump() for bbox in bboxes_formatted]
|
203
|
+
# we can only execute fine-tuned models with florence2
|
204
|
+
payload = {
|
205
|
+
"prompts": payload["prompts"],
|
206
|
+
"jobId": fine_tune_id,
|
207
|
+
"model": "florence2",
|
208
|
+
}
|
221
209
|
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
"function_name": "owl_v2_image",
|
228
|
-
}
|
229
|
-
resp_data = send_inference_request(
|
230
|
-
payload, "text-to-object-detection", files=files, v2=True
|
210
|
+
detections = send_task_inference_request(
|
211
|
+
payload,
|
212
|
+
"text-to-object-detection",
|
213
|
+
files=files,
|
214
|
+
metadata=metadata,
|
231
215
|
)
|
232
|
-
|
216
|
+
|
217
|
+
# get the first frame
|
218
|
+
bboxes = detections[0]
|
233
219
|
bboxes_formatted = [
|
234
220
|
ODResponseData(
|
235
221
|
label=bbox["label"],
|
@@ -238,17 +224,17 @@ def owl_v2_image(
|
|
238
224
|
)
|
239
225
|
for bbox in bboxes
|
240
226
|
]
|
241
|
-
|
242
|
-
return [bbox.model_dump() for bbox in filtered_bboxes]
|
227
|
+
return [bbox.model_dump() for bbox in bboxes_formatted]
|
243
228
|
|
244
229
|
|
245
230
|
def owl_v2_video(
|
246
231
|
prompt: str,
|
247
232
|
frames: List[np.ndarray],
|
248
233
|
box_threshold: float = 0.10,
|
234
|
+
fine_tune_id: Optional[str] = None,
|
249
235
|
) -> List[List[Dict[str, Any]]]:
|
250
236
|
"""'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
|
251
|
-
objects
|
237
|
+
objects independently per frame given a text prompt such as a category name or
|
252
238
|
referring expression but does not track objects across frames. The categories in
|
253
239
|
text prompt are separated by commas. It returns a list of lists where each inner
|
254
240
|
list contains the score, label, and bounding box of the detections for that frame.
|
@@ -258,6 +244,8 @@ def owl_v2_video(
|
|
258
244
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
259
245
|
box_threshold (float, optional): The threshold for the box detection. Defaults
|
260
246
|
to 0.10.
|
247
|
+
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
248
|
+
fine-tuned model ID here to use it.
|
261
249
|
|
262
250
|
Returns:
|
263
251
|
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
|
@@ -285,30 +273,45 @@ def owl_v2_video(
|
|
285
273
|
files = [("video", buffer_bytes)]
|
286
274
|
payload = {
|
287
275
|
"prompts": [s.strip() for s in prompt.split(",")],
|
276
|
+
"confidence": box_threshold,
|
288
277
|
"model": "owlv2",
|
289
|
-
"function_name": "owl_v2_video",
|
290
278
|
}
|
291
|
-
|
292
|
-
|
279
|
+
metadata = {"function_name": "owl_v2_video"}
|
280
|
+
|
281
|
+
if fine_tune_id is not None:
|
282
|
+
landing_api = LandingPublicAPI()
|
283
|
+
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
|
284
|
+
if status is not JobStatus.SUCCEEDED:
|
285
|
+
raise FineTuneModelIsNotReady(
|
286
|
+
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
287
|
+
)
|
288
|
+
|
289
|
+
# we can only execute fine-tuned models with florence2
|
290
|
+
payload = {
|
291
|
+
"prompts": payload["prompts"],
|
292
|
+
"jobId": fine_tune_id,
|
293
|
+
"model": "florence2",
|
294
|
+
}
|
295
|
+
|
296
|
+
detections = send_task_inference_request(
|
297
|
+
payload,
|
298
|
+
"text-to-object-detection",
|
299
|
+
files=files,
|
300
|
+
metadata=metadata,
|
293
301
|
)
|
294
|
-
bboxes_formatted = []
|
295
|
-
if data is not None:
|
296
|
-
for frame_data in data:
|
297
|
-
bboxes_formated_frame = []
|
298
|
-
for elt in frame_data:
|
299
|
-
bboxes_formated_frame.append(
|
300
|
-
ODResponseData(
|
301
|
-
label=elt["label"], # type: ignore
|
302
|
-
bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
|
303
|
-
score=round(elt["score"], 2), # type: ignore
|
304
|
-
)
|
305
|
-
)
|
306
|
-
bboxes_formatted.append(bboxes_formated_frame)
|
307
302
|
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
303
|
+
bboxes_formatted = []
|
304
|
+
for frame_data in detections:
|
305
|
+
bboxes_formatted_per_frame = [
|
306
|
+
ODResponseData(
|
307
|
+
label=bbox["label"],
|
308
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
309
|
+
score=round(bbox["score"], 2),
|
310
|
+
)
|
311
|
+
for bbox in frame_data
|
312
|
+
]
|
313
|
+
bboxes_formatted.append(bboxes_formatted_per_frame)
|
314
|
+
return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
|
312
315
|
|
313
316
|
|
314
317
|
def grounding_sam(
|
@@ -708,23 +711,31 @@ def countgd_counting(
|
|
708
711
|
image_size = image.shape[:2]
|
709
712
|
if image_size[0] < 1 or image_size[1] < 1:
|
710
713
|
return []
|
714
|
+
|
711
715
|
buffer_bytes = numpy_to_bytes(image)
|
712
716
|
files = [("image", buffer_bytes)]
|
713
|
-
|
714
|
-
|
717
|
+
payload = {
|
718
|
+
"prompts": [prompt.replace(", ", " .")],
|
719
|
+
"confidence": box_threshold, # still not being used in the API
|
720
|
+
"model": "countgd",
|
721
|
+
}
|
715
722
|
metadata = {"function_name": "countgd_counting"}
|
716
|
-
|
723
|
+
|
724
|
+
detections = send_task_inference_request(
|
717
725
|
payload, "text-to-object-detection", files=files, metadata=metadata
|
718
726
|
)
|
719
|
-
|
727
|
+
|
728
|
+
# get the first frame
|
729
|
+
bboxes = detections[0]
|
720
730
|
bboxes_formatted = [
|
721
731
|
ODResponseData(
|
722
732
|
label=bbox["label"],
|
723
733
|
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
724
734
|
score=round(bbox["score"], 2),
|
725
735
|
)
|
726
|
-
for bbox in
|
736
|
+
for bbox in bboxes
|
727
737
|
]
|
738
|
+
# TODO: remove this once we start to use the confidence on countgd
|
728
739
|
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
|
729
740
|
return [bbox.model_dump() for bbox in filtered_bboxes]
|
730
741
|
|
@@ -768,6 +779,7 @@ def countgd_example_based_counting(
|
|
768
779
|
image_size = image.shape[:2]
|
769
780
|
if image_size[0] < 1 or image_size[1] < 1:
|
770
781
|
return []
|
782
|
+
|
771
783
|
buffer_bytes = numpy_to_bytes(image)
|
772
784
|
files = [("image", buffer_bytes)]
|
773
785
|
visual_prompts = [
|
@@ -775,10 +787,13 @@ def countgd_example_based_counting(
|
|
775
787
|
]
|
776
788
|
payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
|
777
789
|
metadata = {"function_name": "countgd_example_based_counting"}
|
778
|
-
|
790
|
+
|
791
|
+
detections = send_task_inference_request(
|
779
792
|
payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
|
780
793
|
)
|
781
|
-
|
794
|
+
|
795
|
+
# get the first frame
|
796
|
+
bboxes_per_frame = detections[0]
|
782
797
|
bboxes_formatted = [
|
783
798
|
ODResponseData(
|
784
799
|
label=bbox["label"],
|
@@ -1240,7 +1255,14 @@ def florence2_phrase_grounding(
|
|
1240
1255
|
image_size = image.shape[:2]
|
1241
1256
|
if image_size[0] < 1 or image_size[1] < 1:
|
1242
1257
|
return []
|
1243
|
-
|
1258
|
+
|
1259
|
+
buffer_bytes = numpy_to_bytes(image)
|
1260
|
+
files = [("image", buffer_bytes)]
|
1261
|
+
payload = {
|
1262
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
1263
|
+
"model": "florence2",
|
1264
|
+
}
|
1265
|
+
metadata = {"function_name": "florence2_phrase_grounding"}
|
1244
1266
|
|
1245
1267
|
if fine_tune_id is not None:
|
1246
1268
|
landing_api = LandingPublicAPI()
|
@@ -1250,42 +1272,27 @@ def florence2_phrase_grounding(
|
|
1250
1272
|
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
1251
1273
|
)
|
1252
1274
|
|
1253
|
-
|
1254
|
-
image=image_b64,
|
1255
|
-
task=PromptTask.PHRASE_GROUNDING,
|
1256
|
-
prompt=prompt,
|
1257
|
-
job_id=UUID(fine_tune_id),
|
1258
|
-
)
|
1259
|
-
data = data_obj.model_dump(by_alias=True, exclude_none=True)
|
1260
|
-
detections = send_inference_request(
|
1261
|
-
data,
|
1262
|
-
"florence2-ft",
|
1263
|
-
v2=True,
|
1264
|
-
is_form=True,
|
1265
|
-
metadata_payload={"function_name": "florence2_phrase_grounding"},
|
1266
|
-
)
|
1267
|
-
# get the first frame
|
1268
|
-
detection = detections[0]
|
1269
|
-
else:
|
1270
|
-
data = {
|
1271
|
-
"image": image_b64,
|
1272
|
-
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
1273
|
-
"prompt": prompt,
|
1274
|
-
"function_name": "florence2_phrase_grounding",
|
1275
|
-
}
|
1276
|
-
detections = send_inference_request(data, "florence2", v2=True)
|
1277
|
-
detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
|
1275
|
+
payload["jobId"] = fine_tune_id
|
1278
1276
|
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1284
|
-
|
1285
|
-
|
1286
|
-
|
1277
|
+
detections = send_task_inference_request(
|
1278
|
+
payload,
|
1279
|
+
"text-to-object-detection",
|
1280
|
+
files=files,
|
1281
|
+
metadata=metadata,
|
1282
|
+
)
|
1283
|
+
|
1284
|
+
# get the first frame
|
1285
|
+
bboxes = detections[0]
|
1286
|
+
bboxes_formatted = [
|
1287
|
+
ODResponseData(
|
1288
|
+
label=bbox["label"],
|
1289
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
1290
|
+
score=round(bbox["score"], 2),
|
1287
1291
|
)
|
1288
|
-
|
1292
|
+
for bbox in bboxes
|
1293
|
+
]
|
1294
|
+
|
1295
|
+
return [bbox.model_dump() for bbox in bboxes_formatted]
|
1289
1296
|
|
1290
1297
|
|
1291
1298
|
def florence2_phrase_grounding_video(
|
@@ -1327,6 +1334,11 @@ def florence2_phrase_grounding_video(
|
|
1327
1334
|
image_size = frames[0].shape[:2]
|
1328
1335
|
buffer_bytes = frames_to_bytes(frames)
|
1329
1336
|
files = [("video", buffer_bytes)]
|
1337
|
+
payload = {
|
1338
|
+
"prompts": [s.strip() for s in prompt.split(",")],
|
1339
|
+
"model": "florence2",
|
1340
|
+
}
|
1341
|
+
metadata = {"function_name": "florence2_phrase_grounding_video"}
|
1330
1342
|
|
1331
1343
|
if fine_tune_id is not None:
|
1332
1344
|
landing_api = LandingPublicAPI()
|
@@ -1336,41 +1348,25 @@ def florence2_phrase_grounding_video(
|
|
1336
1348
|
f"Fine-tuned model {fine_tune_id} is not ready yet"
|
1337
1349
|
)
|
1338
1350
|
|
1339
|
-
|
1340
|
-
task=PromptTask.PHRASE_GROUNDING,
|
1341
|
-
prompt=prompt,
|
1342
|
-
job_id=UUID(fine_tune_id),
|
1343
|
-
)
|
1351
|
+
payload["jobId"] = fine_tune_id
|
1344
1352
|
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
|
1351
|
-
metadata_payload={"function_name": "florence2_phrase_grounding_video"},
|
1352
|
-
)
|
1353
|
-
else:
|
1354
|
-
data = {
|
1355
|
-
"prompt": prompt,
|
1356
|
-
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
|
1357
|
-
"function_name": "florence2_phrase_grounding_video",
|
1358
|
-
"video": base64.b64encode(buffer_bytes).decode("utf-8"),
|
1359
|
-
}
|
1360
|
-
detections = send_inference_request(data, "florence2", v2=True)
|
1361
|
-
detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
|
1353
|
+
detections = send_task_inference_request(
|
1354
|
+
payload,
|
1355
|
+
"text-to-object-detection",
|
1356
|
+
files=files,
|
1357
|
+
metadata=metadata,
|
1358
|
+
)
|
1362
1359
|
|
1363
1360
|
bboxes_formatted = []
|
1364
1361
|
for frame_data in detections:
|
1365
|
-
bboxes_formatted_per_frame = [
|
1366
|
-
|
1367
|
-
|
1368
|
-
|
1369
|
-
|
1370
|
-
bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
|
1371
|
-
score=1.0,
|
1372
|
-
)
|
1362
|
+
bboxes_formatted_per_frame = [
|
1363
|
+
ODResponseData(
|
1364
|
+
label=bbox["label"],
|
1365
|
+
bbox=normalize_bbox(bbox["bounding_box"], image_size),
|
1366
|
+
score=round(bbox["score"], 2),
|
1373
1367
|
)
|
1368
|
+
for bbox in frame_data
|
1369
|
+
]
|
1374
1370
|
bboxes_formatted.append(bboxes_formatted_per_frame)
|
1375
1371
|
return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
|
1376
1372
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
{vision_agent-0.2.181 → vision_agent-0.2.182}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|