vision-agent 0.2.181__tar.gz → 0.2.183__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35)
  1. {vision_agent-0.2.181 → vision_agent-0.2.183}/PKG-INFO +1 -1
  2. {vision_agent-0.2.181 → vision_agent-0.2.183}/pyproject.toml +1 -1
  3. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/__init__.py +1 -0
  4. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/tools.py +161 -134
  5. {vision_agent-0.2.181 → vision_agent-0.2.183}/LICENSE +0 -0
  6. {vision_agent-0.2.181 → vision_agent-0.2.183}/README.md +0 -0
  7. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/__init__.py +0 -0
  8. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/__init__.py +0 -0
  9. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/agent.py +0 -0
  10. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/agent_utils.py +0 -0
  11. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent.py +0 -0
  12. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent_coder.py +0 -0
  13. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  14. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent_planner.py +0 -0
  15. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  16. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/agent/vision_agent_prompts.py +0 -0
  17. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/clients/__init__.py +0 -0
  18. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/clients/http.py +0 -0
  19. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/clients/landing_public_api.py +0 -0
  20. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/fonts/__init__.py +0 -0
  21. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  22. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/lmm/__init__.py +0 -0
  23. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/lmm/lmm.py +0 -0
  24. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/lmm/types.py +0 -0
  25. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/tool_utils.py +0 -0
  28. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/execute.py +0 -0
  32. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/sim.py +0 -0
  34. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/type_defs.py +0 -0
  35. {vision_agent-0.2.181 → vision_agent-0.2.183}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.181
3
+ Version: 0.2.183
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.181"
7
+ version = "0.2.183"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -66,6 +66,7 @@ from .tools import (
66
66
  vit_image_classification,
67
67
  vit_nsfw_classification,
68
68
  qwen2_vl_images_vqa,
69
+ qwen2_vl_video_vqa,
69
70
  video_temporal_localization,
70
71
  )
71
72
 
@@ -1,4 +1,3 @@
1
- import base64
2
1
  import io
3
2
  import json
4
3
  import logging
@@ -184,8 +183,16 @@ def owl_v2_image(
184
183
  if image_size[0] < 1 or image_size[1] < 1:
185
184
  return []
186
185
 
186
+ buffer_bytes = numpy_to_bytes(image)
187
+ files = [("image", buffer_bytes)]
188
+ payload = {
189
+ "prompts": [s.strip() for s in prompt.split(",")],
190
+ "confidence": box_threshold,
191
+ "model": "owlv2",
192
+ }
193
+ metadata = {"function_name": "owl_v2_image"}
194
+
187
195
  if fine_tune_id is not None:
188
- image_b64 = convert_to_b64(image)
189
196
  landing_api = LandingPublicAPI()
190
197
  status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
191
198
  if status is not JobStatus.SUCCEEDED:
@@ -193,43 +200,22 @@ def owl_v2_image(
193
200
  f"Fine-tuned model {fine_tune_id} is not ready yet"
194
201
  )
195
202
 
196
- data_obj = Florence2FtRequest(
197
- image=image_b64,
198
- task=PromptTask.PHRASE_GROUNDING,
199
- prompt=prompt,
200
- job_id=UUID(fine_tune_id),
201
- )
202
- data = data_obj.model_dump(by_alias=True, exclude_none=True)
203
- detections = send_inference_request(
204
- data,
205
- "florence2-ft",
206
- v2=True,
207
- is_form=True,
208
- metadata_payload={"function_name": "owl_v2_image"},
209
- )
210
- # get the first frame
211
- detection = detections[0]
212
- bboxes_formatted = [
213
- ODResponseData(
214
- label=detection["labels"][i],
215
- bbox=normalize_bbox(detection["bboxes"][i], image_size),
216
- score=1.0,
217
- )
218
- for i in range(len(detection["bboxes"]))
219
- ]
220
- return [bbox.model_dump() for bbox in bboxes_formatted]
203
+ # we can only execute fine-tuned models with florence2
204
+ payload = {
205
+ "prompts": payload["prompts"],
206
+ "jobId": fine_tune_id,
207
+ "model": "florence2",
208
+ }
221
209
 
222
- buffer_bytes = numpy_to_bytes(image)
223
- files = [("image", buffer_bytes)]
224
- payload = {
225
- "prompts": [s.strip() for s in prompt.split(",")],
226
- "model": "owlv2",
227
- "function_name": "owl_v2_image",
228
- }
229
- resp_data = send_inference_request(
230
- payload, "text-to-object-detection", files=files, v2=True
210
+ detections = send_task_inference_request(
211
+ payload,
212
+ "text-to-object-detection",
213
+ files=files,
214
+ metadata=metadata,
231
215
  )
232
- bboxes = resp_data[0]
216
+
217
+ # get the first frame
218
+ bboxes = detections[0]
233
219
  bboxes_formatted = [
234
220
  ODResponseData(
235
221
  label=bbox["label"],
@@ -238,17 +224,17 @@ def owl_v2_image(
238
224
  )
239
225
  for bbox in bboxes
240
226
  ]
241
- filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
242
- return [bbox.model_dump() for bbox in filtered_bboxes]
227
+ return [bbox.model_dump() for bbox in bboxes_formatted]
243
228
 
244
229
 
245
230
  def owl_v2_video(
246
231
  prompt: str,
247
232
  frames: List[np.ndarray],
248
233
  box_threshold: float = 0.10,
234
+ fine_tune_id: Optional[str] = None,
249
235
  ) -> List[List[Dict[str, Any]]]:
250
236
  """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
251
- objects indepdently per frame given a text prompt such as a category name or
237
+ objects independently per frame given a text prompt such as a category name or
252
238
  referring expression but does not track objects across frames. The categories in
253
239
  text prompt are separated by commas. It returns a list of lists where each inner
254
240
  list contains the score, label, and bounding box of the detections for that frame.
@@ -258,6 +244,8 @@ def owl_v2_video(
258
244
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
259
245
  box_threshold (float, optional): The threshold for the box detection. Defaults
260
246
  to 0.10.
247
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
248
+ fine-tuned model ID here to use it.
261
249
 
262
250
  Returns:
263
251
  List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
@@ -285,30 +273,45 @@ def owl_v2_video(
285
273
  files = [("video", buffer_bytes)]
286
274
  payload = {
287
275
  "prompts": [s.strip() for s in prompt.split(",")],
276
+ "confidence": box_threshold,
288
277
  "model": "owlv2",
289
- "function_name": "owl_v2_video",
290
278
  }
291
- data: Dict[str, Any] = send_inference_request(
292
- payload, "text-to-object-detection", files=files, v2=True
279
+ metadata = {"function_name": "owl_v2_video"}
280
+
281
+ if fine_tune_id is not None:
282
+ landing_api = LandingPublicAPI()
283
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
284
+ if status is not JobStatus.SUCCEEDED:
285
+ raise FineTuneModelIsNotReady(
286
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
287
+ )
288
+
289
+ # we can only execute fine-tuned models with florence2
290
+ payload = {
291
+ "prompts": payload["prompts"],
292
+ "jobId": fine_tune_id,
293
+ "model": "florence2",
294
+ }
295
+
296
+ detections = send_task_inference_request(
297
+ payload,
298
+ "text-to-object-detection",
299
+ files=files,
300
+ metadata=metadata,
293
301
  )
294
- bboxes_formatted = []
295
- if data is not None:
296
- for frame_data in data:
297
- bboxes_formated_frame = []
298
- for elt in frame_data:
299
- bboxes_formated_frame.append(
300
- ODResponseData(
301
- label=elt["label"], # type: ignore
302
- bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
303
- score=round(elt["score"], 2), # type: ignore
304
- )
305
- )
306
- bboxes_formatted.append(bboxes_formated_frame)
307
302
 
308
- filtered_bboxes = [
309
- filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
310
- ]
311
- return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
303
+ bboxes_formatted = []
304
+ for frame_data in detections:
305
+ bboxes_formatted_per_frame = [
306
+ ODResponseData(
307
+ label=bbox["label"],
308
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
309
+ score=round(bbox["score"], 2),
310
+ )
311
+ for bbox in frame_data
312
+ ]
313
+ bboxes_formatted.append(bboxes_formatted_per_frame)
314
+ return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
312
315
 
313
316
 
314
317
  def grounding_sam(
@@ -708,23 +711,31 @@ def countgd_counting(
708
711
  image_size = image.shape[:2]
709
712
  if image_size[0] < 1 or image_size[1] < 1:
710
713
  return []
714
+
711
715
  buffer_bytes = numpy_to_bytes(image)
712
716
  files = [("image", buffer_bytes)]
713
- prompt = prompt.replace(", ", " .")
714
- payload = {"prompts": [prompt], "model": "countgd"}
717
+ payload = {
718
+ "prompts": [prompt.replace(", ", " .")],
719
+ "confidence": box_threshold, # still not being used in the API
720
+ "model": "countgd",
721
+ }
715
722
  metadata = {"function_name": "countgd_counting"}
716
- resp_data = send_task_inference_request(
723
+
724
+ detections = send_task_inference_request(
717
725
  payload, "text-to-object-detection", files=files, metadata=metadata
718
726
  )
719
- bboxes_per_frame = resp_data[0]
727
+
728
+ # get the first frame
729
+ bboxes = detections[0]
720
730
  bboxes_formatted = [
721
731
  ODResponseData(
722
732
  label=bbox["label"],
723
733
  bbox=normalize_bbox(bbox["bounding_box"], image_size),
724
734
  score=round(bbox["score"], 2),
725
735
  )
726
- for bbox in bboxes_per_frame
736
+ for bbox in bboxes
727
737
  ]
738
+ # TODO: remove this once we start to use the confidence on countgd
728
739
  filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
729
740
  return [bbox.model_dump() for bbox in filtered_bboxes]
730
741
 
@@ -768,6 +779,7 @@ def countgd_example_based_counting(
768
779
  image_size = image.shape[:2]
769
780
  if image_size[0] < 1 or image_size[1] < 1:
770
781
  return []
782
+
771
783
  buffer_bytes = numpy_to_bytes(image)
772
784
  files = [("image", buffer_bytes)]
773
785
  visual_prompts = [
@@ -775,10 +787,13 @@ def countgd_example_based_counting(
775
787
  ]
776
788
  payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
777
789
  metadata = {"function_name": "countgd_example_based_counting"}
778
- resp_data = send_task_inference_request(
790
+
791
+ detections = send_task_inference_request(
779
792
  payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
780
793
  )
781
- bboxes_per_frame = resp_data[0]
794
+
795
+ # get the first frame
796
+ bboxes_per_frame = detections[0]
782
797
  bboxes_formatted = [
783
798
  ODResponseData(
784
799
  label=bbox["label"],
@@ -915,6 +930,37 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
915
930
  return cast(str, data["answer"])
916
931
 
917
932
 
933
+ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
934
+ """'qwen2_vl_video_vqa' is a tool that can answer any questions about arbitrary videos
935
+ including regular videos or videos of documents or presentations. It returns text
936
+ as an answer to the question.
937
+
938
+ Parameters:
939
+ prompt (str): The question about the video
940
+ frames (List[np.ndarray]): The reference frames used for the question
941
+
942
+ Returns:
943
+ str: A string which is the answer to the given prompt.
944
+
945
+ Example
946
+ -------
947
+ >>> qwen2_vl_video_vqa('Which football player made the goal?', frames)
948
+ 'Lionel Messi'
949
+ """
950
+
951
+ buffer_bytes = frames_to_bytes(frames)
952
+ files = [("video", buffer_bytes)]
953
+ payload = {
954
+ "prompt": prompt,
955
+ "model": "qwen2vl",
956
+ "function_name": "qwen2_vl_video_vqa",
957
+ }
958
+ data: Dict[str, Any] = send_inference_request(
959
+ payload, "image-to-text", files=files, v2=True
960
+ )
961
+ return cast(str, data)
962
+
963
+
918
964
  def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
919
965
  """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
920
966
  including regular images or images of documents or presentations. It returns text
@@ -1240,7 +1286,14 @@ def florence2_phrase_grounding(
1240
1286
  image_size = image.shape[:2]
1241
1287
  if image_size[0] < 1 or image_size[1] < 1:
1242
1288
  return []
1243
- image_b64 = convert_to_b64(image)
1289
+
1290
+ buffer_bytes = numpy_to_bytes(image)
1291
+ files = [("image", buffer_bytes)]
1292
+ payload = {
1293
+ "prompts": [s.strip() for s in prompt.split(",")],
1294
+ "model": "florence2",
1295
+ }
1296
+ metadata = {"function_name": "florence2_phrase_grounding"}
1244
1297
 
1245
1298
  if fine_tune_id is not None:
1246
1299
  landing_api = LandingPublicAPI()
@@ -1250,42 +1303,27 @@ def florence2_phrase_grounding(
1250
1303
  f"Fine-tuned model {fine_tune_id} is not ready yet"
1251
1304
  )
1252
1305
 
1253
- data_obj = Florence2FtRequest(
1254
- image=image_b64,
1255
- task=PromptTask.PHRASE_GROUNDING,
1256
- prompt=prompt,
1257
- job_id=UUID(fine_tune_id),
1258
- )
1259
- data = data_obj.model_dump(by_alias=True, exclude_none=True)
1260
- detections = send_inference_request(
1261
- data,
1262
- "florence2-ft",
1263
- v2=True,
1264
- is_form=True,
1265
- metadata_payload={"function_name": "florence2_phrase_grounding"},
1266
- )
1267
- # get the first frame
1268
- detection = detections[0]
1269
- else:
1270
- data = {
1271
- "image": image_b64,
1272
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
1273
- "prompt": prompt,
1274
- "function_name": "florence2_phrase_grounding",
1275
- }
1276
- detections = send_inference_request(data, "florence2", v2=True)
1277
- detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
1306
+ payload["jobId"] = fine_tune_id
1278
1307
 
1279
- return_data = []
1280
- for i in range(len(detection["bboxes"])):
1281
- return_data.append(
1282
- ODResponseData(
1283
- label=detection["labels"][i],
1284
- bbox=normalize_bbox(detection["bboxes"][i], image_size),
1285
- score=1.0,
1286
- )
1308
+ detections = send_task_inference_request(
1309
+ payload,
1310
+ "text-to-object-detection",
1311
+ files=files,
1312
+ metadata=metadata,
1313
+ )
1314
+
1315
+ # get the first frame
1316
+ bboxes = detections[0]
1317
+ bboxes_formatted = [
1318
+ ODResponseData(
1319
+ label=bbox["label"],
1320
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
1321
+ score=round(bbox["score"], 2),
1287
1322
  )
1288
- return [bbox.model_dump() for bbox in return_data]
1323
+ for bbox in bboxes
1324
+ ]
1325
+
1326
+ return [bbox.model_dump() for bbox in bboxes_formatted]
1289
1327
 
1290
1328
 
1291
1329
  def florence2_phrase_grounding_video(
@@ -1327,6 +1365,11 @@ def florence2_phrase_grounding_video(
1327
1365
  image_size = frames[0].shape[:2]
1328
1366
  buffer_bytes = frames_to_bytes(frames)
1329
1367
  files = [("video", buffer_bytes)]
1368
+ payload = {
1369
+ "prompts": [s.strip() for s in prompt.split(",")],
1370
+ "model": "florence2",
1371
+ }
1372
+ metadata = {"function_name": "florence2_phrase_grounding_video"}
1330
1373
 
1331
1374
  if fine_tune_id is not None:
1332
1375
  landing_api = LandingPublicAPI()
@@ -1336,41 +1379,25 @@ def florence2_phrase_grounding_video(
1336
1379
  f"Fine-tuned model {fine_tune_id} is not ready yet"
1337
1380
  )
1338
1381
 
1339
- data_obj = Florence2FtRequest(
1340
- task=PromptTask.PHRASE_GROUNDING,
1341
- prompt=prompt,
1342
- job_id=UUID(fine_tune_id),
1343
- )
1382
+ payload["jobId"] = fine_tune_id
1344
1383
 
1345
- data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
1346
- detections = send_inference_request(
1347
- data,
1348
- "florence2-ft",
1349
- v2=True,
1350
- files=files,
1351
- metadata_payload={"function_name": "florence2_phrase_grounding_video"},
1352
- )
1353
- else:
1354
- data = {
1355
- "prompt": prompt,
1356
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
1357
- "function_name": "florence2_phrase_grounding_video",
1358
- "video": base64.b64encode(buffer_bytes).decode("utf-8"),
1359
- }
1360
- detections = send_inference_request(data, "florence2", v2=True)
1361
- detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
1384
+ detections = send_task_inference_request(
1385
+ payload,
1386
+ "text-to-object-detection",
1387
+ files=files,
1388
+ metadata=metadata,
1389
+ )
1362
1390
 
1363
1391
  bboxes_formatted = []
1364
1392
  for frame_data in detections:
1365
- bboxes_formatted_per_frame = []
1366
- for idx in range(len(frame_data["bboxes"])):
1367
- bboxes_formatted_per_frame.append(
1368
- ODResponseData(
1369
- label=frame_data["labels"][idx],
1370
- bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
1371
- score=1.0,
1372
- )
1393
+ bboxes_formatted_per_frame = [
1394
+ ODResponseData(
1395
+ label=bbox["label"],
1396
+ bbox=normalize_bbox(bbox["bounding_box"], image_size),
1397
+ score=round(bbox["score"], 2),
1373
1398
  )
1399
+ for bbox in frame_data
1400
+ ]
1374
1401
  bboxes_formatted.append(bboxes_formatted_per_frame)
1375
1402
  return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
1376
1403
 
@@ -2242,13 +2269,13 @@ FUNCTION_TOOLS = [
2242
2269
  florence2_sam2_image,
2243
2270
  florence2_sam2_video_tracking,
2244
2271
  florence2_phrase_grounding,
2245
- ixc25_image_vqa,
2246
- ixc25_video_vqa,
2247
2272
  detr_segmentation,
2248
2273
  depth_anything_v2,
2249
2274
  generate_pose_image,
2250
2275
  closest_mask_distance,
2251
2276
  closest_box_distance,
2277
+ qwen2_vl_images_vqa,
2278
+ qwen2_vl_video_vqa,
2252
2279
  ]
2253
2280
 
2254
2281
  UTIL_TOOLS = [
File without changes
File without changes