vision-agent 0.2.184__py3-none-any.whl → 0.2.186__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    Florence2FtRequest,
     JobStatus,
     ODResponseData,
-    PromptTask,
 )
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
     if image.shape[0] < 1 or image.shape[1] < 1:
         return []

+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "model": "florence2sam2",
+    }
+    metadata = {"function_name": "florence2_sam2_image"}
+
     if fine_tune_id is not None:
-        image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
         if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
                 f"Fine-tuned model {fine_tune_id} is not ready yet"
             )

-        req_data_obj = Florence2FtRequest(
-            image=image_b64,
-            task=PromptTask.PHRASE_GROUNDING,
-            prompt=prompt,
-            postprocessing="sam2",
-            job_id=UUID(fine_tune_id),
-        )
-        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
-        detections_ft = send_inference_request(
-            req_data,
-            "florence2-ft",
-            v2=True,
-            is_form=True,
-            metadata_payload={"function_name": "florence2_sam2_image"},
-        )
-        # get the first frame
-        detection = detections_ft[0]
-        return_data = []
-        for i in range(len(detection["bboxes"])):
-            return_data.append(
-                {
-                    "score": 1.0,
-                    "label": detection["labels"][i],
-                    "bbox": normalize_bbox(
-                        detection["bboxes"][i], detection["masks"][i]["size"]
-                    ),
-                    "mask": rle_decode_array(detection["masks"][i]),
-                }
-            )
-        return return_data
+        payload["jobId"] = fine_tune_id

-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
-    payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_image",
-    }
-    detections: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )

+    # get the first frame
+    frame = detections[0]
     return_data = []
-    for _, data_i in detections["0"].items():
-        mask = rle_decode_array(data_i["mask"])
-        label = data_i["label"]
-        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+    for detection in frame:
+        mask = rle_decode_array(detection["mask"])
+        label = detection["label"]
+        bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
         return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
     return return_data


 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
+    prompt: str,
+    frames: List[np.ndarray],
+    chunk_length: Optional[int] = 3,
+    fine_tune_id: Optional[str] = None,
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
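
With this change, the fine-tuned and base code paths of florence2_sam2_image converge on a single send_task_inference_request call to the text-to-instance-segmentation task; a fine-tuned model is selected by adding the job ID to the shared payload rather than by issuing a separate florence2-ft request. A minimal caller sketch against the new signature (the image path and the fine-tuning job UUID below are made-up placeholders):

    import cv2
    from vision_agent.tools import florence2_sam2_image

    # Load an image as an RGB numpy array ("image.jpg" is a placeholder path).
    image = cv2.cvtColor(cv2.imread("image.jpg"), cv2.COLOR_BGR2RGB)

    # Base model: the whole prompt string is now sent as-is under the "prompt" key.
    detections = florence2_sam2_image("dinosaur, tree", image)

    # Fine-tuned model: same call, with the fine-tuning job UUID passed through
    # to the service as payload["jobId"] (the UUID here is fabricated).
    detections_ft = florence2_sam2_image(
        "dinosaur", image, fine_tune_id="45d9ab0f-0000-0000-0000-000000000000"
    )

    # Each detection carries a label, normalized bbox, decoded mask, and a 1.0 score.
    for d in detections:
        print(d["label"], d["bbox"], d["score"])
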
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         chunk_length (Optional[int]): The number of frames to re-run florence2 to find
             new objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
         ...
     ]
     """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")

     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_video_tracking",
+        "prompt": prompt,
+        "model": "florence2sam2",
     }
+    metadata = {"function_name": "florence2_sam2_video_tracking"}
+
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length  # type: ignore
-    data: Dict[str, Any] = send_inference_request(
-        payload, "florence2-sam2", files=files, v2=True
+        payload["chunk_length_frames"] = chunk_length  # type: ignore
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        payload["jobId"] = fine_tune_id
+
+    detections = send_task_inference_request(
+        payload,
+        "text-to-instance-segmentation",
+        files=files,
+        metadata=metadata,
     )
+
     return_data = []
-    for frame_i in data.keys():
+    for frame in detections:
         return_frame_data = []
-        for obj_id, data_j in data[frame_i].items():
-            mask = rle_decode_array(data_j["mask"])
-            label = obj_id + ": " + data_j["label"]
+        for detection in frame:
+            mask = rle_decode_array(detection["mask"])
+            label = str(detection["id"]) + ": " + detection["label"]
             return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
         return_data.append(return_frame_data)
     return return_data
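
The video variant gets the same migration, plus three behavioral tweaks visible above: an empty frame list now raises ValueError, the chunking key sent to the service is renamed to chunk_length_frames, and each returned label is now prefixed with the tracker's numeric object ID. A short usage sketch, assuming OpenCV for frame decoding ("video.mp4" is a placeholder path):

    import cv2
    from vision_agent.tools import florence2_sam2_video_tracking

    # Decode frames to RGB arrays; the function now rejects an empty list.
    cap = cv2.VideoCapture("video.mp4")
    frames = []
    ok, frame = cap.read()
    while ok:
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ok, frame = cap.read()
    cap.release()

    tracks = florence2_sam2_video_tracking(
        "dinosaur",
        frames,
        chunk_length=10,  # re-run detection for new objects every 10 frames
        fine_tune_id=None,  # or a fine-tuning job UUID string, as in the image tool
    )

    # One list per frame; labels now read "<object id>: <label>", e.g. "0: dinosaur".
    for frame_detections in tracks:
        for d in frame_detections:
            print(d["label"])
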
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with nornmalized coordinates, and confidence score.
+            with normalized coordinates, and confidence score.

     Example
     -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.

     Example
     -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value, e.g. {count: 12} and a heat map for visaulization purposes.
+            value, e.g. {count: 12} and a heat map for visualization purposes.

     Example
     -------
@@ -1058,23 +1057,25 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length: Optional[float] = None,
-    chunk_length_seconds: Optional[float] = None,
     chunk_length_frames: Optional[int] = 2,
 ) -> List[float]:
-    """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
-    It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
+    """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
+    value selected for the video. It can detect multiple objects independently per
+    chunk_length_frames given a text prompt such as a referring expression
+    but does not track objects across frames.
+    It returns a list of floats with a value of 1.0 if the objects are found in a given
+    chunk_length_frames of the video.

     Parameters:
         prompt (str): The question about the video
         frames (List[np.ndarray]): The reference frames used for the question
-        model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
-        chunk_length (Optional[float]): length of each chunk in seconds
-        chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
+        model (str): The model to use for the inference. Valid values are
+            'qwen2vl', 'gpt4o', 'internlm-xcomposer'
        chunk_length_frames (Optional[int]): length of each chunk in frames

     Returns:
-        List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
+        List[float]: A list of floats with a value of 1.0 if the objects to be found
+            are present in the chunk_length_frames of the video.

     Example
     -------
@@ -1089,10 +1090,6 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-    if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
-    if chunk_length_seconds is not None:
-        payload["chunk_length_seconds"] = chunk_length_seconds
     if chunk_length_frames is not None:
         payload["chunk_length_frames"] = chunk_length_frames

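
Note the breaking change here: the seconds-based chunk_length and chunk_length_seconds parameters are dropped from both the signature and the payload, leaving chunk_length_frames as the only chunking control. Callers that thought in seconds now have to convert using their video's frame rate; a hedged migration sketch (the frame rate, stand-in frames, and prompt are illustrative only):

    import numpy as np
    from vision_agent.tools import video_temporal_localization

    fps = 30  # assumed frame rate of the source video
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(120)]  # stand-ins

    # Previously: chunk_length_seconds=2.0. Now convert seconds to frames yourself.
    hits = video_temporal_localization(
        "does a dinosaur appear?",
        frames,
        model="qwen2vl",
        chunk_length_frames=int(2.0 * fps),  # 60-frame chunks
    )
    print(hits)  # one float per chunk: 1.0 where the prompt is satisfied
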
@@ -1791,9 +1788,8 @@ def flux_image_inpainting(
         where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.

     Returns:
-        np.ndarray:
-            The generated image(s) as a numpy array in RGB format
-            with values ranging from 0 to 255.
+        np.ndarray: The generated image(s) as a numpy array in RGB format with values
+            ranging from 0 to 255.

     -------
     Example:
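
For reference, the reworded flux_image_inpainting contract in a minimal sketch; the arrays, mask region, and prompt are fabricated, and the prompt-first argument order is an assumption not shown in this hunk:

    import numpy as np
    from vision_agent.tools import flux_image_inpainting

    image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder RGB input
    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[128:384, 128:384] = 1  # 1 marks pixels to inpaint, 0 marks pixels to keep

    result = flux_image_inpainting("a red sports car", image, mask)
    print(result.shape, result.dtype)  # RGB array with values ranging from 0 to 255
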
@@ -2352,6 +2348,7 @@ FUNCTION_TOOLS = [
     closest_box_distance,
     qwen2_vl_images_vqa,
     qwen2_vl_video_vqa,
+    video_temporal_localization,
 ]

 UTIL_TOOLS = [
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.184
+Version: 0.2.186
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
 ## Table of Contents
 - [🚀Quick Start](#quick-start)
 - [📚Documentation](#documentation)
-- [🔍🤖VisionAgent](#vision-agent-basic-usage)
+- [🔍🤖VisionAgent](#visionagent-basic-usage)
 - [🛠️Tools](#tools)
 - [🤖LMMs](#lmms)
-- [💻🤖VisionAgent Coder](#vision-agent-coder)
+- [💻🤖VisionAgent Coder](#visionagent-coder)
 - [🏗️Additional Backends](#additional-backends)

 ## Quick Start
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
 vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=kHeBjiVvncQJeL_Gni84bgHOCgxko4XO7otpt8IyWU4,83610
+vision_agent/tools/tools.py,sha256=-oq8jzITi-yVYJ3ut5MuGJ65jd3ESRtHfw4SCAruMps,83059
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
-vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.184.dist-info/METADATA,sha256=n8BeCLsPCBXDsr0FCmRBtScseMyJ8TuR68MWlqeO9Is,18330
-vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.184.dist-info/RECORD,,
+vision_agent-0.2.186.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.186.dist-info/METADATA,sha256=NQfESIRsq9-QWyPzNkyv6dSuRS6TGe5D2tZH4iJpeBU,18328
+vision_agent-0.2.186.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.186.dist-info/RECORD,,