vision-agent 0.2.184__tar.gz → 0.2.186__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35)
  1. {vision_agent-0.2.184 → vision_agent-0.2.186}/PKG-INFO +3 -3
  2. {vision_agent-0.2.184 → vision_agent-0.2.186}/README.md +2 -2
  3. {vision_agent-0.2.184 → vision_agent-0.2.186}/pyproject.toml +1 -1
  4. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/tools.py +70 -73
  5. {vision_agent-0.2.184 → vision_agent-0.2.186}/LICENSE +0 -0
  6. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/__init__.py +0 -0
  7. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/__init__.py +0 -0
  8. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/agent.py +0 -0
  9. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/agent_utils.py +0 -0
  10. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent.py +0 -0
  11. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent_coder.py +0 -0
  12. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  13. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent_planner.py +0 -0
  14. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  15. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/agent/vision_agent_prompts.py +0 -0
  16. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/__init__.py +0 -0
  25. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/meta_tools.py +0 -0
  26. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/tool_utils.py +0 -0
  28. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/execute.py +0 -0
  32. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/image_utils.py +0 -0
  33. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/sim.py +0 -0
  34. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/type_defs.py +0 -0
  35. {vision_agent-0.2.184 → vision_agent-0.2.186}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.184
3
+ Version: 0.2.186
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
63
63
  ## Table of Contents
64
64
  - [🚀Quick Start](#quick-start)
65
65
  - [📚Documentation](#documentation)
66
- - [🔍🤖VisionAgent](#vision-agent-basic-usage)
66
+ - [🔍🤖VisionAgent](#visionagent-basic-usage)
67
67
  - [🛠️Tools](#tools)
68
68
  - [🤖LMMs](#lmms)
69
- - [💻🤖VisionAgent Coder](#vision-agent-coder)
69
+ - [💻🤖VisionAgent Coder](#visionagent-coder)
70
70
  - [🏗️Additional Backends](#additional-backends)
71
71
 
72
72
  ## Quick Start
@@ -21,10 +21,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
21
21
  ## Table of Contents
22
22
  - [🚀Quick Start](#quick-start)
23
23
  - [📚Documentation](#documentation)
24
- - [🔍🤖VisionAgent](#vision-agent-basic-usage)
24
+ - [🔍🤖VisionAgent](#visionagent-basic-usage)
25
25
  - [🛠️Tools](#tools)
26
26
  - [🤖LMMs](#lmms)
27
- - [💻🤖VisionAgent Coder](#vision-agent-coder)
27
+ - [💻🤖VisionAgent Coder](#visionagent-coder)
28
28
  - [🏗️Additional Backends](#additional-backends)
29
29
 
30
30
  ## Quick Start
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.184"
7
+ version = "0.2.186"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
28
28
  send_task_inference_request,
29
29
  )
30
30
  from vision_agent.tools.tools_types import (
31
- Florence2FtRequest,
32
31
  JobStatus,
33
32
  ODResponseData,
34
- PromptTask,
35
33
  )
36
34
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
37
35
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
421
419
  if image.shape[0] < 1 or image.shape[1] < 1:
422
420
  return []
423
421
 
422
+ buffer_bytes = numpy_to_bytes(image)
423
+ files = [("image", buffer_bytes)]
424
+ payload = {
425
+ "prompt": prompt,
426
+ "model": "florence2sam2",
427
+ }
428
+ metadata = {"function_name": "florence2_sam2_image"}
429
+
424
430
  if fine_tune_id is not None:
425
- image_b64 = convert_to_b64(image)
426
431
  landing_api = LandingPublicAPI()
427
432
  status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
428
433
  if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
430
435
  f"Fine-tuned model {fine_tune_id} is not ready yet"
431
436
  )
432
437
 
433
- req_data_obj = Florence2FtRequest(
434
- image=image_b64,
435
- task=PromptTask.PHRASE_GROUNDING,
436
- prompt=prompt,
437
- postprocessing="sam2",
438
- job_id=UUID(fine_tune_id),
439
- )
440
- req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
441
- detections_ft = send_inference_request(
442
- req_data,
443
- "florence2-ft",
444
- v2=True,
445
- is_form=True,
446
- metadata_payload={"function_name": "florence2_sam2_image"},
447
- )
448
- # get the first frame
449
- detection = detections_ft[0]
450
- return_data = []
451
- for i in range(len(detection["bboxes"])):
452
- return_data.append(
453
- {
454
- "score": 1.0,
455
- "label": detection["labels"][i],
456
- "bbox": normalize_bbox(
457
- detection["bboxes"][i], detection["masks"][i]["size"]
458
- ),
459
- "mask": rle_decode_array(detection["masks"][i]),
460
- }
461
- )
462
- return return_data
438
+ payload["jobId"] = fine_tune_id
463
439
 
464
- buffer_bytes = numpy_to_bytes(image)
465
- files = [("image", buffer_bytes)]
466
- payload = {
467
- "prompts": [s.strip() for s in prompt.split(",")],
468
- "function_name": "florence2_sam2_image",
469
- }
470
- detections: Dict[str, Any] = send_inference_request(
471
- payload, "florence2-sam2", files=files, v2=True
440
+ detections = send_task_inference_request(
441
+ payload,
442
+ "text-to-instance-segmentation",
443
+ files=files,
444
+ metadata=metadata,
472
445
  )
473
446
 
447
+ # get the first frame
448
+ frame = detections[0]
474
449
  return_data = []
475
- for _, data_i in detections["0"].items():
476
- mask = rle_decode_array(data_i["mask"])
477
- label = data_i["label"]
478
- bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
450
+ for detection in frame:
451
+ mask = rle_decode_array(detection["mask"])
452
+ label = detection["label"]
453
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
479
454
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
480
455
  return return_data
481
456
 
482
457
 
483
458
  def florence2_sam2_video_tracking(
484
- prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
459
+ prompt: str,
460
+ frames: List[np.ndarray],
461
+ chunk_length: Optional[int] = 3,
462
+ fine_tune_id: Optional[str] = None,
485
463
  ) -> List[List[Dict[str, Any]]]:
486
464
  """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
487
465
  entities in a video given a text prompt such as category names or referring
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
494
472
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
495
473
  chunk_length (Optional[int]): The number of frames to re-run florence2 to find
496
474
  new objects.
475
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
476
+ fine-tuned model ID here to use it.
497
477
 
498
478
  Returns:
499
479
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
519
499
  ...
520
500
  ]
521
501
  """
502
+ if len(frames) == 0:
503
+ raise ValueError("No frames provided")
522
504
 
523
505
  buffer_bytes = frames_to_bytes(frames)
524
506
  files = [("video", buffer_bytes)]
525
507
  payload = {
526
- "prompts": [s.strip() for s in prompt.split(",")],
527
- "function_name": "florence2_sam2_video_tracking",
508
+ "prompt": prompt,
509
+ "model": "florence2sam2",
528
510
  }
511
+ metadata = {"function_name": "florence2_sam2_video_tracking"}
512
+
529
513
  if chunk_length is not None:
530
- payload["chunk_length"] = chunk_length # type: ignore
531
- data: Dict[str, Any] = send_inference_request(
532
- payload, "florence2-sam2", files=files, v2=True
514
+ payload["chunk_length_frames"] = chunk_length # type: ignore
515
+
516
+ if fine_tune_id is not None:
517
+ landing_api = LandingPublicAPI()
518
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
519
+ if status is not JobStatus.SUCCEEDED:
520
+ raise FineTuneModelIsNotReady(
521
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
522
+ )
523
+
524
+ payload["jobId"] = fine_tune_id
525
+
526
+ detections = send_task_inference_request(
527
+ payload,
528
+ "text-to-instance-segmentation",
529
+ files=files,
530
+ metadata=metadata,
533
531
  )
532
+
534
533
  return_data = []
535
- for frame_i in data.keys():
534
+ for frame in detections:
536
535
  return_frame_data = []
537
- for obj_id, data_j in data[frame_i].items():
538
- mask = rle_decode_array(data_j["mask"])
539
- label = obj_id + ": " + data_j["label"]
536
+ for detection in frame:
537
+ mask = rle_decode_array(detection["mask"])
538
+ label = str(detection["id"]) + ": " + detection["label"]
540
539
  return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
541
540
  return_data.append(return_frame_data)
542
541
  return return_data
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
552
551
 
553
552
  Returns:
554
553
  List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
555
- with nornmalized coordinates, and confidence score.
554
+ with normalized coordinates, and confidence score.
556
555
 
557
556
  Example
558
557
  -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
608
607
 
609
608
  Returns:
610
609
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
611
- value, e.g. {count: 12} and a heat map for visaulization purposes.
610
+ value, e.g. {count: 12} and a heat map for visualization purposes.
612
611
 
613
612
  Example
614
613
  -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
647
646
 
648
647
  Returns:
649
648
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
650
- value, e.g. {count: 12} and a heat map for visaulization purposes.
649
+ value, e.g. {count: 12} and a heat map for visualization purposes.
651
650
 
652
651
  Example
653
652
  -------
@@ -1058,23 +1057,25 @@ def video_temporal_localization(
1058
1057
  prompt: str,
1059
1058
  frames: List[np.ndarray],
1060
1059
  model: str = "qwen2vl",
1061
- chunk_length: Optional[float] = None,
1062
- chunk_length_seconds: Optional[float] = None,
1063
1060
  chunk_length_frames: Optional[int] = 2,
1064
1061
  ) -> List[float]:
1065
- """'video_temporal_localization' is a tool that can find objects in a video given a question about it.
1066
- It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
1062
+ """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
1063
+ value selected for the video. It can detect multiple objects independently per
1064
+ chunk_length_frames given a text prompt such as a referring expression
1065
+ but does not track objects across frames.
1066
+ It returns a list of floats with a value of 1.0 if the objects are found in a given
1067
+ chunk_length_frames of the video.
1067
1068
 
1068
1069
  Parameters:
1069
1070
  prompt (str): The question about the video
1070
1071
  frames (List[np.ndarray]): The reference frames used for the question
1071
- model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
1072
- chunk_length (Optional[float]): length of each chunk in seconds
1073
- chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
1072
+ model (str): The model to use for the inference. Valid values are
1073
+ 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
1074
1074
  chunk_length_frames (Optional[int]): length of each chunk in frames
1075
1075
 
1076
1076
  Returns:
1077
- List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
1077
+ List[float]: A list of floats with a value of 1.0 if the objects to be found
1078
+ are present in the chunk_length_frames of the video.
1078
1079
 
1079
1080
  Example
1080
1081
  -------
@@ -1089,10 +1090,6 @@ def video_temporal_localization(
1089
1090
  "model": model,
1090
1091
  "function_name": "video_temporal_localization",
1091
1092
  }
1092
- if chunk_length is not None:
1093
- payload["chunk_length"] = chunk_length
1094
- if chunk_length_seconds is not None:
1095
- payload["chunk_length_seconds"] = chunk_length_seconds
1096
1093
  if chunk_length_frames is not None:
1097
1094
  payload["chunk_length_frames"] = chunk_length_frames
1098
1095
 
@@ -1791,9 +1788,8 @@ def flux_image_inpainting(
1791
1788
  where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
1792
1789
 
1793
1790
  Returns:
1794
- np.ndarray:
1795
- The generated image(s) as a numpy array in RGB format
1796
- with values ranging from 0 to 255.
1791
+ np.ndarray: The generated image(s) as a numpy array in RGB format with values
1792
+ ranging from 0 to 255.
1797
1793
 
1798
1794
  -------
1799
1795
  Example:
@@ -2352,6 +2348,7 @@ FUNCTION_TOOLS = [
2352
2348
  closest_box_distance,
2353
2349
  qwen2_vl_images_vqa,
2354
2350
  qwen2_vl_video_vqa,
2351
+ video_temporal_localization,
2355
2352
  ]
2356
2353
 
2357
2354
  UTIL_TOOLS = [
File without changes