vision-agent 0.2.184__py3-none-any.whl → 0.2.185__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
28
28
  send_task_inference_request,
29
29
  )
30
30
  from vision_agent.tools.tools_types import (
31
- Florence2FtRequest,
32
31
  JobStatus,
33
32
  ODResponseData,
34
- PromptTask,
35
33
  )
36
34
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
37
35
  from vision_agent.utils.execute import FileSerializer, MimeType
@@ -421,8 +419,15 @@ def florence2_sam2_image(
421
419
  if image.shape[0] < 1 or image.shape[1] < 1:
422
420
  return []
423
421
 
422
+ buffer_bytes = numpy_to_bytes(image)
423
+ files = [("image", buffer_bytes)]
424
+ payload = {
425
+ "prompt": prompt,
426
+ "model": "florence2sam2",
427
+ }
428
+ metadata = {"function_name": "florence2_sam2_image"}
429
+
424
430
  if fine_tune_id is not None:
425
- image_b64 = convert_to_b64(image)
426
431
  landing_api = LandingPublicAPI()
427
432
  status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
428
433
  if status is not JobStatus.SUCCEEDED:
@@ -430,58 +435,31 @@ def florence2_sam2_image(
430
435
  f"Fine-tuned model {fine_tune_id} is not ready yet"
431
436
  )
432
437
 
433
- req_data_obj = Florence2FtRequest(
434
- image=image_b64,
435
- task=PromptTask.PHRASE_GROUNDING,
436
- prompt=prompt,
437
- postprocessing="sam2",
438
- job_id=UUID(fine_tune_id),
439
- )
440
- req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
441
- detections_ft = send_inference_request(
442
- req_data,
443
- "florence2-ft",
444
- v2=True,
445
- is_form=True,
446
- metadata_payload={"function_name": "florence2_sam2_image"},
447
- )
448
- # get the first frame
449
- detection = detections_ft[0]
450
- return_data = []
451
- for i in range(len(detection["bboxes"])):
452
- return_data.append(
453
- {
454
- "score": 1.0,
455
- "label": detection["labels"][i],
456
- "bbox": normalize_bbox(
457
- detection["bboxes"][i], detection["masks"][i]["size"]
458
- ),
459
- "mask": rle_decode_array(detection["masks"][i]),
460
- }
461
- )
462
- return return_data
438
+ payload["jobId"] = fine_tune_id
463
439
 
464
- buffer_bytes = numpy_to_bytes(image)
465
- files = [("image", buffer_bytes)]
466
- payload = {
467
- "prompts": [s.strip() for s in prompt.split(",")],
468
- "function_name": "florence2_sam2_image",
469
- }
470
- detections: Dict[str, Any] = send_inference_request(
471
- payload, "florence2-sam2", files=files, v2=True
440
+ detections = send_task_inference_request(
441
+ payload,
442
+ "text-to-instance-segmentation",
443
+ files=files,
444
+ metadata=metadata,
472
445
  )
473
446
 
447
+ # get the first frame
448
+ frame = detections[0]
474
449
  return_data = []
475
- for _, data_i in detections["0"].items():
476
- mask = rle_decode_array(data_i["mask"])
477
- label = data_i["label"]
478
- bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
450
+ for detection in frame:
451
+ mask = rle_decode_array(detection["mask"])
452
+ label = detection["label"]
453
+ bbox = normalize_bbox(detection["bounding_box"], detection["mask"]["size"])
479
454
  return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
480
455
  return return_data
481
456
 
482
457
 
483
458
  def florence2_sam2_video_tracking(
484
- prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 3
459
+ prompt: str,
460
+ frames: List[np.ndarray],
461
+ chunk_length: Optional[int] = 3,
462
+ fine_tune_id: Optional[str] = None,
485
463
  ) -> List[List[Dict[str, Any]]]:
486
464
  """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
487
465
  entities in a video given a text prompt such as category names or referring
@@ -494,6 +472,8 @@ def florence2_sam2_video_tracking(
494
472
  frames (List[np.ndarray]): The list of frames to ground the prompt to.
495
473
  chunk_length (Optional[int]): The number of frames to re-run florence2 to find
496
474
  new objects.
475
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
476
+ fine-tuned model ID here to use it.
497
477
 
498
478
  Returns:
499
479
  List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -519,24 +499,43 @@ def florence2_sam2_video_tracking(
519
499
  ...
520
500
  ]
521
501
  """
502
+ if len(frames) == 0:
503
+ raise ValueError("No frames provided")
522
504
 
523
505
  buffer_bytes = frames_to_bytes(frames)
524
506
  files = [("video", buffer_bytes)]
525
507
  payload = {
526
- "prompts": [s.strip() for s in prompt.split(",")],
527
- "function_name": "florence2_sam2_video_tracking",
508
+ "prompt": prompt,
509
+ "model": "florence2sam2",
528
510
  }
511
+ metadata = {"function_name": "florence2_sam2_video_tracking"}
512
+
529
513
  if chunk_length is not None:
530
- payload["chunk_length"] = chunk_length # type: ignore
531
- data: Dict[str, Any] = send_inference_request(
532
- payload, "florence2-sam2", files=files, v2=True
514
+ payload["chunk_length_frames"] = chunk_length # type: ignore
515
+
516
+ if fine_tune_id is not None:
517
+ landing_api = LandingPublicAPI()
518
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
519
+ if status is not JobStatus.SUCCEEDED:
520
+ raise FineTuneModelIsNotReady(
521
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
522
+ )
523
+
524
+ payload["jobId"] = fine_tune_id
525
+
526
+ detections = send_task_inference_request(
527
+ payload,
528
+ "text-to-instance-segmentation",
529
+ files=files,
530
+ metadata=metadata,
533
531
  )
532
+
534
533
  return_data = []
535
- for frame_i in data.keys():
534
+ for frame in detections:
536
535
  return_frame_data = []
537
- for obj_id, data_j in data[frame_i].items():
538
- mask = rle_decode_array(data_j["mask"])
539
- label = obj_id + ": " + data_j["label"]
536
+ for detection in frame:
537
+ mask = rle_decode_array(detection["mask"])
538
+ label = str(detection["id"]) + ": " + detection["label"]
540
539
  return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
541
540
  return_data.append(return_frame_data)
542
541
  return return_data
@@ -552,7 +551,7 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
552
551
 
553
552
  Returns:
554
553
  List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
555
- with nornmalized coordinates, and confidence score.
554
+ with normalized coordinates, and confidence score.
556
555
 
557
556
  Example
558
557
  -------
@@ -608,7 +607,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
608
607
 
609
608
  Returns:
610
609
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
611
- value, e.g. {count: 12} and a heat map for visaulization purposes.
610
+ value, e.g. {count: 12} and a heat map for visualization purposes.
612
611
 
613
612
  Example
614
613
  -------
@@ -647,7 +646,7 @@ def loca_visual_prompt_counting(
647
646
 
648
647
  Returns:
649
648
  Dict[str, Any]: A dictionary containing the key 'count' and the count as a
650
- value, e.g. {count: 12} and a heat map for visaulization purposes.
649
+ value, e.g. {count: 12} and a heat map for visualization purposes.
651
650
 
652
651
  Example
653
652
  -------
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.184
3
+ Version: 0.2.185
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
20
20
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
23
- vision_agent/tools/tools.py,sha256=kHeBjiVvncQJeL_Gni84bgHOCgxko4XO7otpt8IyWU4,83610
23
+ vision_agent/tools/tools.py,sha256=us3fOV3JIqFB9WidEX6NT65HwJbIxhh59RRvUcMIshI,83251
24
24
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
25
25
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
26
26
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
29
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
30
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
31
31
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
32
- vision_agent-0.2.184.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
- vision_agent-0.2.184.dist-info/METADATA,sha256=n8BeCLsPCBXDsr0FCmRBtScseMyJ8TuR68MWlqeO9Is,18330
34
- vision_agent-0.2.184.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
- vision_agent-0.2.184.dist-info/RECORD,,
32
+ vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.185.dist-info/METADATA,sha256=Wgo1bRpQ3MgqxIDpBiN0Tj0YAUBwRtYCQ7DmhJwgKpY,18330
34
+ vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.185.dist-info/RECORD,,