vision-agent 0.2.101__tar.gz → 0.2.103__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. {vision_agent-0.2.101 → vision_agent-0.2.103}/PKG-INFO +1 -1
  2. {vision_agent-0.2.101 → vision_agent-0.2.103}/pyproject.toml +1 -1
  3. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder.py +10 -1
  4. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/__init__.py +1 -0
  5. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/tools.py +47 -0
  6. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/image_utils.py +17 -0
  7. {vision_agent-0.2.101 → vision_agent-0.2.103}/LICENSE +0 -0
  8. {vision_agent-0.2.101 → vision_agent-0.2.103}/README.md +0 -0
  9. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/__init__.py +0 -0
  10. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/__init__.py +0 -0
  11. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/agent.py +0 -0
  12. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/agent_utils.py +0 -0
  13. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent.py +0 -0
  14. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  15. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_prompts.py +0 -0
  16. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/meta_tools.py +0 -0
  25. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/meta_tools_types.py +0 -0
  26. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/prompts.py +0 -0
  27. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/tool_utils.py +0 -0
  28. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/__init__.py +0 -0
  29. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/exceptions.py +0 -0
  30. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/execute.py +0 -0
  31. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.101
3
+ Version: 0.2.103
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.101"
7
+ version = "0.2.103"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -735,8 +735,17 @@ class VisionAgentCoder(Agent):
735
735
 
736
736
  if self.verbosity >= 1:
737
737
  for p in plans:
738
+ # tabulate will fail if the keys are not the same for all elements
739
+ p_fixed = [
740
+ {
741
+ "instructions": (
742
+ e["instructions"] if "instructions" in e else ""
743
+ )
744
+ }
745
+ for e in plans[p]
746
+ ]
738
747
  _LOGGER.info(
739
- f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
748
+ f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
740
749
  )
741
750
 
742
751
  tool_infos = retrieve_tools(
@@ -19,6 +19,7 @@ from .tools import (
19
19
  florencev2_image_caption,
20
20
  florencev2_object_detection,
21
21
  florencev2_roberta_vqa,
22
+ florencev2_ocr,
22
23
  generate_pose_image,
23
24
  generate_soft_edge_image,
24
25
  get_tool_documentation,
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
28
28
  denormalize_bbox,
29
29
  get_image_size,
30
30
  normalize_bbox,
31
+ convert_quad_box_to_bbox,
31
32
  rle_decode,
32
33
  )
33
34
 
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
652
653
  return return_data
653
654
 
654
655
 
656
+ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
657
+ """'florencev2_ocr' is a tool that can detect text and text regions in an image.
658
+ Each text region contains one line of text. It returns a list of detected text,
659
+ the text region as a bounding box with normalized coordinates, and confidence
660
+ scores. The results are sorted from top-left to bottom-right.
661
+
662
+ Parameters:
663
+ image (np.ndarray): The image to extract text from.
664
+
665
+ Returns:
666
+ List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
667
+ with normalized coordinates, and confidence score.
668
+
669
+ Example
670
+ -------
671
+ >>> florencev2_ocr(image)
672
+ [
673
+ {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
674
+ ]
675
+ """
676
+
677
+ image_size = image.shape[:2]
678
+ image_b64 = convert_to_b64(image)
679
+ data = {
680
+ "image": image_b64,
681
+ "task": "<OCR_WITH_REGION>",
682
+ "function_name": "florencev2_ocr",
683
+ }
684
+
685
+ detections = send_inference_request(data, "florence2", v2=True)
686
+ detections = detections["<OCR_WITH_REGION>"]
687
+ return_data = []
688
+ for i in range(len(detections["quad_boxes"])):
689
+ return_data.append(
690
+ {
691
+ "label": detections["labels"][i],
692
+ "bbox": normalize_bbox(
693
+ convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
694
+ ),
695
+ "score": 1.0,
696
+ }
697
+ )
698
+ return return_data
699
+
700
+
655
701
  def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
656
702
  """'detr_segmentation' is a tool that can segment common objects in an
657
703
  image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1294,7 @@ TOOLS = [
1248
1294
  loca_visual_prompt_counting,
1249
1295
  florencev2_roberta_vqa,
1250
1296
  florencev2_image_caption,
1297
+ florencev2_ocr,
1251
1298
  detr_segmentation,
1252
1299
  depth_anything_v2,
1253
1300
  generate_soft_edge_image,
@@ -140,6 +140,23 @@ def denormalize_bbox(
140
140
  return bbox
141
141
 
142
142
 
143
+ def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
144
+ r"""Convert a quadrilateral bounding box to a rectangular bounding box.
145
+
146
+ Parameters:
147
+ quad_box: the quadrilateral bounding box
148
+
149
+ Returns:
150
+ The rectangular bounding box
151
+ """
152
+ x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
153
+ x_min = min(x1, x2, x3, x4)
154
+ x_max = max(x1, x2, x3, x4)
155
+ y_min = min(y1, y2, y3, y4)
156
+ y_max = max(y1, y2, y3, y4)
157
+ return [x_min, y_min, x_max, y_max]
158
+
159
+
143
160
  def overlay_bboxes(
144
161
  image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
145
162
  ) -> ImageType:
File without changes
File without changes