vision-agent 0.2.101__py3-none-any.whl → 0.2.103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder.py +10 -1
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +47 -0
- vision_agent/utils/image_utils.py +17 -0
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/METADATA +1 -1
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/RECORD +8 -8
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/WHEEL +0 -0
@@ -735,8 +735,17 @@ class VisionAgentCoder(Agent):
|
|
735
735
|
|
736
736
|
if self.verbosity >= 1:
|
737
737
|
for p in plans:
|
738
|
+
# tabulate will fail if the keys are not the same for all elements
|
739
|
+
p_fixed = [
|
740
|
+
{
|
741
|
+
"instructions": (
|
742
|
+
e["instructions"] if "instructions" in e else ""
|
743
|
+
)
|
744
|
+
}
|
745
|
+
for e in plans[p]
|
746
|
+
]
|
738
747
|
_LOGGER.info(
|
739
|
-
f"\n{tabulate(tabular_data=
|
748
|
+
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
740
749
|
)
|
741
750
|
|
742
751
|
tool_infos = retrieve_tools(
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
|
|
28
28
|
denormalize_bbox,
|
29
29
|
get_image_size,
|
30
30
|
normalize_bbox,
|
31
|
+
convert_quad_box_to_bbox,
|
31
32
|
rle_decode,
|
32
33
|
)
|
33
34
|
|
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
|
|
652
653
|
return return_data
|
653
654
|
|
654
655
|
|
656
|
+
def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
    Each text region contains one line of text. It returns a list of detected text,
    the text region as a bounding box with normalized coordinates, and confidence
    scores. The results are sorted from top-left to bottom right.

    Parameters:
        image (np.ndarray): The image to extract text from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
            with normalized coordinates, and confidence score. The score is
            currently always 1.0.

    Example
    -------
        >>> florencev2_ocr(image)
        [
            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
        ]
    """

    # First two entries of the array shape; presumably (height, width), which is
    # what normalize_bbox is expected to take -- TODO confirm against its signature.
    image_size = image.shape[:2]
    payload = {
        "image": convert_to_b64(image),
        "task": "<OCR_WITH_REGION>",
        "function_name": "florencev2_ocr",
    }

    response = send_inference_request(payload, "florence2", v2=True)
    # The service keys its result by the task token that was requested.
    regions = response["<OCR_WITH_REGION>"]
    quad_boxes = regions["quad_boxes"]
    labels = regions["labels"]

    # No sorting happens here: results keep the order returned by the service.
    # Labels are indexed (rather than zipped) so that a response with fewer
    # labels than boxes still fails loudly instead of being silently truncated.
    return [
        {
            "label": labels[idx],
            # Each region arrives as a quadrilateral; collapse it to an
            # axis-aligned box before normalizing by the image size.
            "bbox": normalize_bbox(convert_quad_box_to_bbox(quad_box), image_size),
            # The score is a fixed placeholder, not a value read from the response.
            "score": 1.0,
        }
        for idx, quad_box in enumerate(quad_boxes)
    ]
|
699
|
+
|
700
|
+
|
655
701
|
def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
|
656
702
|
"""'detr_segmentation' is a tool that can segment common objects in an
|
657
703
|
image without any text prompt. It returns a list of detected objects
|
@@ -1248,6 +1294,7 @@ TOOLS = [
|
|
1248
1294
|
loca_visual_prompt_counting,
|
1249
1295
|
florencev2_roberta_vqa,
|
1250
1296
|
florencev2_image_caption,
|
1297
|
+
florencev2_ocr,
|
1251
1298
|
detr_segmentation,
|
1252
1299
|
depth_anything_v2,
|
1253
1300
|
generate_soft_edge_image,
|
@@ -140,6 +140,23 @@ def denormalize_bbox(
|
|
140
140
|
return bbox
|
141
141
|
|
142
142
|
|
143
|
+
def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
    r"""Convert a quadrilateral bounding box to a rectangular bounding box.

    The result is the smallest axis-aligned rectangle that contains all four
    corners of the quadrilateral.

    Parameters:
        quad_box: the quadrilateral bounding box, given as eight numbers laid
            out as [x1, y1, x2, y2, x3, y3, x4, y4]

    Returns:
        The rectangular bounding box as [x_min, y_min, x_max, y_max]

    Raises:
        ValueError: if quad_box does not contain exactly eight values
    """
    # Materialize once so any iterable of eight numbers is accepted, as with
    # the tuple-unpacking this replaces.
    coords = list(quad_box)
    if len(coords) != 8:
        raise ValueError(
            "quad_box must contain exactly 8 values "
            f"[x1, y1, x2, y2, x3, y3, x4, y4], got {len(coords)}"
        )

    # Coordinates are interleaved: even positions are x, odd positions are y.
    xs = coords[0::2]
    ys = coords[1::2]
    return [min(xs), min(ys), max(xs), max(ys)]
|
158
|
+
|
159
|
+
|
143
160
|
def overlay_bboxes(
|
144
161
|
image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
|
145
162
|
) -> ImageType:
|
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9K
|
|
3
3
|
vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
|
4
4
|
vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
|
5
5
|
vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=A3x1vb2iYq1Bi0AfUodFh2b0w9G0XfN0Kq0gjY8f5f0,30700
|
7
7
|
vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
|
8
8
|
vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
|
9
9
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -14,20 +14,20 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
14
14
|
vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
|
15
15
|
vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
|
16
16
|
vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
|
17
|
-
vision_agent/tools/__init__.py,sha256=
|
17
|
+
vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
|
18
18
|
vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
|
19
19
|
vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
|
20
20
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
21
21
|
vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
|
22
|
-
vision_agent/tools/tools.py,sha256=
|
22
|
+
vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
|
23
23
|
vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
|
24
24
|
vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
|
25
25
|
vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
|
26
|
-
vision_agent/utils/image_utils.py,sha256=
|
26
|
+
vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
|
27
27
|
vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
|
28
28
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
29
|
vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
30
|
+
vision_agent-0.2.103.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
31
|
+
vision_agent-0.2.103.dist-info/METADATA,sha256=DfZa2bcKHvQxsgAJRBdIEpPGdBjt18TuOwMzXOUIV_w,10729
|
32
|
+
vision_agent-0.2.103.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
33
|
+
vision_agent-0.2.103.dist-info/RECORD,,
|
File without changes
|
File without changes
|