vision-agent 0.2.101__py3-none-any.whl → 0.2.103__py3-none-any.whl
- vision_agent/agent/vision_agent_coder.py +10 -1
- vision_agent/tools/__init__.py +1 -0
- vision_agent/tools/tools.py +47 -0
- vision_agent/utils/image_utils.py +17 -0
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/METADATA +1 -1
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/RECORD +8 -8
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder.py
CHANGED
@@ -735,8 +735,17 @@ class VisionAgentCoder(Agent):
 
         if self.verbosity >= 1:
             for p in plans:
+                # tabulate will fail if the keys are not the same for all elements
+                p_fixed = [
+                    {
+                        "instructions": (
+                            e["instructions"] if "instructions" in e else ""
+                        )
+                    }
+                    for e in plans[p]
+                ]
                 _LOGGER.info(
-                    f"\n{tabulate(tabular_data=
+                    f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
 
         tool_infos = retrieve_tools(
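The comment in the hunk explains the motivation: the plans are rendered with tabulate using headers='keys', and plan steps that lack an "instructions" key would break that rendering. A minimal sketch of the same normalization outside the agent (the sample plans dict below is invented for illustration; 'mixed_grid' is the table format the coder uses):

from tabulate import tabulate

# Hypothetical plan data: the second step is missing the "instructions" key,
# which is exactly the case the new p_fixed list guards against.
plans = {
    "plan1": [
        {"instructions": "load the image"},
        {"note": "a step without an 'instructions' key"},
    ]
}

for p in plans:
    # Give every row the same single key so the table has uniform columns.
    p_fixed = [
        {"instructions": e["instructions"] if "instructions" in e else ""}
        for e in plans[p]
    ]
    print(tabulate(tabular_data=p_fixed, headers="keys", tablefmt="mixed_grid"))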
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
     denormalize_bbox,
     get_image_size,
     normalize_bbox,
+    convert_quad_box_to_bbox,
     rle_decode,
 )
 
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
     return return_data
 
 
+def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
+    Each text region contains one line of text. It returns a list of detected text,
+    the text region as a bounding box with normalized coordinates, and confidence
+    scores. The results are sorted from top-left to bottom-right.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+            with normalized coordinates, and confidence score.
+
+    Example
+    -------
+    >>> florencev2_ocr(image)
+    [
+        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "task": "<OCR_WITH_REGION>",
+        "function_name": "florencev2_ocr",
+    }
+
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<OCR_WITH_REGION>"]
+    return_data = []
+    for i in range(len(detections["quad_boxes"])):
+        return_data.append(
+            {
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(
+                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
+                ),
+                "score": 1.0,
+            }
+        )
+    return return_data
+
+
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     """'detr_segmentation' is a tool that can segment common objects in an
     image without any text prompt. It returns a list of detected objects
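For a quick usage sketch of the new tool: the snippet below assumes the load_image helper already exposed by tools.py and uses a placeholder file name; any RGB image passed as an np.ndarray works as input.

from vision_agent.tools.tools import florencev2_ocr, load_image

# load_image is assumed to be the package's image-loading helper;
# "receipt.jpg" is a placeholder path for any local image.
image = load_image("receipt.jpg")

# Each entry carries the detected text, a normalized [x1, y1, x2, y2] box, and a score.
for region in florencev2_ocr(image):
    print(region["label"], region["bbox"], region["score"])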
@@ -1248,6 +1294,7 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
+    florencev2_ocr,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
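Because florencev2_ocr is now listed in TOOLS, it should be picked up by the same tool registry that retrieve_tools draws from in the coder hunk above. A trivial check, using the module path shown in this diff:

from vision_agent.tools.tools import TOOLS, florencev2_ocr

# The new OCR tool sits in the registry alongside the other Florence-2 tools.
assert florencev2_ocr in TOOLS
print(florencev2_ocr.__doc__.splitlines()[0])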
vision_agent/utils/image_utils.py
CHANGED
@@ -140,6 +140,23 @@ def denormalize_bbox(
     return bbox
 
 
+def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
+    r"""Convert a quadrilateral bounding box to a rectangular bounding box.
+
+    Parameters:
+        quad_box: the quadrilateral bounding box
+
+    Returns:
+        The rectangular bounding box
+    """
+    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
+    x_min = min(x1, x2, x3, x4)
+    x_max = max(x1, x2, x3, x4)
+    y_min = min(y1, y2, y3, y4)
+    y_max = max(y1, y2, y3, y4)
+    return [x_min, y_min, x_max, y_max]
+
+
 def overlay_bboxes(
     image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
 ) -> ImageType:
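As a worked example of the new helper: a quadrilateral given as four (x, y) corner points collapses to its axis-aligned extent, [min x, min y, max x, max y]. The coordinates below are made up for illustration.

from vision_agent.utils.image_utils import convert_quad_box_to_bbox

# A slightly rotated text region with corners (10, 5), (90, 8), (92, 30), (12, 27).
quad_box = [10, 5, 90, 8, 92, 30, 12, 27]
print(convert_quad_box_to_bbox(quad_box))  # -> [10, 5, 92, 30]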
{vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/RECORD
CHANGED
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=qpduQ9YufJQfMmG6jwKC2xmlbtR2qK8_1eQC1sGA9K
 vision_agent/agent/agent.py,sha256=Bt8yhjCFXuRdZaHxKEesG40V09nWRt45sZluri1R3AA,575
 vision_agent/agent/agent_utils.py,sha256=JXdl2xz14LKQAmScY-MIW23AD2WBFCsnI0JS6dAyj3Q,1412
 vision_agent/agent/vision_agent.py,sha256=U7VqUR-Io0xkGHpcF03Kq87Y0YQIdZQGqxuXdwjQzgk,8441
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent_coder.py,sha256=A3x1vb2iYq1Bi0AfUodFh2b0w9G0XfN0Kq0gjY8f5f0,30700
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=a3R_vHlT2FW3-DSn4OWgzF9zEAx-uKM4ZaTi9Kn-K54,11116
 vision_agent/agent/vision_agent_prompts.py,sha256=hjs-m4ZHR7HE1HtOeX_1rOvTQA2FMEAqEkaBbGPBYDo,6072
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -14,20 +14,20 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=KcS6h-8whGFmwt7t4LNlj0hZ4U-rBojYBLKLmrMsF48,15075
 vision_agent/lmm/types.py,sha256=8TSRoTbXyCKVJiH-wHXI2OiGOMSkYv1vLGYeAXtNpOQ,153
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=MK0D8NtIChwGHwqsTz3LeV5BGuQecNVrNzUsyaEwuGA,1926
 vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
 vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=fgPE0VHfBiQPJKkslBm_hugTOyRT-Hnw7eztvC-l4_o,44661
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
-vision_agent/utils/image_utils.py,sha256=
+vision_agent/utils/image_utils.py,sha256=c_g5i_cFC0C-Yw9gU_NaVgQdmBlyumw3bLIDtCU42xo,8200
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.103.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.103.dist-info/METADATA,sha256=DfZa2bcKHvQxsgAJRBdIEpPGdBjt18TuOwMzXOUIV_w,10729
+vision_agent-0.2.103.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.103.dist-info/RECORD,,
{vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/LICENSE
File without changes
{vision_agent-0.2.101.dist-info → vision_agent-0.2.103.dist-info}/WHEEL
File without changes