vision-agent 0.2.101__tar.gz → 0.2.103__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.101 → vision_agent-0.2.103}/PKG-INFO +1 -1
- {vision_agent-0.2.101 → vision_agent-0.2.103}/pyproject.toml +1 -1
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder.py +10 -1
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/__init__.py +1 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/tools.py +47 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/image_utils.py +17 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/LICENSE +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/README.md +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/meta_tools_types.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/utils/video.py +0 -0
@@ -735,8 +735,17 @@ class VisionAgentCoder(Agent):
|
|
735
735
|
|
736
736
|
if self.verbosity >= 1:
|
737
737
|
for p in plans:
|
738
|
+
# tabulate will fail if the keys are not the same for all elements
|
739
|
+
p_fixed = [
|
740
|
+
{
|
741
|
+
"instructions": (
|
742
|
+
e["instructions"] if "instructions" in e else ""
|
743
|
+
)
|
744
|
+
}
|
745
|
+
for e in plans[p]
|
746
|
+
]
|
738
747
|
_LOGGER.info(
|
739
|
-
f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
748
|
+
f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
740
749
|
)
|
741
750
|
|
742
751
|
tool_infos = retrieve_tools(
|
@@ -28,6 +28,7 @@ from vision_agent.utils.image_utils import (
|
|
28
28
|
denormalize_bbox,
|
29
29
|
get_image_size,
|
30
30
|
normalize_bbox,
|
31
|
+
convert_quad_box_to_bbox,
|
31
32
|
rle_decode,
|
32
33
|
)
|
33
34
|
|
@@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
|
|
652
653
|
return return_data
|
653
654
|
|
654
655
|
|
656
|
+
def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
657
|
+
"""'florencev2_ocr' is a tool that can detect text and text regions in an image.
|
658
|
+
Each text region contains one line of text. It returns a list of detected text,
|
659
|
+
the text region as a bounding box with normalized coordinates, and confidence
|
660
|
+
scores. The results are sorted from top-left to bottom-right.
|
661
|
+
|
662
|
+
Parameters:
|
663
|
+
image (np.ndarray): The image to extract text from.
|
664
|
+
|
665
|
+
Returns:
|
666
|
+
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
667
|
+
with normalized coordinates, and confidence score.
|
668
|
+
|
669
|
+
Example
|
670
|
+
-------
|
671
|
+
>>> florencev2_ocr(image)
|
672
|
+
[
|
673
|
+
{'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
|
674
|
+
]
|
675
|
+
"""
|
676
|
+
|
677
|
+
image_size = image.shape[:2]
|
678
|
+
image_b64 = convert_to_b64(image)
|
679
|
+
data = {
|
680
|
+
"image": image_b64,
|
681
|
+
"task": "<OCR_WITH_REGION>",
|
682
|
+
"function_name": "florencev2_ocr",
|
683
|
+
}
|
684
|
+
|
685
|
+
detections = send_inference_request(data, "florence2", v2=True)
|
686
|
+
detections = detections["<OCR_WITH_REGION>"]
|
687
|
+
return_data = []
|
688
|
+
for i in range(len(detections["quad_boxes"])):
|
689
|
+
return_data.append(
|
690
|
+
{
|
691
|
+
"label": detections["labels"][i],
|
692
|
+
"bbox": normalize_bbox(
|
693
|
+
convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
|
694
|
+
),
|
695
|
+
"score": 1.0,
|
696
|
+
}
|
697
|
+
)
|
698
|
+
return return_data
|
699
|
+
|
700
|
+
|
655
701
|
def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
|
656
702
|
"""'detr_segmentation' is a tool that can segment common objects in an
|
657
703
|
image without any text prompt. It returns a list of detected objects
|
@@ -1248,6 +1294,7 @@ TOOLS = [
|
|
1248
1294
|
loca_visual_prompt_counting,
|
1249
1295
|
florencev2_roberta_vqa,
|
1250
1296
|
florencev2_image_caption,
|
1297
|
+
florencev2_ocr,
|
1251
1298
|
detr_segmentation,
|
1252
1299
|
depth_anything_v2,
|
1253
1300
|
generate_soft_edge_image,
|
@@ -140,6 +140,23 @@ def denormalize_bbox(
|
|
140
140
|
return bbox
|
141
141
|
|
142
142
|
|
143
|
+
def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
|
144
|
+
r"""Convert a quadrilateral bounding box to a rectangular bounding box.
|
145
|
+
|
146
|
+
Parameters:
|
147
|
+
quad_box: the quadrilateral bounding box
|
148
|
+
|
149
|
+
Returns:
|
150
|
+
The rectangular bounding box
|
151
|
+
"""
|
152
|
+
x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
|
153
|
+
x_min = min(x1, x2, x3, x4)
|
154
|
+
x_max = max(x1, x2, x3, x4)
|
155
|
+
y_min = min(y1, y2, y3, y4)
|
156
|
+
y_max = max(y1, y2, y3, y4)
|
157
|
+
return [x_min, y_min, x_max, y_max]
|
158
|
+
|
159
|
+
|
143
160
|
def overlay_bboxes(
|
144
161
|
image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
|
145
162
|
) -> ImageType:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.101 → vision_agent-0.2.103}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|