vision-agent 0.2.11__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.11 → vision_agent-0.2.12}/PKG-INFO +3 -2
- {vision_agent-0.2.11 → vision_agent-0.2.12}/README.md +2 -1
- {vision_agent-0.2.11 → vision_agent-0.2.12}/pyproject.toml +1 -1
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/vision_agent.py +25 -13
- {vision_agent-0.2.11 → vision_agent-0.2.12}/LICENSE +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/easytool.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/reflexion.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/image_utils.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/video.py +0 -0
- {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/type_defs.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.11
|
3
|
+
Version: 0.2.12
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -150,7 +150,7 @@ you. For example:
|
|
150
150
|
|
151
151
|
#### Custom Tools
|
152
152
|
You can also add your own custom tools for your vision agent to use:
|
153
|
-
|
153
|
+
|
154
154
|
```python
|
155
155
|
from vision_agent.tools import Tool, register_tool
|
156
156
|
@register_tool
|
@@ -188,6 +188,7 @@ find an example that creates a custom tool for template matching [here](examples
|
|
188
188
|
| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
|
189
189
|
| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
|
190
190
|
| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
|
191
|
+
| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units |
|
191
192
|
| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
|
192
193
|
| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
|
193
194
|
| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
|
@@ -122,7 +122,7 @@ you. For example:
|
|
122
122
|
|
123
123
|
#### Custom Tools
|
124
124
|
You can also add your own custom tools for your vision agent to use:
|
125
|
-
|
125
|
+
|
126
126
|
```python
|
127
127
|
from vision_agent.tools import Tool, register_tool
|
128
128
|
@register_tool
|
@@ -160,6 +160,7 @@ find an example that creates a custom tool for template matching [here](examples
|
|
160
160
|
| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
|
161
161
|
| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
|
162
162
|
| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
|
163
|
+
| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units |
|
163
164
|
| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
|
164
165
|
| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
|
165
166
|
| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
|
@@ -489,6 +489,7 @@ class VisionAgent(Agent):
|
|
489
489
|
image: Optional[Union[str, Path]] = None,
|
490
490
|
reference_data: Optional[Dict[str, str]] = None,
|
491
491
|
visualize_output: Optional[bool] = False,
|
492
|
+
self_reflection: Optional[bool] = True,
|
492
493
|
) -> str:
|
493
494
|
"""Invoke the vision agent.
|
494
495
|
|
@@ -501,6 +502,7 @@ class VisionAgent(Agent):
|
|
501
502
|
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
502
503
|
where the bounding box coordinates are normalized.
|
503
504
|
visualize_output: Whether to visualize the output.
|
505
|
+
self_reflection: boolean to enable and disable self reflection.
|
504
506
|
|
505
507
|
Returns:
|
506
508
|
The result of the vision agent in text.
|
@@ -512,6 +514,7 @@ class VisionAgent(Agent):
|
|
512
514
|
image=image,
|
513
515
|
visualize_output=visualize_output,
|
514
516
|
reference_data=reference_data,
|
517
|
+
self_reflection=self_reflection,
|
515
518
|
)
|
516
519
|
|
517
520
|
def log_progress(self, description: str) -> None:
|
@@ -538,6 +541,7 @@ class VisionAgent(Agent):
|
|
538
541
|
image: Optional[Union[str, Path]] = None,
|
539
542
|
reference_data: Optional[Dict[str, str]] = None,
|
540
543
|
visualize_output: Optional[bool] = False,
|
544
|
+
self_reflection: Optional[bool] = True,
|
541
545
|
) -> Tuple[str, List[Dict]]:
|
542
546
|
"""Chat with the vision agent and return the final answer and all tool results.
|
543
547
|
|
@@ -550,6 +554,7 @@ class VisionAgent(Agent):
|
|
550
554
|
{"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
|
551
555
|
where the bounding box coordinates are normalized.
|
552
556
|
visualize_output: Whether to visualize the output.
|
557
|
+
self_reflection: boolean to enable and disable self reflection.
|
553
558
|
|
554
559
|
Returns:
|
555
560
|
A tuple where the first item is the final answer and the second item is a
|
@@ -625,20 +630,25 @@ class VisionAgent(Agent):
|
|
625
630
|
reflection_images = [image]
|
626
631
|
else:
|
627
632
|
reflection_images = None
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
633
|
+
|
634
|
+
if self_reflection:
|
635
|
+
reflection = self_reflect(
|
636
|
+
self.reflect_model,
|
637
|
+
question,
|
638
|
+
self.tools,
|
639
|
+
all_tool_results,
|
640
|
+
final_answer,
|
641
|
+
reflection_images,
|
642
|
+
)
|
643
|
+
self.log_progress(f"Reflection: {reflection}")
|
644
|
+
parsed_reflection = parse_reflect(reflection)
|
645
|
+
if parsed_reflection["Finish"]:
|
646
|
+
break
|
647
|
+
else:
|
648
|
+
reflections += "\n" + parsed_reflection["Reflection"]
|
640
649
|
else:
|
641
|
-
|
650
|
+
self.log_progress("Self Reflection skipped based on user request.")
|
651
|
+
break
|
642
652
|
# '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
|
643
653
|
self.log_progress(
|
644
654
|
f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
|
@@ -660,12 +670,14 @@ class VisionAgent(Agent):
|
|
660
670
|
image: Optional[Union[str, Path]] = None,
|
661
671
|
reference_data: Optional[Dict[str, str]] = None,
|
662
672
|
visualize_output: Optional[bool] = False,
|
673
|
+
self_reflection: Optional[bool] = True,
|
663
674
|
) -> str:
|
664
675
|
answer, _ = self.chat_with_workflow(
|
665
676
|
chat,
|
666
677
|
image=image,
|
667
678
|
visualize_output=visualize_output,
|
668
679
|
reference_data=reference_data,
|
680
|
+
self_reflection=self_reflection,
|
669
681
|
)
|
670
682
|
return answer
|
671
683
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|