vision-agent 0.2.11__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {vision_agent-0.2.11 → vision_agent-0.2.12}/PKG-INFO +3 -2
  2. {vision_agent-0.2.11 → vision_agent-0.2.12}/README.md +2 -1
  3. {vision_agent-0.2.11 → vision_agent-0.2.12}/pyproject.toml +1 -1
  4. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/vision_agent.py +25 -13
  5. {vision_agent-0.2.11 → vision_agent-0.2.12}/LICENSE +0 -0
  6. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/__init__.py +0 -0
  7. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/__init__.py +0 -0
  8. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/agent.py +0 -0
  9. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/easytool.py +0 -0
  10. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/easytool_prompts.py +0 -0
  11. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/reflexion.py +0 -0
  12. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/reflexion_prompts.py +0 -0
  13. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/agent/vision_agent_prompts.py +0 -0
  14. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/fonts/__init__.py +0 -0
  15. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  16. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/image_utils.py +0 -0
  17. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/llm/__init__.py +0 -0
  18. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/llm/llm.py +0 -0
  19. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/lmm/__init__.py +0 -0
  20. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/lmm/lmm.py +0 -0
  21. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/__init__.py +0 -0
  22. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/prompts.py +0 -0
  23. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/tools.py +0 -0
  24. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/tools/video.py +0 -0
  25. {vision_agent-0.2.11 → vision_agent-0.2.12}/vision_agent/type_defs.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.11
3
+ Version: 0.2.12
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -150,7 +150,7 @@ you. For example:
150
150
 
151
151
  #### Custom Tools
152
152
  You can also add your own custom tools for your vision agent to use:
153
-
153
+
154
154
  ```python
155
155
  from vision_agent.tools import Tool, register_tool
156
156
  @register_tool
@@ -188,6 +188,7 @@ find an example that creates a custom tool for template matching [here](examples
188
188
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
189
189
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
190
190
  | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
191
+ | MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. |
191
192
  | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
192
193
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
193
194
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
@@ -122,7 +122,7 @@ you. For example:
122
122
 
123
123
  #### Custom Tools
124
124
  You can also add your own custom tools for your vision agent to use:
125
-
125
+
126
126
  ```python
127
127
  from vision_agent.tools import Tool, register_tool
128
128
  @register_tool
@@ -160,6 +160,7 @@ find an example that creates a custom tool for template matching [here](examples
160
160
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
161
161
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
162
162
  | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
163
+ | MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. |
163
164
  | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
164
165
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
165
166
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.11"
7
+ version = "0.2.12"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -489,6 +489,7 @@ class VisionAgent(Agent):
489
489
  image: Optional[Union[str, Path]] = None,
490
490
  reference_data: Optional[Dict[str, str]] = None,
491
491
  visualize_output: Optional[bool] = False,
492
+ self_reflection: Optional[bool] = True,
492
493
  ) -> str:
493
494
  """Invoke the vision agent.
494
495
 
@@ -501,6 +502,7 @@ class VisionAgent(Agent):
501
502
  {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
502
503
  where the bounding box coordinates are normalized.
503
504
  visualize_output: Whether to visualize the output.
505
+ self_reflection: Whether to run self reflection on the answer; if False, the reflection step is skipped.
504
506
 
505
507
  Returns:
506
508
  The result of the vision agent in text.
@@ -512,6 +514,7 @@ class VisionAgent(Agent):
512
514
  image=image,
513
515
  visualize_output=visualize_output,
514
516
  reference_data=reference_data,
517
+ self_reflection=self_reflection,
515
518
  )
516
519
 
517
520
  def log_progress(self, description: str) -> None:
@@ -538,6 +541,7 @@ class VisionAgent(Agent):
538
541
  image: Optional[Union[str, Path]] = None,
539
542
  reference_data: Optional[Dict[str, str]] = None,
540
543
  visualize_output: Optional[bool] = False,
544
+ self_reflection: Optional[bool] = True,
541
545
  ) -> Tuple[str, List[Dict]]:
542
546
  """Chat with the vision agent and return the final answer and all tool results.
543
547
 
@@ -550,6 +554,7 @@ class VisionAgent(Agent):
550
554
  {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
551
555
  where the bounding box coordinates are normalized.
552
556
  visualize_output: Whether to visualize the output.
557
+ self_reflection: Whether to run self reflection on the answer; if False, the reflection step is skipped.
553
558
 
554
559
  Returns:
555
560
  A tuple where the first item is the final answer and the second item is a
@@ -625,20 +630,25 @@ class VisionAgent(Agent):
625
630
  reflection_images = [image]
626
631
  else:
627
632
  reflection_images = None
628
- reflection = self_reflect(
629
- self.reflect_model,
630
- question,
631
- self.tools,
632
- all_tool_results,
633
- final_answer,
634
- reflection_images,
635
- )
636
- self.log_progress(f"Reflection: {reflection}")
637
- parsed_reflection = parse_reflect(reflection)
638
- if parsed_reflection["Finish"]:
639
- break
633
+
634
+ if self_reflection:
635
+ reflection = self_reflect(
636
+ self.reflect_model,
637
+ question,
638
+ self.tools,
639
+ all_tool_results,
640
+ final_answer,
641
+ reflection_images,
642
+ )
643
+ self.log_progress(f"Reflection: {reflection}")
644
+ parsed_reflection = parse_reflect(reflection)
645
+ if parsed_reflection["Finish"]:
646
+ break
647
+ else:
648
+ reflections += "\n" + parsed_reflection["Reflection"]
640
649
  else:
641
- reflections += "\n" + parsed_reflection["Reflection"]
650
+ self.log_progress("Self Reflection skipped based on user request.")
651
+ break
642
652
  # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
643
653
  self.log_progress(
644
654
  f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
@@ -660,12 +670,14 @@ class VisionAgent(Agent):
660
670
  image: Optional[Union[str, Path]] = None,
661
671
  reference_data: Optional[Dict[str, str]] = None,
662
672
  visualize_output: Optional[bool] = False,
673
+ self_reflection: Optional[bool] = True,
663
674
  ) -> str:
664
675
  answer, _ = self.chat_with_workflow(
665
676
  chat,
666
677
  image=image,
667
678
  visualize_output=visualize_output,
668
679
  reference_data=reference_data,
680
+ self_reflection=self_reflection,
669
681
  )
670
682
  return answer
671
683
 
File without changes