vision-agent 0.2.155__tar.gz → 0.2.156__tar.gz

Files changed (33)
  1. {vision_agent-0.2.155 → vision_agent-0.2.156}/PKG-INFO +1 -1
  2. {vision_agent-0.2.155 → vision_agent-0.2.156}/pyproject.toml +1 -1
  3. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/agent_utils.py +6 -0
  4. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent.py +0 -2
  5. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder.py +7 -3
  6. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_prompts.py +7 -6
  7. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/__init__.py +0 -1
  8. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/meta_tools.py +3 -1
  9. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tools.py +58 -59
  10. {vision_agent-0.2.155 → vision_agent-0.2.156}/LICENSE +0 -0
  11. {vision_agent-0.2.155 → vision_agent-0.2.156}/README.md +0 -0
  12. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  16. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/prompts.py +0 -0
  25. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tool_utils.py +1 -1
  26. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/tools/tools_types.py +1 -1
  27. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/execute.py +0 -0
  30. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/image_utils.py +0 -0
  31. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.155 → vision_agent-0.2.156}/vision_agent/utils/video.py +0 -0
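
In brief: this release bumps the version to 0.2.156. agent_utils.py adds a remove_installs_from_code helper that strips stray `!pip install` lines from generated code, and vision_agent_coder.py now runs both generated code and tests through it. The conversation prompts gain an explicit rule to keep agent responses short, with the example responses trimmed to match. vision_agent.py drops a redundant artifacts load/save after downloading artifacts. In tools.py, the public overlay_counting_results tool is removed in favor of an internal _plot_counting helper that overlay_bounding_boxes now calls automatically when a frame has more than 20 boxes; save_image and save_video gain input validation; the florence2_phrase_grounding docstring is reworded; and the now-unused ImageEnhance import is dropped. meta_tools.py clarifies the artifact-loaded message, and tool_utils.py and tools_types.py reorder imports.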
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.155
+Version: 0.2.156
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.155"
+version = "0.2.156"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent/agent/agent_utils.py
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
     if code.startswith("python\n"):
         code = code[len("python\n") :]
     return code
+
+
+def remove_installs_from_code(code: str) -> str:
+    pattern = r"\n!pip install.*?(\n|\Z)\n"
+    code = re.sub(pattern, "", code, flags=re.DOTALL)
+    return code
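
For context, a minimal standalone sketch of what the new helper does; it assumes `re` is already imported at the top of agent_utils.py, which the hunk does not show:

import re

def remove_installs_from_code(code: str) -> str:
    # Drop notebook-style "!pip install ..." lines that a model may emit into
    # generated code; they are invalid outside an IPython environment.
    pattern = r"\n!pip install.*?(\n|\Z)\n"
    return re.sub(pattern, "", code, flags=re.DOTALL)

snippet = "import cv2\n\n!pip install opencv-python\n\nprint('ok')\n"
print(remove_installs_from_code(snippet))  # the import and print lines survive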
vision_agent/agent/vision_agent.py
@@ -407,8 +407,6 @@ class VisionAgent(Agent):
             code_interpreter.download_file(
                 str(remote_artifacts_path.name), str(self.local_artifacts_path)
             )
-            artifacts.load(self.local_artifacts_path)
-            artifacts.save()
         return orig_chat, artifacts
 
     def streaming_message(self, message: Dict[str, Any]) -> None:
vision_agent/agent/vision_agent_coder.py
@@ -13,7 +13,11 @@ from tabulate import tabulate
 
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+    extract_code,
+    extract_json,
+    remove_installs_from_code,
+)
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
@@ -836,8 +840,8 @@ class VisionAgentCoder(Agent):
                 media=media_list,
             )
             success = cast(bool, results["success"])
-            code = cast(str, results["code"])
-            test = cast(str, results["test"])
+            code = remove_installs_from_code(cast(str, results["code"]))
+            test = remove_installs_from_code(cast(str, results["test"]))
             working_memory.extend(results["working_memory"]) # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
vision_agent/agent/vision_agent_prompts.py
@@ -28,7 +28,8 @@ Here is the current conversation so far:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:
 
 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -62,7 +63,7 @@ OBSERVATION:
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -91,7 +92,7 @@ OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE2 = """
@@ -157,16 +158,16 @@ OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
vision_agent/tools/__init__.py
@@ -45,7 +45,6 @@ from .tools import (
     loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
-    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
vision_agent/tools/meta_tools.py
@@ -116,7 +116,9 @@ class Artifacts:
         )
         output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n"
+            output_str += (
+                f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
+            )
         output_str += "[End of artifacts]\n"
         print(output_str)
         return output_str
vision_agent/tools/tools.py
@@ -13,7 +13,7 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
 from pytube import YouTube # type: ignore
 
@@ -1150,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding' will run florence2 on a image. It can
-    detect multiple objects given a text prompt which can be object names or caption.
-    You can optionally separate the object names in the text with commas. It returns
-    a list of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
@@ -1812,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     """
     from IPython.display import display
 
+    if not isinstance(image, np.ndarray) or (
+        image.shape[0] == 0 and image.shape[1] == 0
+    ):
+        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
+
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
     display(pil_image)
     pil_image.save(file_path)
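
A standalone sketch of the new guard's behavior (`_validate_image` is an illustrative name; in the package the check is inlined in `save_image`). Note the `and`: only an array whose height and width are both zero is rejected.

import numpy as np

def _validate_image(image) -> None:
    # Mirrors the guard added to save_image in the hunk above.
    if not isinstance(image, np.ndarray) or (
        image.shape[0] == 0 and image.shape[1] == 0
    ):
        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")

_validate_image(np.zeros((480, 640, 3), dtype=np.uint8))  # passes
_validate_image(np.zeros((0, 0, 3), dtype=np.uint8))      # raises ValueError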
@@ -1838,6 +1843,15 @@ def save_video(
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
+    if not isinstance(frames, list) or len(frames) == 0:
+        raise ValueError("Frames must be a list of NumPy arrays")
+
+    for frame in frames:
+        if not isinstance(frame, np.ndarray) or (
+            frame.shape[0] == 0 and frame.shape[1] == 0
+        ):
+            raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
+
     if output_video_path is None:
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
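
With these checks, invalid input fails fast instead of producing a corrupt video file. A hedged usage sketch follows; the full save_video signature is not shown in this hunk, so the calls are left as comments:

import numpy as np
# from vision_agent.tools import save_video

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(24)]
# save_video(frames, fps=24.0)   # OK: non-empty list of H x W x C arrays
# save_video([], fps=24.0)       # raises: "Frames must be a list of NumPy arrays"
# save_video(frames, fps=0)      # raises: fps must be greater than 0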
@@ -1907,30 +1921,36 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        width, height = pil_image.size
-        fontsize = max(12, int(min(width, height) / 40))
-        draw = ImageDraw.Draw(pil_image)
-        font = ImageFont.truetype(
-            str(
-                resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")
-            ),
-            fontsize,
-        )
-
-        for elt in bboxes:
-            label = elt["label"]
-            box = elt["bbox"]
-            scores = elt["score"]
-
-            # denormalize the box if it is normalized
-            box = denormalize_bbox(box, (height, width))
-            draw.rectangle(box, outline=color[label], width=4)
-            text = f"{label}: {scores:.2f}"
-            text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-            draw.rectangle(
-                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+        if len(bboxes) > 20:
+            pil_image = _plot_counting(pil_image, bboxes, color)
+        else:
+            width, height = pil_image.size
+            fontsize = max(12, int(min(width, height) / 40))
+            draw = ImageDraw.Draw(pil_image)
+            font = ImageFont.truetype(
+                str(
+                    resources.files("vision_agent.fonts").joinpath(
+                        "default_font_ch_en.ttf"
+                    )
+                ),
+                fontsize,
             )
-            draw.text((box[0], box[1]), text, fill="black", font=font)
+
+            for elt in bboxes:
+                label = elt["label"]
+                box = elt["bbox"]
+                scores = elt["score"]
+
+                # denormalize the box if it is normalized
+                box = denormalize_bbox(box, (height, width))
+                draw.rectangle(box, outline=color[label], width=4)
+                text = f"{label}: {scores:.2f}"
+                text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+                draw.rectangle(
+                    (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+                )
+                draw.text((box[0], box[1]), text, fill="black", font=font)
+
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
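
The effect of the new branch: dense detection results no longer produce unreadable stacks of labeled rectangles. A hedged usage sketch from the caller's side; the public call shape `overlay_bounding_boxes(image, bboxes)` is assumed rather than shown in this hunk, so the call is left commented:

import numpy as np
# from vision_agent.tools import overlay_bounding_boxes

# 25 detections of one class: len(bboxes) > 20, so the per-frame loop above
# renders numbered markers via _plot_counting instead of labeled rectangles.
dets = [
    {"score": 0.99, "label": "apple", "bbox": [0.03 * i, 0.1, 0.03 * i + 0.04, 0.2]}
    for i in range(25)
]
# img = overlay_bounding_boxes(np.zeros((480, 640, 3), dtype=np.uint8), dets)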
@@ -2089,39 +2109,19 @@ def overlay_heat_map(
     return np.array(combined)
 
 
-def overlay_counting_results(
-    image: np.ndarray, instances: List[Dict[str, Any]]
-) -> np.ndarray:
-    """'overlay_counting_results' is a utility function that displays counting results on
-    an image.
-
-    Parameters:
-        image (np.ndarray): The image to display the bounding boxes on.
-        instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
-            box information of each instance
-
-    Returns:
-        np.ndarray: The image with the instance_id dislpayed
-
-    Example
-    -------
-    >>> image_with_bboxes = overlay_counting_results(
-        image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-    )
-    """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
-    color = (158, 218, 229)
-
-    width, height = pil_image.size
+def _plot_counting(
+    image: Image.Image,
+    bboxes: List[Dict[str, Any]],
+    colors: Dict[str, Tuple[int, int, int]],
+) -> Image.Image:
+    width, height = image.size
     fontsize = max(10, int(min(width, height) / 80))
-    pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
-    draw = ImageDraw.Draw(pil_image)
+    draw = ImageDraw.Draw(image)
     font = ImageFont.truetype(
         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
         fontsize,
     )
-
-    for i, elt in enumerate(instances, 1):
+    for i, elt in enumerate(bboxes, 1):
         label = f"{i}"
         box = elt["bbox"]
 
@@ -2143,7 +2143,7 @@ def overlay_counting_results(
         text_y1 = cy + text_height / 2
 
         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
 
         # Draw the text at the center of the bounding box
         draw.text(
@@ -2154,7 +2154,7 @@
             anchor="lt",
         )
 
-    return np.array(pil_image)
+    return image
 
 
 FUNCTION_TOOLS = [
@@ -2187,7 +2187,6 @@ UTIL_TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    overlay_counting_results,
 ]
 
 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
-from uuid import UUID
 from typing import List, Optional, Tuple, Union
+from uuid import UUID
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 