vision-agent 0.2.154__tar.gz → 0.2.156__tar.gz

Files changed (33)
  1. {vision_agent-0.2.154 → vision_agent-0.2.156}/PKG-INFO +1 -1
  2. {vision_agent-0.2.154 → vision_agent-0.2.156}/pyproject.toml +1 -1
  3. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/agent_utils.py +6 -0
  4. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent.py +0 -2
  5. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder.py +7 -3
  6. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_prompts.py +7 -6
  7. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/__init__.py +0 -1
  8. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/meta_tools.py +3 -1
  9. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tools.py +67 -64
  10. {vision_agent-0.2.154 → vision_agent-0.2.156}/LICENSE +0 -0
  11. {vision_agent-0.2.154 → vision_agent-0.2.156}/README.md +0 -0
  12. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  16. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/__init__.py +0 -0
  17. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/http.py +0 -0
  18. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/clients/landing_public_api.py +0 -0
  19. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/fonts/__init__.py +0 -0
  20. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  21. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/__init__.py +0 -0
  22. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/lmm.py +0 -0
  23. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/lmm/types.py +0 -0
  24. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/prompts.py +0 -0
  25. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tool_utils.py +1 -1
  26. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/tools/tools_types.py +1 -1
  27. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/__init__.py +0 -0
  28. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/exceptions.py +0 -0
  29. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/execute.py +0 -0
  30. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/image_utils.py +0 -0
  31. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.154 → vision_agent-0.2.156}/vision_agent/utils/video.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.154
+Version: 0.2.156
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.154"
+version = "0.2.156"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
vision_agent/agent/agent_utils.py
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
     if code.startswith("python\n"):
         code = code[len("python\n") :]
     return code
+
+
+def remove_installs_from_code(code: str) -> str:
+    pattern = r"\n!pip install.*?(\n|\Z)\n"
+    code = re.sub(pattern, "", code, flags=re.DOTALL)
+    return code
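
The new `remove_installs_from_code` helper strips stray "!pip install ..." lines that a model sometimes emits inside generated code. A minimal, self-contained sketch of its behavior, assuming `re` is already imported in agent_utils.py (the sample input below is illustrative only):

    import re

    def remove_installs_from_code(code: str) -> str:
        # Same regex as the helper above: with re.DOTALL it removes a
        # "!pip install ..." line along with the newline before it and
        # the blank line after it.
        pattern = r"\n!pip install.*?(\n|\Z)\n"
        return re.sub(pattern, "", code, flags=re.DOTALL)

    generated = "import cv2\n\n!pip install torch\n\nprint('ok')\n"
    print(remove_installs_from_code(generated))
    # import cv2
    # print('ok')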
vision_agent/agent/vision_agent.py
@@ -407,8 +407,6 @@ class VisionAgent(Agent):
         code_interpreter.download_file(
             str(remote_artifacts_path.name), str(self.local_artifacts_path)
         )
-        artifacts.load(self.local_artifacts_path)
-        artifacts.save()
         return orig_chat, artifacts
 
     def streaming_message(self, message: Dict[str, Any]) -> None:
vision_agent/agent/vision_agent_coder.py
@@ -13,7 +13,11 @@ from tabulate import tabulate
 
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+    extract_code,
+    extract_json,
+    remove_installs_from_code,
+)
 from vision_agent.agent.vision_agent_coder_prompts import (
     CODE,
     FIX_BUG,
@@ -836,8 +840,8 @@ class VisionAgentCoder(Agent):
                 media=media_list,
             )
            success = cast(bool, results["success"])
-            code = cast(str, results["code"])
-            test = cast(str, results["test"])
+            code = remove_installs_from_code(cast(str, results["code"]))
+            test = remove_installs_from_code(cast(str, results["test"]))
             working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
vision_agent/agent/vision_agent_prompts.py
@@ -28,7 +28,8 @@ Here is the current conversation so far:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:
 
 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -62,7 +63,7 @@ OBSERVATION:
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -91,7 +92,7 @@ OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE2 = """
@@ -157,16 +158,16 @@ OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
vision_agent/tools/__init__.py
@@ -45,7 +45,6 @@ from .tools import (
     loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
-    overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
vision_agent/tools/meta_tools.py
@@ -116,7 +116,9 @@ class Artifacts:
         )
         output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n"
+            output_str += (
+                f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
+            )
         output_str += "[End of artifacts]\n"
         print(output_str)
         return output_str
vision_agent/tools/tools.py
@@ -13,7 +13,7 @@ from uuid import UUID
 import cv2
 import numpy as np
 import requests
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
@@ -700,18 +700,22 @@ def countgd_counting(
            {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
        ]
     """
-    image_b64 = convert_to_b64(image)
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
-    payload = {"prompt": prompt, "image": image_b64}
+    payload = {"prompts": [prompt], "model": "countgd"}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
+    resp_data = send_task_inference_request(
+        payload, "text-to-object-detection", files=files, metadata=metadata
+    )
+    bboxes_per_frame = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in resp_data
+        for bbox in bboxes_per_frame
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
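
With this change `countgd_counting` uploads the image as multipart bytes to the shared "text-to-object-detection" endpoint instead of embedding base64 JSON, and the response becomes a list of detections per frame with boxes under a "bounding_box" key. A sketch of the new response handling, with a mocked payload whose exact shape is assumed from the hunk above:

    from typing import Any, Dict, List

    def parse_od_response(resp_data: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        # One list of detections per frame; a single image uses the first
        # (only) entry, and each detection stores its box under "bounding_box".
        bboxes_per_frame = resp_data[0]
        return [
            {
                "label": det["label"],
                "bbox": [round(x, 2) for x in det["bounding_box"]],
                "score": round(det["score"], 2),
            }
            for det in bboxes_per_frame
        ]

    # Mocked response, shape assumed from the diff:
    mock = [[{"label": "flower", "bounding_box": [0.44, 0.241, 0.49, 0.58], "score": 0.983}]]
    print(parse_od_response(mock))
    # [{'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58], 'score': 0.98}]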
vision_agent/tools/tools.py
@@ -1146,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding' will run florence2 on a image. It can
-    detect multiple objects given a text prompt which can be object names or caption.
-    You can optionally separate the object names in the text with commas. It returns
-    a list of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or caption. You
+    can optionally separate the object names in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
vision_agent/tools/tools.py
@@ -1808,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     """
     from IPython.display import display
 
+    if not isinstance(image, np.ndarray) or (
+        image.shape[0] == 0 and image.shape[1] == 0
+    ):
+        raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")
+
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
     display(pil_image)
     pil_image.save(file_path)
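
`save_image` now validates its input before handing it to PIL. A small sketch of what the new guard accepts and rejects (the `_validate_image` wrapper is hypothetical; the condition is copied from the hunk above):

    import numpy as np

    def _validate_image(image: object) -> None:
        # Hypothetical standalone version of the new check: reject
        # non-arrays, and arrays whose first two dimensions are both zero.
        if not isinstance(image, np.ndarray) or (
            image.shape[0] == 0 and image.shape[1] == 0
        ):
            raise ValueError("The image is not a valid NumPy array with shape (H, W, C)")

    _validate_image(np.zeros((480, 640, 3), dtype=np.uint8))  # passes
    _validate_image(np.empty((0, 0, 3), dtype=np.uint8))      # raises ValueError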
vision_agent/tools/tools.py
@@ -1834,6 +1843,15 @@ def save_video(
     if fps <= 0:
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
+    if not isinstance(frames, list) or len(frames) == 0:
+        raise ValueError("Frames must be a list of NumPy arrays")
+
+    for frame in frames:
+        if not isinstance(frame, np.ndarray) or (
+            frame.shape[0] == 0 and frame.shape[1] == 0
+        ):
+            raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)")
+
     if output_video_path is None:
         output_video_path = tempfile.NamedTemporaryFile(
             delete=False, suffix=".mp4"
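
`save_video` gets the same kind of guard for its frame list. An illustrative usage sketch (only the `frames` and `fps` parameters visible in this diff are used; the full signature is not shown here):

    import numpy as np
    from vision_agent.tools import save_video

    frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(30)]
    save_video(frames, fps=10)   # OK: a non-empty list of valid frames
    # save_video([], fps=10)     # raises: "Frames must be a list of NumPy arrays"
    # save_video(frames, fps=0)  # raises: "fps must be greater than 0 got 0"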
vision_agent/tools/tools.py
@@ -1903,30 +1921,36 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        width, height = pil_image.size
-        fontsize = max(12, int(min(width, height) / 40))
-        draw = ImageDraw.Draw(pil_image)
-        font = ImageFont.truetype(
-            str(
-                resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")
-            ),
-            fontsize,
-        )
-
-        for elt in bboxes:
-            label = elt["label"]
-            box = elt["bbox"]
-            scores = elt["score"]
-
-            # denormalize the box if it is normalized
-            box = denormalize_bbox(box, (height, width))
-            draw.rectangle(box, outline=color[label], width=4)
-            text = f"{label}: {scores:.2f}"
-            text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-            draw.rectangle(
-                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+        if len(bboxes) > 20:
+            pil_image = _plot_counting(pil_image, bboxes, color)
+        else:
+            width, height = pil_image.size
+            fontsize = max(12, int(min(width, height) / 40))
+            draw = ImageDraw.Draw(pil_image)
+            font = ImageFont.truetype(
+                str(
+                    resources.files("vision_agent.fonts").joinpath(
+                        "default_font_ch_en.ttf"
+                    )
+                ),
+                fontsize,
             )
-            draw.text((box[0], box[1]), text, fill="black", font=font)
+
+            for elt in bboxes:
+                label = elt["label"]
+                box = elt["bbox"]
+                scores = elt["score"]
+
+                # denormalize the box if it is normalized
+                box = denormalize_bbox(box, (height, width))
+                draw.rectangle(box, outline=color[label], width=4)
+                text = f"{label}: {scores:.2f}"
+                text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+                draw.rectangle(
+                    (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+                )
+                draw.text((box[0], box[1]), text, fill="black", font=font)
+
         frame_out.append(np.array(pil_image))
     return frame_out[0] if len(frame_out) == 1 else frame_out
 
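`overlay_bounding_boxes` now switches to the numbered counting rendering whenever a frame has more than 20 detections; below that threshold it keeps the labeled-rectangle path, which denormalizes each box before drawing. A hedged sketch of what the `denormalize_bbox` step does (an illustrative reimplementation; the real helper lives elsewhere in the package and may differ):

    from typing import List, Tuple

    def denormalize_bbox(bbox: List[float], image_size: Tuple[int, int]) -> List[float]:
        # Illustrative: if every coordinate looks normalized (within [0, 1]),
        # scale to pixels; image_size is (height, width) as in the call above.
        if all(0 <= v <= 1 for v in bbox):
            height, width = image_size
            return [bbox[0] * width, bbox[1] * height, bbox[2] * width, bbox[3] * height]
        return bbox

    print(denormalize_bbox([0.1, 0.2, 0.3, 0.4], (480, 640)))  # [64.0, 96.0, 192.0, 192.0]
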
vision_agent/tools/tools.py
@@ -2085,39 +2109,19 @@ def overlay_heat_map(
     return np.array(combined)
 
 
-def overlay_counting_results(
-    image: np.ndarray, instances: List[Dict[str, Any]]
-) -> np.ndarray:
-    """'overlay_counting_results' is a utility function that displays counting results on
-    an image.
-
-    Parameters:
-        image (np.ndarray): The image to display the bounding boxes on.
-        instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding
-            box information of each instance
-
-    Returns:
-        np.ndarray: The image with the instance_id dislpayed
-
-    Example
-    -------
-        >>> image_with_bboxes = overlay_counting_results(
-            image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-        )
-    """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
-    color = (158, 218, 229)
-
-    width, height = pil_image.size
+def _plot_counting(
+    image: Image.Image,
+    bboxes: List[Dict[str, Any]],
+    colors: Dict[str, Tuple[int, int, int]],
+) -> Image.Image:
+    width, height = image.size
     fontsize = max(10, int(min(width, height) / 80))
-    pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5)
-    draw = ImageDraw.Draw(pil_image)
+    draw = ImageDraw.Draw(image)
     font = ImageFont.truetype(
         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
         fontsize,
     )
-
-    for i, elt in enumerate(instances, 1):
+    for i, elt in enumerate(bboxes, 1):
         label = f"{i}"
         box = elt["bbox"]
 
@@ -2139,7 +2143,7 @@ def overlay_counting_results(
         text_y1 = cy + text_height / 2
 
         # Draw the rectangle encapsulating the text
-        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color)
+        draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]])
 
         # Draw the text at the center of the bounding box
         draw.text(
@@ -2150,7 +2154,7 @@ def overlay_counting_results(
             anchor="lt",
         )
 
-    return np.array(pil_image)
+    return image
 
 
 FUNCTION_TOOLS = [
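
`_plot_counting` draws each instance number inside a filled box centered on its bounding box, now colored per label instead of a fixed color. A sketch of the centering math implied by the context line `text_y1 = cy + text_height / 2` (the helper below is hypothetical; `cx`, `cy`, and the text metrics are assumed to come from the box center and font measurements):

    from typing import List, Tuple

    def centered_text_box(
        box: List[float], text_width: float, text_height: float
    ) -> Tuple[float, float, float, float]:
        # Hypothetical reconstruction: a rectangle the size of the text,
        # centered on the bounding box center (cx, cy).
        cx, cy = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
        return (
            cx - text_width / 2,
            cy - text_height / 2,
            cx + text_width / 2,
            cy + text_height / 2,
        )

    print(centered_text_box([100, 100, 200, 160], 18, 12))  # (141.0, 124.0, 159.0, 136.0)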
vision_agent/tools/tools.py
@@ -2183,7 +2187,6 @@ UTIL_TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    overlay_counting_results,
 ]
 
 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
-from uuid import UUID
 from typing import List, Optional, Tuple, Union
+from uuid import UUID
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 