vision-agent 0.2.22__tar.gz → 0.2.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {vision_agent-0.2.22 → vision_agent-0.2.24}/PKG-INFO +4 -2
  2. {vision_agent-0.2.22 → vision_agent-0.2.24}/pyproject.toml +4 -2
  3. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/agent_coder.py +19 -5
  4. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent_v2.py +13 -4
  5. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent_v2_prompt.py +4 -3
  6. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/llm/llm.py +7 -4
  7. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/tools_v2.py +80 -2
  8. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/sim.py +6 -1
  9. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/type_defs.py +1 -1
  10. {vision_agent-0.2.22 → vision_agent-0.2.24}/LICENSE +0 -0
  11. {vision_agent-0.2.22 → vision_agent-0.2.24}/README.md +0 -0
  12. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/__init__.py +0 -0
  13. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/__init__.py +0 -0
  14. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/agent.py +0 -0
  15. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/agent_coder_prompts.py +0 -0
  16. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/easytool.py +0 -0
  17. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/easytool_prompts.py +0 -0
  18. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/reflexion.py +0 -0
  19. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/reflexion_prompts.py +0 -0
  20. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent.py +0 -0
  21. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent_prompts.py +0 -0
  22. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/fonts/__init__.py +0 -0
  23. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  24. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/llm/__init__.py +0 -0
  25. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/lmm/__init__.py +0 -0
  26. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/lmm/lmm.py +0 -0
  27. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/__init__.py +1 -1
  28. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/prompts.py +0 -0
  29. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/tool_utils.py +0 -0
  30. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/tools.py +0 -0
  31. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/__init__.py +0 -0
  32. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/execute.py +0 -0
  33. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/image_utils.py +0 -0
  34. {vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.22 → vision_agent-0.2.24}/PKG-INFO
@@ -1,14 +1,16 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.22
+Version: 0.2.24
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
-Requires-Python: >=3.9
+Requires-Python: >=3.9,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
+Requires-Dist: langsmith (>=0.1.58,<0.2.0)
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
 Requires-Dist: nbclient (>=0.10.0,<0.11.0)
 Requires-Dist: nbformat (>=5.10.4,<6.0.0)
{vision_agent-0.2.22 → vision_agent-0.2.24}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.22"
+version = "0.2.24"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
@@ -16,7 +16,7 @@ packages = [{include = "vision_agent"}]
 "documentation" = "https://github.com/landing-ai/vision-agent"
 
 [tool.poetry.dependencies]  # main dependency group
-python = ">=3.9"
+python = ">=3.9,<4.0"
 numpy = ">=1.21.0,<2.0.0"
 pillow = "10.*"
 requests = "2.*"
@@ -32,6 +32,8 @@ scipy = "1.13.*"
 nbclient = "^0.10.0"
 nbformat = "^5.10.4"
 rich = "^13.7.1"
+langsmith = "^0.1.58"
+ipykernel = "^6.29.4"
 
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/agent_coder.py
@@ -5,6 +5,9 @@ import sys
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 
+from rich.console import Console
+from rich.syntax import Syntax
+
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_coder_prompts import (
     DEBUG,
@@ -40,6 +43,7 @@ from vision_agent.tools.tools_v2 import *
 logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
 _EXECUTE = Execute()
+_CONSOLE = Console()
 
 
 def write_tests(question: str, code: str, model: LLM) -> str:
@@ -103,7 +107,7 @@ def run_visual_tests(
 
 
 def fix_bugs(code: str, tests: str, result: str, feedback: str, model: LLM) -> str:
-    prompt = FIX_BUG.format(completion=code, test_case=tests, result=result)
+    prompt = FIX_BUG.format(code=code, tests=tests, result=result, feedback=feedback)
     completion = model(prompt)
     return preprocess_data(completion)
 
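The keyword fix above matters because str.format raises KeyError when a template placeholder is not supplied by name, and the old call also dropped feedback entirely. A minimal sketch of the failure mode; the placeholder names in this stand-in template are assumptions inferred from the new call site:

    # Stand-in for the FIX_BUG prompt template (placeholder names assumed
    # from the new call's keyword arguments).
    FIX_BUG = "Code:\n{code}\nTests:\n{tests}\nResult:\n{result}\nFeedback:\n{feedback}"

    # New call: every placeholder is supplied, including feedback.
    prompt = FIX_BUG.format(code="...", tests="...", result="...", feedback="...")

    # The old keywords fail against the same template on the first missing name.
    try:
        FIX_BUG.format(completion="...", test_case="...", result="...")
    except KeyError as err:
        print(f"unfilled placeholder: {err}")  # unfilled placeholder: 'code'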
@@ -139,7 +143,8 @@ class AgentCoder(Agent):
             else visual_tester_agent
         )
         self.max_turns = 3
-        if verbose:
+        self.verbose = verbose
+        if self.verbose:
             _LOGGER.setLevel(logging.INFO)
 
     def __call__(
@@ -164,9 +169,15 @@ class AgentCoder(Agent):
         feedback = ""
         for _ in range(self.max_turns):
             code = write_program(question, feedback, self.coder_agent)
-            _LOGGER.info(f"code:\n{code}")
+            if self.verbose:
+                _CONSOLE.print(
+                    Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+                )
             debug = write_debug(question, code, feedback, self.tester_agent)
-            _LOGGER.info(f"debug:\n{debug}")
+            if self.verbose:
+                _CONSOLE.print(
+                    Syntax(debug, "python", theme="gruvbox-dark", line_numbers=True)
+                )
             results = execute_tests(code, debug)
             _LOGGER.info(
                 f"execution results: passed: {results['passed']}\n{results['result']}"
@@ -176,7 +187,10 @@ class AgentCoder(Agent):
                 code = fix_bugs(
                     code, debug, results["result"].strip(), feedback, self.coder_agent  # type: ignore
                 )
-                _LOGGER.info(f"fixed code:\n{code}")
+                if self.verbose:
+                    _CONSOLE.print(
+                        Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
+                    )
             else:
                 # TODO: Sometimes it prints nothing, so we need to handle that case
                 # TODO: The visual agent reflection does not work very well, needs more testing
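The repeated _CONSOLE.print(Syntax(...)) blocks replace plain log lines with syntax-highlighted output. A minimal standalone sketch of the same rich pattern:

    from rich.console import Console
    from rich.syntax import Syntax

    console = Console()
    generated_code = "def add(a: int, b: int) -> int:\n    return a + b\n"

    # Renders the string as highlighted Python with line numbers, using the
    # same theme the agent passes in its verbose path.
    console.print(
        Syntax(generated_code, "python", theme="gruvbox-dark", line_numbers=True)
    )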
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent_v2.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import pandas as pd
+from langsmith import traceable
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -66,6 +67,7 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     return json_dict  # type: ignore
 
 
+@traceable(name="planning")
 def write_plan(
     chat: List[Dict[str, str]],
     plan: Optional[List[Dict[str, Any]]],
@@ -214,6 +216,7 @@ def write_and_exec_code(
     return success, code, result, working_memory
 
 
+@traceable(name="plan execution")
 def run_plan(
     user_req: str,
     plan: List[Dict[str, Any]],
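The new @traceable decorators record write_plan and run_plan calls as runs in LangSmith. A minimal sketch of how such a decorated function behaves, assuming tracing is enabled through the standard LangSmith environment variables:

    import os
    from langsmith import traceable

    # Assumed setup: without these, the decorator records nothing.
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    # os.environ["LANGCHAIN_API_KEY"] = "..."  # needed to actually ship runs

    @traceable(name="planning")
    def plan(user_request: str) -> list:
        # Arguments and the return value are captured on the recorded run.
        return [{"instruction": user_request}]

    plan("count the cars in the image")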
@@ -235,9 +238,11 @@ def run_plan(
             f"""
 {tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
-        tool_info = "\n".join(
-            [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
-        )
+        tools = tool_recommender.top_k(task["instruction"], thresh=0.3)
+        tool_info = "\n".join([e["doc"] for e in tools])
+
+        if verbosity == 2:
+            _LOGGER.info(f"Tools retrieved: {[e['desc'] for e in tools]}")
 
         if long_term_memory is not None:
             retrieved_ltm = "\n".join(
@@ -283,6 +288,7 @@ class VisionAgentV2(Agent):
     solve vision tasks. It is inspired by MetaGPT's Data Interpreter
     https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it
     generate code:
+
     - A planner to generate a plan of tasks to solve a user requirement. The planner
     can output code tasks or test tasks, where test tasks are used to verify the code.
     - Automatic debugging, if a task fails, the agent will attempt to debug the code
@@ -331,6 +337,7 @@ class VisionAgentV2(Agent):
         results = self.chat_with_workflow(input, image, plan)
         return results["code"]  # type: ignore
 
+    @traceable
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
@@ -375,7 +382,9 @@ class VisionAgentV2(Agent):
                 self.long_term_memory,
                 self.verbosity,
             )
-            success = all(task["success"] for task in plan)
+            success = all(
+                task["success"] if "success" in task else False for task in plan
+            )
             working_memory.update(working_memory_i)
 
             if not success:
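The rewritten success check tolerates plan items that never recorded a result. A one-line equivalent using dict.get:

    # A task missing the "success" key now counts as a failure instead of
    # raising KeyError; dict.get expresses the same thing more compactly.
    plan = [{"success": True}, {}]  # second task never ran to completion
    assert all(task.get("success", False) for task in plan) is False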
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/agent/vision_agent_v2_prompt.py
@@ -34,7 +34,7 @@ PLAN = """
 
 # Task:
 Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements:
-- For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
+- For each subtask, you should provide instructions on what to do. Write detailed subtasks, ensure they are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
 - If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
@@ -73,9 +73,10 @@ CODE = """
 {code}
 
 # Constraints
-- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
-- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
 - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
+- Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file.
 - Write clean, readable, and well-documented code.
 
 # Output
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/llm/llm.py
@@ -3,6 +3,7 @@ import os
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, List, Mapping, Optional, Union, cast
 
+from langsmith.wrappers import wrap_openai
 from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
@@ -41,9 +42,9 @@ class OpenAILLM(LLM):
         **kwargs: Any
     ):
         if not api_key:
-            self.client = OpenAI()
+            self.client = wrap_openai(OpenAI())
         else:
-            self.client = OpenAI(api_key=api_key)
+            self.client = wrap_openai(OpenAI(api_key=api_key))
 
         self.model_name = model_name
         self.system_prompt = system_prompt
@@ -165,8 +166,10 @@ class AzureOpenAILLM(OpenAILLM):
         if not azure_endpoint:
             raise ValueError("Azure OpenAI endpoint is required.")
 
-        self.client = AzureOpenAI(
-            api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+        self.client = wrap_openai(
+            AzureOpenAI(
+                api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint
+            )
         )
         self.model_name = model_name
         self.kwargs = kwargs
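wrap_openai returns a drop-in replacement client: the same methods and signatures, with each completion call additionally logged to LangSmith when tracing is configured. A minimal sketch; the model name and prompt are illustrative:

    from langsmith.wrappers import wrap_openai
    from openai import OpenAI

    # Same interface as a bare OpenAI client, so downstream code such as
    # OpenAILLM does not change beyond the constructor.
    client = wrap_openai(OpenAI())

    response = client.chat.completions.create(
        model="gpt-4-turbo",  # illustrative
        messages=[{"role": "user", "content": "Describe this tool in one line."}],
    )
    print(response.choices[0].message.content)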
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/tools_v2.py
@@ -1,15 +1,17 @@
 import inspect
 import io
+import json
 import logging
 import tempfile
 from importlib import resources
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple, Union
+from typing import Any, Callable, Dict, List, Tuple, Union, cast
 
 import numpy as np
 import pandas as pd
 import requests
 from PIL import Image, ImageDraw, ImageFont
+from scipy.spatial import distance  # type: ignore
 
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
@@ -233,9 +235,82 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     return output
 
 
+def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
+    """'closest_mask_distance' calculates the closest distance between two masks.
+
+    Parameters:
+        mask1 (np.ndarray): The first mask.
+        mask2 (np.ndarray): The second mask.
+
+    Returns:
+        float: The closest distance between the two masks.
+
+    Example
+    -------
+    >>> closest_mask_distance(mask1, mask2)
+    0.5
+    """
+
+    mask1 = np.clip(mask1, 0, 1)
+    mask2 = np.clip(mask2, 0, 1)
+    mask1_points = np.transpose(np.nonzero(mask1))
+    mask2_points = np.transpose(np.nonzero(mask2))
+    dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
+    return cast(float, np.min(dist_matrix))
+
+
+def closest_box_distance(box1: List[float], box2: List[float]) -> float:
+    """'closest_box_distance' calculates the closest distance between two bounding boxes.
+
+    Parameters:
+        box1 (List[float]): The first bounding box.
+        box2 (List[float]): The second bounding box.
+
+    Returns:
+        float: The closest distance between the two bounding boxes.
+
+    Example
+    -------
+    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+    141.42
+    """
+
+    x11, y11, x12, y12 = box1
+    x21, y21, x22, y22 = box2
+
+    horizontal_distance = np.max([0, x21 - x12, x11 - x22])
+    vertical_distance = np.max([0, y21 - y12, y11 - y22])
+    return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2))
+
+
 # Utility and visualization functions
 
 
+def save_json(data: Any, file_path: str) -> None:
+    """'save_json' is a utility function that saves data as a JSON file. It is helpful
+    for saving data that contains NumPy arrays which are not JSON serializable.
+
+    Parameters:
+        data (Any): The data to save.
+        file_path (str): The path to save the JSON file.
+
+    Example
+    -------
+    >>> save_json(data, "path/to/file.json")
+    """
+
+    class NumpyEncoder(json.JSONEncoder):
+        def default(self, obj: Any):  # type: ignore
+            if isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, np.bool_):
+                return bool(obj)
+            return json.JSONEncoder.default(self, obj)
+
+    with open(file_path, "w") as f:
+        json.dump(data, f, cls=NumpyEncoder)
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.
 
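A quick worked check of the closest_box_distance docstring example: the boxes are separated by a gap of 100 pixels on each axis, so the closest distance is sqrt(100^2 + 100^2) ≈ 141.42.

    import numpy as np

    box1 = [100, 100, 200, 200]
    box2 = [300, 300, 400, 400]
    x11, y11, x12, y12 = box1
    x21, y21, x22, y22 = box2

    # Per-axis gap is zero when the boxes overlap on that axis.
    horizontal = max(0, x21 - x12, x11 - x22)  # 300 - 200 = 100
    vertical = max(0, y21 - y12, y11 - y22)    # 300 - 200 = 100
    print(round(float(np.sqrt(horizontal**2 + vertical**2)), 2))  # 141.42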
@@ -429,6 +504,9 @@ TOOLS = [
     grounding_sam,
     extract_frames,
     ocr,
+    closest_mask_distance,
+    closest_box_distance,
+    save_json,
     load_image,
     save_image,
     overlay_bounding_boxes,
@@ -438,5 +516,5 @@ TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image, overlay_bounding_boxes]
+    [save_json, load_image, save_image, overlay_bounding_boxes]
 )
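save_json exists because plain json.dump raises TypeError on NumPy values; the nested encoder converts arrays and booleans first. A small usage sketch with the two types the encoder handles:

    import numpy as np
    from vision_agent.tools.tools_v2 import save_json

    # Mixed payload: plain json.dump would fail on the array and np.bool_.
    data = {
        "label": "car",
        "scores": np.array([0.9, 0.75]),  # serialized via .tolist()
        "passed": np.bool_(True),         # serialized as true
    }
    save_json(data, "results.json")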
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/sim.py
@@ -56,12 +56,15 @@ class Sim:
         df = df.drop("embs", axis=1)
         df.to_csv(sim_file / "df.csv", index=False)
 
-    def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
+    def top_k(
+        self, query: str, k: int = 5, thresh: Optional[float] = None
+    ) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
 
         Parameters:
             query: str: The query to compare to.
             k: int: The number of items to return.
+            thresh: Optional[float]: The minimum similarity threshold.
 
         Returns:
             Sequence[Dict]: The top k most similar items.
@@ -70,6 +73,8 @@ class Sim:
         embedding = get_embedding(self.client, query, model=self.model)
         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
         res = self.df.sort_values("sim", ascending=False).head(k)
+        if thresh is not None:
+            res = res[res.sim > thresh]
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
 
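With the new thresh parameter, top_k still ranks every row by cosine similarity and takes the top k, then discards any of those below the cutoff; this is what lets run_plan call top_k(..., thresh=0.3) to drop weakly related tools. A standalone sketch of the same filter over hypothetical embeddings:

    import numpy as np
    import pandas as pd
    from scipy.spatial.distance import cosine

    # Hypothetical stand-in for the Sim dataframe's embedding column.
    df = pd.DataFrame({
        "desc": ["ocr", "save_json", "extract_frames"],
        "embs": [np.array([1.0, 0.0]), np.array([0.2, 0.98]), np.array([0.0, 1.0])],
    })
    query_emb = np.array([1.0, 0.0])

    df["sim"] = df.embs.apply(lambda x: 1 - cosine(x, query_emb))
    res = df.sort_values("sim", ascending=False).head(2)  # sims ~1.00 and ~0.20
    res = res[res.sim > 0.3]  # the new threshold keeps only "ocr"
    print(res[["desc", "sim"]])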
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/utils/type_defs.py
@@ -12,7 +12,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="land_sk_hw34v3tyEc35OAhP8F7hnGnrDv2C8hD2ycMyq0aMkVS1H40D22",
+        default="land_sk_PCRPYKqB3cq0JWGY83hjEk33SWSDOwdNoyUjTgCDMZO4NxeCXW",
         alias="LANDINGAI_API_KEY",
         description="The API key of LandingAI.",
     )
{vision_agent-0.2.22 → vision_agent-0.2.24}/vision_agent/tools/__init__.py
@@ -3,8 +3,8 @@ from .tools import (  # Counter,
     CLIP,
     OCR,
     TOOLS,
-    BboxStats,
     BboxIoU,
+    BboxStats,
     BoxDistance,
     Crop,
     DINOv,