vision-agent 0.2.90__py3-none-any.whl → 0.2.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,402 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Union
5
+
6
+ import vision_agent as va
7
+ from vision_agent.lmm.types import Message
8
+ from vision_agent.tools.tool_utils import get_tool_documentation
9
+ from vision_agent.tools.tools import TOOL_DESCRIPTIONS
10
+
11
+ # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
12
+
13
# Path of the file most recently opened or created by the editor tools below;
# stays None until `open_file` or `create_file` has been called.
CURRENT_FILE = None
# Line the last `open_file` window was centred on (after clamping).
CURRENT_LINE = 0
# Step used by the scroll tools and half-window used when previewing edits.
DEFAULT_WINDOW_SIZE = 100
# Port for pushing progress updates. Read from the environment, so it is a
# string when set and None when ZMQ_PORT is not defined.
ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
17
+
18
+
19
def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
    """Push one progress payload to a ZeroMQ endpoint on localhost.

    Parameters:
        port (int): TCP port on localhost to connect to.
        inp (Dict[str, Any]): Payload sent as JSON over a PUSH socket.
    """
    # Function-scope import: zmq is only loaded when this callback runs.
    import zmq

    endpoint = f"tcp://localhost:{port}"
    ctx = zmq.Context()
    push_socket = ctx.socket(zmq.PUSH)
    push_socket.connect(endpoint)
    push_socket.send_json(inp)
26
+
27
+
28
def filter_file(file_name: Union[str, Path]) -> bool:
    """Return True for an existing, non-hidden `.py` or `.txt` file whose path
    does not contain `__pycache__`."""
    candidate = Path(file_name)
    if not candidate.is_file():
        return False
    if "__pycache__" in str(candidate):
        return False
    if candidate.name.startswith("."):
        return False
    return candidate.suffix in (".py", ".txt")
36
+
37
+
38
def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str:
    """Generates python code to solve vision based tasks.

    Parameters:
        save_file (str): The file path to save the code.
        chat (str): The chat message from the user.
        media (List[str]): The media files to use.

    Returns:
        str: The generated code.

    Examples
    --------
        >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"])
        from vision_agent.tools import load_image, owl_v2
        def detect_dogs(image_path: str):
            image = load_image(image_path)
            dogs = owl_v2("dog", image)
            return dogs
    """

    # Progress events are only forwarded when a ZMQ port was configured.
    coder_kwargs: Dict[str, Any] = {}
    if ZMQ_PORT is not None:
        coder_kwargs["report_progress_callback"] = (
            lambda inp: report_progress_callback(int(ZMQ_PORT), inp)
        )
    agent = va.agent.VisionAgentCoder(**coder_kwargs)

    try:
        conversation: List[Message] = [
            {"role": "user", "content": chat, "media": media}
        ]
        code = agent.chat_with_workflow(conversation)["code"]
        with open(save_file, "w") as f:
            f.write(code)
        numbered = code.splitlines(keepends=True)
        line_count = len(numbered)
        # Return the whole saved file as a numbered listing.
        return view_lines(numbered, 0, line_count, save_file, line_count)
    except Exception as e:
        # Failures are reported back to the caller as text rather than raised.
        return str(e)
78
+
79
+
80
def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str:
    """Edits python code to solve a vision based task.

    Parameters:
        code_file (str): The file path to the code.
        chat_history (List[str]): The chat history used to generate the code.
        media (List[str]): The media files to use.

    Returns:
        str: The edited code.

    Examples
    --------
        >>> edit_vision_code(
        >>>     "code.py",
        >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
        >>>     ["dog.jpg"],
        >>> )
        from vision_agent.tools import load_image, owl_v2
        def detect_dogs(image_path: str):
            image = load_image(image_path)
            dogs = owl_v2("dog", image, threshold=0.8)
            return dogs
    """

    agent = va.agent.VisionAgentCoder()
    with open(code_file, "r") as f:
        code = f.read()

    # Rebuild the conversation as user turns, inserting the current code as an
    # assistant turn immediately before the latest user message. The media is
    # attached to the first user message only.
    # NOTE(review): with a single-entry chat_history the first branch handles
    # the only message, so the current code is never added to the conversation.
    last_idx = len(chat_history) - 1
    fixed_chat_history: List[Message] = []
    for i, chat in enumerate(chat_history):
        if i == 0:
            fixed_chat_history.append({"role": "user", "content": chat, "media": media})
        elif i < last_idx:
            fixed_chat_history.append({"role": "user", "content": chat})
        else:
            fixed_chat_history.append({"role": "assistant", "content": code})
            fixed_chat_history.append({"role": "user", "content": chat})

    try:
        response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
        code = response["code"]
        with open(code_file, "w") as f:
            f.write(code)
        code_lines = code.splitlines(keepends=True)
        total_lines = len(code_lines)
        return view_lines(code_lines, 0, total_lines, code_file, total_lines)
    except Exception as e:
        # Failures are reported back to the caller as text rather than raised.
        return str(e)
129
+
130
+
131
def format_lines(lines: List[str], start_idx: int) -> str:
    """Prefix every line with its index (counting up from `start_idx`) and `|`,
    then concatenate the results."""
    return "".join(
        f"{number}|{text}" for number, text in enumerate(lines, start_idx)
    )
136
+
137
+
138
def view_lines(
    lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int
) -> str:
    """Render the lines within `window_size` of `line_num` as a numbered listing,
    framed by a file header and an end-of-file / remaining-lines footer."""
    first = max(0, line_num - window_size)
    last = min(len(lines), line_num + window_size)

    remaining = len(lines) - last
    if remaining == 0:
        footer = "[End of file]"
    else:
        footer = f"[{remaining} more lines]"

    header = f"[File: {file_path} ({total_lines} lines total)]\n"
    return header + format_lines(lines[first:last], first) + footer
148
+
149
+
150
def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str:
    """Opens the file at the given path in the editor. If `line_num` is provided,
    the window will be moved to include that line. It only shows the first 100 lines by
    default! Max `window_size` supported is 2000. use `scroll up/down` to view the file
    if you want to see more.

    Parameters:
        file_path (str): The file path to open, preferred absolute path.
        line_num (int): The line number to move the window to.
        window_size (int): The total number of lines to show, centered on `line_num`.
    """
    global CURRENT_LINE, CURRENT_FILE

    file_path_p = Path(file_path)
    if not file_path_p.exists():
        return f"[File {file_path} does not exist]"

    # Read the file once inside a context manager so the handle is always
    # closed; the line count comes from the same read.
    with open(file_path_p, "r") as f:
        lines = f.readlines()
    total_lines = len(lines)

    # view_lines shows `half_window` lines on each side of `line_num`.
    half_window = min(window_size, 2000) // 2
    if line_num - half_window < 0:
        # Too close to the top: pin the window to the start of the file.
        line_num = half_window
    elif line_num >= total_lines:
        # Past the end: pin the window so that it finishes on the last line,
        # without letting the centre fall above the first full window.
        line_num = max(half_window, total_lines - half_window)

    CURRENT_LINE = line_num
    CURRENT_FILE = file_path

    return view_lines(lines, line_num, half_window, file_path, total_lines)
182
+
183
+
184
def create_file(file_path: str) -> str:
    """Creates and opens a new file with the given name.

    Parameters:
        file_path (str): The file path to create, preferred absolute path.
    """
    global CURRENT_FILE

    new_file = Path(file_path)
    if new_file.exists():
        # An existing path is reported, never overwritten.
        return f"[File {file_path} already exists]"

    new_file.touch()
    # The freshly created file becomes the editor's current file.
    CURRENT_FILE = file_path
    return f"[File created {file_path}]"
198
+
199
+
200
def scroll_up() -> str:
    """Moves the window up by 100 lines."""
    if CURRENT_FILE is None:
        return "[No file is open]"

    # "Up" means towards the start of the file, i.e. smaller line numbers;
    # open_file clamps the request at the top of the file.
    return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE)


def scroll_down() -> str:
    """Moves the window down by 100 lines."""
    if CURRENT_FILE is None:
        return "[No file is open]"

    # "Down" means towards the end of the file, i.e. larger line numbers;
    # open_file clamps the request at the bottom of the file.
    return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE)
214
+
215
+
216
def search_dir(search_term: str, dir_path: str) -> str:
    """Searches for search_term in all files in a directory.

    Parameters:
        search_term (str): The search term to look for.
        dir_path (str): The directory path to search in, preferred absolute path.
    """

    dir_path_p = Path(dir_path)
    if not dir_path_p.exists():
        return f"[Directory {dir_path} does not exist]"

    # Above this many hits the listing is withheld and the caller is asked to
    # narrow the search.
    max_matches = 100

    matches = []
    for file in dir_path_p.glob("**/*"):
        if not filter_file(file):
            continue
        # errors="replace" keeps one undecodable file from aborting the whole
        # directory search with a UnicodeDecodeError.
        with open(file, "r", errors="replace") as f:
            for i, line in enumerate(f):
                if search_term in line:
                    matches.append(f"{file}:{i}|{line.strip()}\n")

    if not matches:
        return f"[No matches found for {search_term} in {dir_path}]"
    if len(matches) > max_matches:
        # Report the limit that was exceeded, not the raw count.
        return f"[More than {max_matches} matches found for {search_term} in {dir_path}. Please narrow your search]"

    header = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n"
    footer = f"[End of matches for {search_term} in {dir_path}]"
    return header + "".join(matches) + footer
247
+
248
+
249
def search_file(search_term: str, file_path: str) -> str:
    """Searches the file for the given search term.

    Parameters:
        search_term (str): The search term to look for.
        file_path (str): The file path to search in, preferred absolute path.
    """

    target = Path(file_path)
    if not target.exists():
        return f"[File {file_path} does not exist]"

    # One "<index>|<stripped line>" entry per line containing the term.
    with open(target, "r") as f:
        hits = [
            f"{idx}|{text.strip()}\n"
            for idx, text in enumerate(f.readlines())
            if search_term in text
        ]

    if not hits:
        return f"[No matches found for {search_term} in {file_path}]"

    return "".join(
        [
            f"[Found {len(hits)} matches for {search_term} in {file_path}]\n",
            *hits,
            f"[End of matches for {search_term} in {file_path}]",
        ]
    )
280
+
281
+
282
def find_file(file_name: str, dir_path: str = "./") -> str:
    """Finds all files with the given name in the specified directory.

    Parameters:
        file_name (str): The file name to look for.
        dir_path (str): The directory path to search in, preferred absolute path.
    """

    root = Path(dir_path)
    if not root.exists():
        return f"[Directory {dir_path} does not exist]"

    # Recursive glob on any name containing `file_name`, narrowed by filter_file.
    found = [path for path in root.glob(f"**/*{file_name}*") if filter_file(path)]
    if not found:
        return f"[No files found in {dir_path} with name {file_name}]"

    header = f"[Found {len(found)} matches for {file_name} in {dir_path}]\n"
    listing = "".join(f"{path}\n" for path in found)
    footer = f"[End of matches for {file_name} in {dir_path}]"
    return header + listing + footer
305
+
306
+
307
def edit_file(file_path: str, start: int, end: int, content: str) -> str:
    """Edits the file at the given path with the provided content. The lines from
    `start` up to, but not including, `end` are replaced with the content. If the
    `start` and `end` are the same, the single line at `start` is replaced, or the
    content is appended when `start` equals the total number of lines. If the `start`
    or `end` are negative, `start` is greater than `end`, or `end` is greater than the
    total number of lines in the file, the function will return an error message.
    The edit is checked with flake8 first and is only saved when no errors are reported.

    Parameters:
        file_path (str): The file path to edit, preferred absolute path.
        start (int): The line number to start the edit.
        end (int): The line number to end the edit.
        content (str): The content to insert.
    """
    # Function-scope import: only this tool needs a scratch file.
    import tempfile

    file_path_p = Path(file_path)
    if not file_path_p.exists():
        return f"[File {file_path} does not exist]"

    # Read once inside a context manager; the line count comes from the same read.
    with open(file_path_p, "r") as f:
        lines = f.readlines()
    total_lines = len(lines)

    if start < 0 or end < 0 or start > end or end > total_lines:
        return "[Invalid line range]"
    if start == end:
        end += 1

    # Every inserted line must be newline-terminated so it cannot merge with
    # the line that follows it.
    new_content_lines = [
        line if line.endswith("\n") else line + "\n"
        for line in content.splitlines(keepends=True)
    ]
    edited_lines = lines[:start] + new_content_lines + lines[end:]

    # Centre of the inserted content, used to position the preview window.
    cur_line = start + len(content.split("\n")) // 2

    # Lint a uniquely named scratch copy so no existing sibling file can be
    # overwritten, and always remove it, even if flake8 cannot be started.
    fd, tmp_name = tempfile.mkstemp(suffix=".py", dir=file_path_p.parent)
    tmp_file = Path(tmp_name).resolve()
    try:
        with os.fdopen(fd, "w") as f:
            f.writelines(edited_lines)
        result = subprocess.run(
            [
                "flake8",
                "--isolated",
                "--select=F821,F822,F831,E111,E112,E113,E999,E902",
                str(tmp_file),
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    finally:
        tmp_file.unlink()

    stdout = result.stdout
    if stdout != "":
        # Swap the whole scratch path for the real one so the reported
        # locations point at the file being edited.
        stdout = stdout.replace(str(tmp_file), file_path)
        error_msg = "[Edit failed with the following status]\n" + stdout
        original_view = view_lines(
            lines,
            start + ((end - start) // 2),
            DEFAULT_WINDOW_SIZE,
            file_path,
            total_lines,
        )
        edited_view = view_lines(
            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, len(edited_lines)
        )

        error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
        return error_msg

    with open(file_path_p, "w") as f:
        f.writelines(edited_lines)

    return open_file(file_path, cur_line)
379
+
380
+
381
def get_tool_descriptions() -> str:
    """Returns a description of all the tools that `generate_vision_code` has access to.
    Helpful for answering questions about what types of vision tasks you can do with
    `generate_vision_code`."""
    return TOOL_DESCRIPTIONS
386
+
387
+
388
# Signature and docstring of every meta tool, concatenated into one string by
# get_tool_documentation.
META_TOOL_DOCSTRING = get_tool_documentation(
    [
        get_tool_descriptions,
        generate_vision_code,
        edit_vision_code,
        open_file,
        create_file,
        scroll_up,
        scroll_down,
        edit_file,
        search_dir,
        search_file,
        find_file,
    ]
)
@@ -1,7 +1,9 @@
1
+ import inspect
1
2
  import logging
2
3
  import os
3
- from typing import Any, Dict, MutableMapping, Optional
4
+ from typing import Any, Callable, Dict, List, MutableMapping, Optional
4
5
 
6
+ import pandas as pd
5
7
  from IPython.display import display
6
8
  from pydantic import BaseModel
7
9
  from requests import Session
@@ -14,7 +16,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
14
16
 
15
17
  _LOGGER = logging.getLogger(__name__)
16
18
  _LND_API_KEY = LandingaiAPIKey().api_key
17
- _LND_API_URL = "https://api.staging.landing.ai/v1/agent"
19
+ _LND_API_URL = "https://api.landing.ai/v1/agent"
18
20
 
19
21
 
20
22
  class ToolCallTrace(BaseModel):
@@ -93,3 +95,47 @@ def _create_requests_session(
93
95
  session.mount(url, HTTPAdapter(max_retries=retries if num_retry > 0 else 0))
94
96
  session.headers.update(headers)
95
97
  return session
98
+
99
+
100
def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
    """Concatenate `name(signature):`, the raw docstring and a blank line for
    every function in `funcs`."""
    return "".join(
        f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
        for func in funcs
    )
106
+
107
+
108
def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
    """Build a bullet list with one `- name(signature): summary` line per
    function, where the summary is the docstring text that precedes
    `Parameters:` collapsed onto a single line."""
    bullets = []
    for func in funcs:
        # partition leaves the text untouched when there is no "Parameters:"
        # marker; a missing docstring is treated as empty.
        summary = (func.__doc__ or "").partition("Parameters:")[0]
        one_line = " ".join(summary.split())
        bullets.append(f"- {func.__name__}{inspect.signature(func)}: {one_line}\n")
    return "".join(bullets)
125
+
126
+
127
def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
    """Build a DataFrame with one row per function.

    Parameters:
        funcs (List[Callable[..., Any]]): The functions to describe.

    Returns:
        pd.DataFrame: Column `desc` holds the docstring text preceding
            `Parameters:` collapsed onto one line; column `doc` holds
            `name(signature):` followed by the raw docstring.
    """
    data: Dict[str, List[str]] = {"desc": [], "doc": []}

    for func in funcs:
        # partition keeps the whole text when "Parameters:" is absent, whereas
        # slicing at find()'s -1 would cut off the final character.
        summary = (func.__doc__ or "").partition("Parameters:")[0]
        desc = " ".join(summary.split())

        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
        data["desc"].append(desc)
        data["doc"].append(doc)

    return pd.DataFrame(data)  # type: ignore
@@ -1,22 +1,25 @@
1
- import inspect
2
1
  import io
3
2
  import json
4
3
  import logging
5
4
  import tempfile
6
5
  from importlib import resources
7
6
  from pathlib import Path
8
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
9
8
 
10
9
  import cv2
11
10
  import numpy as np
12
- import pandas as pd
13
11
  import requests
14
12
  from moviepy.editor import ImageSequenceClip
15
13
  from PIL import Image, ImageDraw, ImageFont
16
14
  from pillow_heif import register_heif_opener # type: ignore
17
15
  from pytube import YouTube # type: ignore
18
16
 
19
- from vision_agent.tools.tool_utils import send_inference_request
17
+ from vision_agent.tools.tool_utils import (
18
+ get_tool_descriptions,
19
+ get_tool_documentation,
20
+ get_tools_df,
21
+ send_inference_request,
22
+ )
20
23
  from vision_agent.utils import extract_frames_from_video
21
24
  from vision_agent.utils.execute import FileSerializer, MimeType
22
25
  from vision_agent.utils.image_utils import (
@@ -54,7 +57,6 @@ COLORS = [
54
57
  ]
55
58
  _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
56
59
  _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
57
- logging.basicConfig(level=logging.INFO)
58
60
  _LOGGER = logging.getLogger(__name__)
59
61
 
60
62
 
@@ -1220,50 +1222,6 @@ def overlay_heat_map(
1220
1222
  return np.array(combined)
1221
1223
 
1222
1224
 
1223
- def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
1224
- docstrings = ""
1225
- for func in funcs:
1226
- docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
1227
-
1228
- return docstrings
1229
-
1230
-
1231
- def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
1232
- descriptions = ""
1233
- for func in funcs:
1234
- description = func.__doc__
1235
- if description is None:
1236
- description = ""
1237
-
1238
- if "Parameters:" in description:
1239
- description = (
1240
- description[: description.find("Parameters:")]
1241
- .replace("\n", " ")
1242
- .strip()
1243
- )
1244
-
1245
- description = " ".join(description.split())
1246
- descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
1247
- return descriptions
1248
-
1249
-
1250
- def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
1251
- data: Dict[str, List[str]] = {"desc": [], "doc": []}
1252
-
1253
- for func in funcs:
1254
- desc = func.__doc__
1255
- if desc is None:
1256
- desc = ""
1257
- desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
1258
- desc = " ".join(desc.split())
1259
-
1260
- doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
1261
- data["desc"].append(desc)
1262
- data["doc"].append(doc)
1263
-
1264
- return pd.DataFrame(data) # type: ignore
1265
-
1266
-
1267
1225
  TOOLS = [
1268
1226
  owl_v2,
1269
1227
  grounding_sam,