vision-agent 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,605 +0,0 @@
1
- import copy
2
- import json
3
- import logging
4
- import os
5
- from pathlib import Path
6
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
7
-
8
- from vision_agent.agent import Agent
9
- from vision_agent.agent.vision_agent_prompts import (
10
- EXAMPLES_CODE1,
11
- EXAMPLES_CODE2,
12
- EXAMPLES_CODE3,
13
- EXAMPLES_CODE3_EXTRA2,
14
- VA_CODE,
15
- )
16
- from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
17
- from vision_agent.models import Message
18
- from vision_agent.tools.meta_tools import (
19
- META_TOOL_DOCSTRING,
20
- Artifacts,
21
- check_and_load_image,
22
- use_extra_vision_agent_args,
23
- )
24
- from vision_agent.utils import CodeInterpreterFactory
25
- from vision_agent.utils.agent import extract_json, extract_tag
26
- from vision_agent.utils.execute import CodeInterpreter, Execution
27
-
28
# Configure logging at INFO level and create the logger used throughout this module.
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)
# Workspace directory read from the WORKSPACE environment variable (empty string
# when unset). The directory is created at import time if it does not exist.
WORKSPACE = Path(os.getenv("WORKSPACE", ""))
WORKSPACE.mkdir(parents=True, exist_ok=True)
# Prepend the workspace to PYTHONPATH.
# NOTE(review): Path("") stringifies as ".", so this guard appears to be always
# true and "." is prepended even when WORKSPACE is unset -- confirm the intent.
if str(WORKSPACE) != "":
    os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
34
-
35
-
36
class BoilerplateCode:
    """Source lines wrapped around every snippet before it is executed.

    ``pre_code`` lines are emitted before the snippet and ``post_code`` lines after
    it. Every line is a ``str.format`` template that is filled with the keyword
    arguments passed to :meth:`add_boilerplate`.
    """

    pre_code = [
        "from typing import *",
        "from vision_agent.utils.execute import CodeInterpreter",
        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts",
        # ``!r`` renders the path as a correctly escaped Python string literal, so
        # paths containing backslashes (e.g. Windows paths) or quotes still produce
        # valid code. For ordinary paths the output is unchanged.
        "artifacts = Artifacts({cwd!r})",
    ]
    post_code: List[str] = []

    @staticmethod
    def add_boilerplate(code: str, **format: Any) -> str:
        """Run this method to prepend the default imports to the code.
        NOTE: be sure to run this method after the custom tools have been registered.

        Parameters:
            code (str): The snippet to wrap.
            **format (Any): Values substituted into the ``pre_code`` and
                ``post_code`` templates (the default templates use ``cwd``).

        Returns:
            str: The formatted ``pre_code`` lines, a blank line, ``code``, a blank
                line and the formatted ``post_code`` lines.
        """
        # Path-like values are converted to plain strings so that a ``!r``
        # conversion yields a string literal rather than e.g. ``PosixPath(...)``.
        values = {
            key: os.fspath(value) if isinstance(value, os.PathLike) else value
            for key, value in format.items()
        }
        header = "\n".join(
            [line.format(**values) for line in BoilerplateCode.pre_code]
        )
        footer = "\n".join(
            [line.format(**values) for line in BoilerplateCode.post_code]
        )
        return header + "\n\n" + code + "\n\n" + footer
57
-
58
-
59
def format_agent_message(agent_message: str) -> str:
    """Render an agent JSON message as the tagged text used in the prompt.

    The message is parsed with ``extract_json``. Each of the ``thinking``,
    ``response``, ``execute_python`` and ``let_user_respond`` fields that is
    present and truthy is wrapped in a tag of the same name, in that order.

    Parameters:
        agent_message (str): The agent message to parse.

    Returns:
        str: The concatenated tagged sections, or an empty string if none apply.
    """
    fields = extract_json(agent_message)

    def _field(key: str) -> Any:
        # Absent keys are treated the same as falsy values.
        return fields[key] if key in fields else None

    pieces: List[str] = []

    thinking = _field("thinking")
    if thinking:
        pieces.append("<thinking>" + thinking + "</thinking>")

    reply = _field("response")
    if reply:
        pieces.append("<response>" + reply + "</response>")

    code = _field("execute_python")
    if code:
        pieces.append("\n<execute_python>\n" + code + "\n</execute_python>\n")

    hand_over = _field("let_user_respond")
    if hand_over:
        # The flag may be a bool, so it is stringified before wrapping.
        pieces.append("<let_user_respond>" + str(hand_over) + "</let_user_respond>")

    return "".join(pieces)
83
-
84
-
85
- def _clean_response(response: str) -> str:
86
- # Sometimes the LLM will hallucinate responses to an <execute_python> tag as if it
87
- # had already executed the code. This function removes the hallucinated response.
88
- if "<execute_python>" in response:
89
- end_execute_python = response.find("</execute_python>")
90
- response = response[: end_execute_python + len("</execute_python>")]
91
- return response
92
-
93
-
94
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
    """Ask the orchestrator LMM for the next agent turn.

    The last 10 chat messages are rendered into the ``VA_CODE`` prompt, the prompt
    is sent to ``orch`` as a single user message, and the tagged reply is parsed.

    Parameters:
        orch (LMM): The model called with the rendered prompt.
        chat (List[Message]): The conversation so far. It is deep-copied first.

    Returns:
        Dict[str, Any]: The extracted ``thinking``, ``response`` and
            ``execute_python`` tags plus a boolean ``let_user_respond``.

    Raises:
        ValueError: If a rendered message has a role other than ``user``,
            ``observation`` or ``assistant``.
    """
    chat = copy.deepcopy(chat)

    # Render only the 10 most recent messages so the prompt stays under the
    # token limit.
    rendered_turns: List[str] = []
    for turn in chat[-10:]:
        role = turn["role"]
        if role == "user":
            rendered_turns.append(f"USER: {turn['content']}\n\n")
        elif role == "observation":
            rendered_turns.append(f"OBSERVATION:\n{turn['content']}\n\n")
        elif role == "assistant":
            rendered_turns.append(
                f"AGENT: {format_agent_message(turn['content'])}\n\n"  # type: ignore
            )
        else:
            raise ValueError(f"role {role} is not supported")

    prompt = VA_CODE.format(
        documentation=META_TOOL_DOCSTRING,
        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
        conversation="".join(rendered_turns),
    )
    message: Message = {"role": "user", "content": prompt}

    # Attach media only from the most recent observation so the model is not
    # overloaded with old images; paths that no longer exist are skipped.
    latest = chat[-1]
    if (
        latest["role"] == "observation"
        and "media" in latest
        and len(latest["media"]) > 0  # type: ignore
    ):
        existing_media = [m for m in latest["media"] if Path(m).exists()]  # type: ignore
        if len(existing_media) > 0:
            message["media"] = existing_media  # type: ignore

    raw_reply = cast(str, orch([message], stream=False))

    # Clean the reply first: when code is about to be executed, nothing written
    # after the code block should respond or end the conversation.
    reply = _clean_response(raw_reply)

    flag_text = extract_tag(reply, "let_user_respond")
    return {
        "thinking": extract_tag(reply, "thinking"),
        "response": extract_tag(reply, "response"),
        "execute_python": extract_tag(reply, "execute_python"),
        "let_user_respond": bool(flag_text) and "true" in flag_text.lower(),
    }
141
-
142
-
143
def execute_code_action(
    artifacts: Artifacts,
    code: str,
    code_interpreter: CodeInterpreter,
) -> Tuple[Execution, str]:
    """Run ``code`` wrapped in the boilerplate and build an observation string.

    Parameters:
        artifacts (Artifacts): Its ``cwd`` is substituted into the boilerplate.
        code (str): The code to execute.
        code_interpreter (CodeInterpreter): Executes the code via
            ``exec_isolation``.

    Returns:
        Tuple[Execution, str]: The execution result and an observation made of the
            stringified logs, followed by the error on a new line if there is one.
    """
    wrapped_code = BoilerplateCode.add_boilerplate(code, cwd=str(artifacts.cwd))
    execution = code_interpreter.exec_isolation(wrapped_code)

    observation = str(execution.logs)
    if execution.error:
        observation = f"{observation}\n{execution.error}"
    return execution, observation
156
-
157
-
158
def execute_user_code_action(
    artifacts: Artifacts,
    last_user_message: Message,
    code_interpreter: CodeInterpreter,
) -> Tuple[Optional[Execution], Optional[str]]:
    """Execute code that the user supplied directly in their last message.

    The message content is expected to be a JSON object with an
    ``execute_python`` key. Anything else is treated as "no user code".

    Parameters:
        artifacts (Artifacts): Forwarded to ``execute_code_action``.
        last_user_message (Message): The most recent chat message.
        code_interpreter (CodeInterpreter): Interpreter used to run the code.

    Returns:
        Tuple[Optional[Execution], Optional[str]]: The execution result and its
            observation, or ``(None, None)`` when the message is not from the
            user or carries no code to execute.
    """
    if last_user_message["role"] != "user":
        return None, None

    last_user_content = cast(str, last_user_message.get("content", ""))
    try:
        payload = json.loads(last_user_content)
    except (json.JSONDecodeError, TypeError):
        # Plain text, or content that is not a string at all.
        return None, None

    # Valid JSON that is not an object (e.g. the user typed "42" or "[1, 2]")
    # cannot carry an execute_python action.
    if not isinstance(payload, dict):
        return None, None

    user_code_action = payload.get("execute_python", None)
    if user_code_action is None:
        return None, None

    user_code_action = use_extra_vision_agent_args(user_code_action, False)
    # The main chat loop re-checks for None after this call, so do the same here.
    if user_code_action is None:
        return None, None

    # execute_code_action already appends the error to the observation, so it
    # must not be appended a second time here.
    return execute_code_action(artifacts, user_code_action, code_interpreter)
183
-
184
-
185
def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``response`` whose text describes the code being run.

    When ``execute_python`` is present and truthy, the ``response`` field of the
    copy is replaced by a short description of each known tool name found in the
    code, or by a generic message if none is found. Otherwise the copy is
    returned unchanged. The input dictionary is never modified.

    Parameters:
        response (Dict[str, Any]): The parsed agent response.

    Returns:
        Dict[str, Any]: A deep copy of ``response``, possibly with a new
            ``response`` value.
    """
    annotated = copy.deepcopy(response)

    code = annotated["execute_python"] if "execute_python" in annotated else None
    if not code:
        return annotated

    # Only these tools get their own description; any other code is reported
    # with the generic message below.
    tool_descriptions = {
        "open_code_artifact": "Reading file.",
        "create_code_artifact": "Creating file.",
        "edit_code_artifact": "Editing file.",
        "generate_vision_code": "Generating vision code.",
        "edit_vision_code": "Editing vision code.",
    }
    matched = [
        text + " " for tool_name, text in tool_descriptions.items() if tool_name in code
    ]
    annotated["response"] = "".join(matched) if matched else "Executing code."
    return annotated
208
-
209
-
210
def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a new-format agent response into the old message format.

    The old format has ``thoughts``, ``response`` and ``let_user_respond`` keys.
    Any code to execute is embedded in ``response`` inside ``<execute_python>``
    tags.

    Parameters:
        new_format (Dict[str, Any]): Must contain ``thinking``, ``response``,
            ``execute_python`` and ``let_user_respond``.

    Returns:
        Dict[str, Any]: The old-format dictionary, with ``None`` text fields
            replaced by empty strings.
    """
    thinking = new_format["thinking"]
    text = new_format["response"]
    code = new_format["execute_python"]

    thoughts = "" if thinking is None else thinking
    response = "" if text is None else text
    if code is not None:
        response = response + f"\n<execute_python>\n{code}\n</execute_python>"

    return {
        "thoughts": thoughts,
        "response": response,
        "let_user_respond": new_format["let_user_respond"],
    }
222
-
223
-
224
def old_format_to_new_format(old_format_str: str) -> str:
    """Convert an old-format JSON message into the new-format JSON message.

    The old format stores ``thoughts``, ``response`` (optionally containing an
    ``<execute_python>`` block) and ``let_user_respond``. The new format stores
    ``thinking``, ``response``, ``execute_python`` and ``let_user_respond``.

    Parameters:
        old_format_str (str): The message content.

    Returns:
        str: The new-format message serialized as JSON. The input is returned
            unchanged when it is not JSON or is JSON that is not an object
            (plain user text such as ``hello`` or ``42``).
    """
    try:
        old_format = json.loads(old_format_str)
    except json.JSONDecodeError:
        return old_format_str

    # Valid JSON that is not an object (a number, list, string, ...) is ordinary
    # text typed by the user, not an old-format message, so pass it through.
    if not isinstance(old_format, dict):
        return old_format_str

    # Blank, missing or non-string thoughts all map to "no thinking".
    thoughts = old_format.get("thoughts")
    thinking = (
        thoughts if isinstance(thoughts, str) and thoughts.strip() != "" else None
    )

    let_user_respond = old_format.get("let_user_respond", True)

    response = old_format.get("response")
    execute_python = None
    if isinstance(response, str) and "<execute_python>" in response:
        # Split the embedded code out of the response text.
        execute_python = extract_tag(response, "execute_python")
        if execute_python:
            response = response.replace(execute_python, "")
        response = (
            response.replace("<execute_python>", "")
            .replace("</execute_python>", "")
            .strip()
        )

    return json.dumps(
        {
            "thinking": thinking,
            "response": response,
            "execute_python": execute_python,
            "let_user_respond": let_user_respond,
        }
    )
262
-
263
-
264
class VisionAgent(Agent):
    """Vision Agent is an agent that can chat with the user and call tools or other
    agents to generate code for it. Vision Agent uses python code to execute actions
    for the user. Vision Agent is inspired by OpenDevin
    https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

    Example
    -------
        >>> from vision_agent.agent import VisionAgent
        >>> agent = VisionAgent()
        >>> chat = agent.chat([{"role": "user", "content": "Hello"}])
        >>> chat.append({"role": "user", "content": "Can you write a function that counts dogs?", "media": ["dog.jpg"]})
        >>> chat = agent.chat(chat)
    """

    def __init__(
        self,
        agent: Optional[LMM] = None,
        cwd: Optional[Union[Path, str]] = None,
        verbosity: int = 0,
        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
        code_sandbox_runtime: Optional[str] = None,
    ) -> None:
        """Initialize the VisionAgent.

        Parameters:
            agent (Optional[LMM]): The agent to use for conversation and orchestration
                of other agents. Defaults to ``AnthropicLMM(temperature=0.0)``.
            cwd (Optional[Union[Path, str]]): Directory used for the default
                artifacts and as the code interpreter's remote path. Defaults to
                the current working directory.
            verbosity (int): The verbosity level of the agent.
            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                function to send intermediate update messages.
            code_sandbox_runtime (Optional[str]): For string values it can be one of:
                None or "local". If None, it will read from the environment
                variable "CODE_SANDBOX_RUNTIME".
        """

        self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
        self.max_iterations = 12
        self.cwd = Path(cwd) if cwd is not None else Path.cwd()
        self.verbosity = verbosity
        self.callback_message = callback_message
        self.code_sandbox_runtime = code_sandbox_runtime
        if self.verbosity >= 1:
            _LOGGER.setLevel(logging.INFO)

    def __call__(
        self,
        input: Union[str, List[Message]],
        media: Optional[Union[str, Path]] = None,
        artifacts: Optional[Artifacts] = None,
    ) -> str:
        """Chat with VisionAgent and get the conversation response.

        Parameters:
            input (Union[str, List[Message]): A conversation in the format of
                [{"role": "user", "content": "describe your task here..."}, ...] or a
                string of just the contents.
            media (Optional[Union[str, Path]]): The media file to be used in the task.
                It is only attached when ``input`` is a string.
            artifacts (Optional[Artifacts]): The artifacts to use in the task.

        Returns:
            str: The content of the last message of the resulting conversation.
        """
        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
            if media is not None:
                input[0]["media"] = [media]
        results, _ = self.chat_with_artifacts(input, artifacts)
        return results[-1]["content"]  # type: ignore

    def chat(
        self,
        chat: List[Message],
    ) -> List[Message]:
        """Chat with VisionAgent, it will use code to execute actions to accomplish
        its tasks.

        Parameters:
            chat (List[Message]): A conversation in the format of:
                [{"role": "user", "content": "describe your task here..."}]
                or if it contains media files, it should be in the format of:
                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]

        Returns:
            List[Message]: The conversation response.
        """
        return self.chat_with_artifacts(chat)[0]

    def chat_with_artifacts(
        self,
        chat: List[Message],
        artifacts: Optional[Artifacts] = None,
        test_multi_plan: bool = True,
        custom_tool_names: Optional[List[str]] = None,
    ) -> Tuple[List[Message], Artifacts]:
        """Chat with VisionAgent, it will use code to execute actions to accomplish
        its tasks.

        Parameters:
            chat (List[Message]): A conversation in the format of:
                [{"role": "user", "content": "describe your task here..."}]
                or if it contains media files, it should be in the format of:
                [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
            artifacts (Optional[Artifacts]): The artifacts to use in the task.
            test_multi_plan (bool): If True, it will test tools for multiple plans and
                pick the best one based off of the tool results. If False, it will go
                with the first plan.
            custom_tool_names (List[str]): A list of customized tools for agent to
                pick and use. If not provided, default to full tool set from
                vision_agent.tools.

        Returns:
            Tuple[List[Message], Artifacts]: The conversation (the input chat plus
                every assistant and observation message produced) and the
                artifacts object that was used.

        Raises:
            ValueError: If ``chat`` is empty.
        """

        if not chat:
            raise ValueError("chat cannot be empty")

        # Compare against None explicitly so that a caller-supplied Artifacts
        # object is never replaced just because it evaluates as falsy.
        if artifacts is None:
            artifacts = Artifacts(self.cwd)

        with CodeInterpreterFactory.new_instance(
            code_sandbox_runtime=self.code_sandbox_runtime,
            remote_path=self.cwd,
        ) as code_interpreter:
            # orig_chat is what gets returned to the caller; int_chat is the
            # internal copy that is rendered into the LMM prompt.
            orig_chat = copy.deepcopy(chat)
            int_chat = copy.deepcopy(chat)
            last_user_message = chat[-1]
            for chat_i in int_chat:
                if "media" in chat_i:
                    for media in chat_i["media"]:
                        media = cast(str, media)
                        media_remote_path = Path(artifacts.cwd) / Path(media).name
                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore

            int_chat = cast(
                List[Message],
                [
                    (
                        {
                            "role": c["role"],
                            "content": old_format_to_new_format(c["content"]),  # type: ignore
                            "media": c["media"],
                        }
                        if "media" in c
                        else {"role": c["role"], "content": old_format_to_new_format(c["content"])}  # type: ignore
                    )
                    for c in int_chat
                ],
            )

            finished = False
            iterations = 0
            last_response = None

            # Upload artifacts to remote location and show where they are going
            # to be loaded to. The actual loading happens in BoilerplateCode as
            # part of the pre_code.
            artifacts_loaded = artifacts.show()
            int_chat.append({"role": "observation", "content": artifacts_loaded})
            orig_chat.append({"role": "observation", "content": artifacts_loaded})
            self.streaming_message({"role": "observation", "content": artifacts_loaded})

            user_result, user_obs = execute_user_code_action(
                artifacts,
                last_user_message,
                code_interpreter,
            )
            finished = user_result is not None and user_obs is not None
            if user_result is not None and user_obs is not None:
                # be sure to update the chat with user execution results
                chat_elt: Message = {"role": "observation", "content": user_obs}
                int_chat.append(chat_elt)
                # Attach the execution to a copy so that it ends up only in the
                # returned chat, not in the internal chat.
                orig_chat_elt = copy.copy(chat_elt)
                orig_chat_elt["execution"] = user_result
                orig_chat.append(orig_chat_elt)
                self.streaming_message(
                    {
                        "role": "observation",
                        "content": user_obs,
                        "execution": user_result,
                        "finished": finished,
                    }
                )

            while not finished and iterations < self.max_iterations:
                response = run_conversation(self.agent, int_chat)
                if self.verbosity >= 1:
                    _LOGGER.info(response)

                code_action = response.get("execute_python", None)
                # sometimes it gets stuck in a loop, so we force it to exit
                stuck = last_response == response
                if stuck:
                    response["let_user_respond"] = True

                # Convert and serialize once, before any callback sees the
                # dictionary, and reuse the result for both chats.
                old_format_response = new_format_to_old_format(
                    add_step_descriptions(response)
                )
                serialized_response = json.dumps(old_format_response)

                if stuck:
                    self.streaming_message(
                        {
                            "role": "assistant",
                            "content": "{}",
                            "error": {
                                "name": "Error when running conversation agent",
                                "value": "Agent is stuck in conversation loop, exited",
                                "traceback_raw": [],
                            },
                            "finished": True,
                        }
                    )
                else:
                    self.streaming_message(
                        {
                            "role": "assistant",
                            "content": old_format_response,
                            "finished": response.get("let_user_respond", False)
                            and code_action is None,
                        }
                    )

                int_chat.append({"role": "assistant", "content": serialized_response})
                orig_chat.append({"role": "assistant", "content": serialized_response})
                finished = response.get("let_user_respond", False)

                if code_action is not None:
                    code_action = use_extra_vision_agent_args(
                        code_action, test_multi_plan, custom_tool_names
                    )

                if code_action is not None:
                    result, obs = execute_code_action(
                        artifacts,
                        code_action,
                        code_interpreter,
                    )
                    obs_chat_elt: Message = {"role": "observation", "content": obs}
                    media_obs = check_and_load_image(code_action)
                    if media_obs and result.success:
                        obs_chat_elt["media"] = [
                            artifacts.cwd / media_ob for media_ob in media_obs
                        ]

                    if self.verbosity >= 1:
                        _LOGGER.info(obs)

                    # don't add execution results to internal chat: the execution
                    # is attached to a copy that only goes into the returned chat
                    int_chat.append(obs_chat_elt)
                    orig_obs_chat_elt = copy.copy(obs_chat_elt)
                    orig_obs_chat_elt["execution"] = result
                    orig_chat.append(orig_obs_chat_elt)
                    self.streaming_message(
                        {
                            "role": "observation",
                            "content": obs,
                            "execution": result,
                            "finished": finished,
                        }
                    )

                iterations += 1
                last_response = response

            return orig_chat, artifacts

    def streaming_message(self, message: Dict[str, Any]) -> None:
        """Forward ``message`` to ``callback_message`` if a callback was given."""
        if self.callback_message:
            self.callback_message(message)

    def log_progress(self, data: Dict[str, Any]) -> None:
        """Do nothing; progress logging is not implemented for this agent."""
        pass
544
-
545
-
546
class OpenAIVisionAgent(VisionAgent):
    """VisionAgent whose default conversation model is an OpenAI LMM."""

    def __init__(
        self,
        agent: Optional[LMM] = None,
        cwd: Optional[Union[Path, str]] = None,
        verbosity: int = 0,
        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> None:
        """Initialize the VisionAgent using OpenAI LMMs.

        Parameters:
            agent (Optional[LMM]): The agent to use for conversation and orchestration
                of other agents. Defaults to
                ``OpenAILMM(temperature=0.0, json_mode=True)``.
            cwd (Optional[Union[Path, str]]): Forwarded unchanged to
                ``VisionAgent.__init__``.
            verbosity (int): The verbosity level of the agent.
            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                function to send intermediate update messages.
        """

        if agent is None:
            agent = OpenAILMM(temperature=0.0, json_mode=True)
        super().__init__(
            agent=agent,
            cwd=cwd,
            verbosity=verbosity,
            callback_message=callback_message,
        )
575
-
576
-
577
class AnthropicVisionAgent(VisionAgent):
    """VisionAgent whose default conversation model is an Anthropic LMM."""

    def __init__(
        self,
        agent: Optional[LMM] = None,
        cwd: Optional[Union[Path, str]] = None,
        verbosity: int = 0,
        callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> None:
        """Initialize the VisionAgent using Anthropic LMMs.

        Parameters:
            agent (Optional[LMM]): The agent to use for conversation and orchestration
                of other agents. Defaults to ``AnthropicLMM(temperature=0.0)``.
            cwd (Optional[Union[Path, str]]): Forwarded unchanged to
                ``VisionAgent.__init__``.
            verbosity (int): The verbosity level of the agent.
            callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                function to send intermediate update messages.
        """

        if agent is None:
            agent = AnthropicLMM(temperature=0.0)
        super().__init__(
            agent=agent,
            cwd=cwd,
            verbosity=verbosity,
            callback_message=callback_message,
        )