vision-agent 0.2.199 (py3-none-any.whl) → 0.2.201 (py3-none-any.whl)

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,4 @@
1
- from .agent import Agent
1
+ from .agent import Agent, AgentCoder, AgentPlanner
2
2
  from .vision_agent import VisionAgent
3
3
  from .vision_agent_coder import (
4
4
  AnthropicVisionAgentCoder,
@@ -17,3 +17,4 @@ from .vision_agent_planner import (
17
17
  VisionAgentPlanner,
18
18
  )
19
19
  from .vision_agent_planner_v2 import VisionAgentPlannerV2
20
+ from .vision_agent_v2 import VisionAgentV2
@@ -2,7 +2,9 @@ from abc import ABC, abstractmethod
2
2
  from pathlib import Path
3
3
  from typing import Any, Dict, List, Optional, Union
4
4
 
5
+ from vision_agent.agent.types import AgentMessage, CodeContext, PlanContext
5
6
  from vision_agent.lmm.types import Message
7
+ from vision_agent.utils.execute import CodeInterpreter
6
8
 
7
9
 
8
10
  class Agent(ABC):
@@ -20,3 +22,34 @@ class Agent(ABC):
20
22
  This is a hook that is intended for reporting the progress of the agent.
21
23
  """
22
24
  pass
25
+
26
+
27
+ class AgentCoder(Agent):
28
+ @abstractmethod
29
+ def generate_code(
30
+ self,
31
+ chat: List[AgentMessage],
32
+ max_steps: Optional[int] = None,
33
+ code_interpreter: Optional[CodeInterpreter] = None,
34
+ ) -> CodeContext:
35
+ pass
36
+
37
+ @abstractmethod
38
+ def generate_code_from_plan(
39
+ self,
40
+ chat: List[AgentMessage],
41
+ plan_context: PlanContext,
42
+ code_interpreter: Optional[CodeInterpreter] = None,
43
+ ) -> CodeContext:
44
+ pass
45
+
46
+
47
+ class AgentPlanner(Agent):
48
+ @abstractmethod
49
+ def generate_plan(
50
+ self,
51
+ chat: List[AgentMessage],
52
+ max_steps: Optional[int] = None,
53
+ code_interpreter: Optional[CodeInterpreter] = None,
54
+ ) -> PlanContext:
55
+ pass
@@ -4,16 +4,17 @@ import logging
4
4
  import re
5
5
  import sys
6
6
  import tempfile
7
- from typing import Any, Dict, List, Optional, Tuple, cast
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple, Union, cast
8
9
 
9
10
  import libcst as cst
10
- from pydantic import BaseModel
11
11
  from rich.console import Console
12
12
  from rich.style import Style
13
13
  from rich.syntax import Syntax
14
14
  from rich.table import Table
15
15
 
16
16
  import vision_agent.tools as T
17
+ from vision_agent.agent.types import AgentMessage, PlanContext
17
18
  from vision_agent.lmm.types import Message
18
19
  from vision_agent.utils.execute import CodeInterpreter, Execution
19
20
  from vision_agent.utils.image_utils import b64_to_pil, convert_to_b64
@@ -24,19 +25,6 @@ _CONSOLE = Console()
24
25
  _MAX_TABULATE_COL_WIDTH = 80
25
26
 
26
27
 
27
- class PlanContext(BaseModel):
28
- plan: str
29
- instructions: List[str]
30
- code: str
31
-
32
-
33
- class CodeContext(BaseModel):
34
- code: str
35
- test: str
36
- success: bool
37
- test_result: Execution
38
-
39
-
40
28
  def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
41
29
  json_pattern = r"\{.*\}"
42
30
  match = re.search(json_pattern, json_str, re.DOTALL)
@@ -228,15 +216,15 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:
228
216
 
229
217
 
230
218
  def add_media_to_chat(
231
- chat: List[Message], code_interpreter: CodeInterpreter
232
- ) -> Tuple[List[Message], List[Message], List[str]]:
219
+ chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
220
+ ) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
233
221
  orig_chat = copy.deepcopy(chat)
234
222
  int_chat = copy.deepcopy(chat)
235
- media_list = []
223
+ media_list: List[Union[str, Path]] = []
236
224
  for chat_i in int_chat:
237
- if "media" in chat_i:
238
- media_list_i = []
239
- for media in chat_i["media"]:
225
+ if chat_i.media is not None:
226
+ media_list_i: List[Union[str, Path]] = []
227
+ for media in chat_i.media:
240
228
  if isinstance(media, str) and media.startswith("data:image/"):
241
229
  media_pil = b64_to_pil(media)
242
230
  with tempfile.NamedTemporaryFile(
@@ -244,25 +232,29 @@ def add_media_to_chat(
244
232
  ) as temp_file:
245
233
  media_pil.save(temp_file, format="PNG")
246
234
  media = str(temp_file.name)
247
- media = str(code_interpreter.upload_file(media)) # type: ignore
235
+ if code_interpreter is not None:
236
+ media = str(code_interpreter.upload_file(media))
248
237
  media_list_i.append(media)
249
- # don't duplicate appending media name
250
- if not str(chat_i["content"]).endswith(f" Media name {media}"):
251
- chat_i["content"] += f" Media name {media}" # type: ignore
252
- chat_i["media"] = media_list_i
238
+ # don't duplicate appending media name and only add them for user messages
239
+ if (
240
+ not str(chat_i.content).endswith(f" Media name {media}")
241
+ and chat_i.role == "user"
242
+ ):
243
+ chat_i.content += f" Media name {media}"
244
+ chat_i.media = media_list_i if len(media_list_i) > 0 else None
253
245
  media_list.extend(media_list_i)
254
246
 
255
247
  int_chat = cast(
256
- List[Message],
248
+ List[AgentMessage],
257
249
  [
258
250
  (
259
- {
260
- "role": c["role"],
261
- "content": c["content"],
262
- "media": c["media"],
263
- }
264
- if "media" in c
265
- else {"role": c["role"], "content": c["content"]}
251
+ AgentMessage(
252
+ role=c.role,
253
+ content=c.content,
254
+ media=c.media,
255
+ )
256
+ if c.media is not None
257
+ else AgentMessage(role=c.role, content=c.content, media=None)
266
258
  )
267
259
  for c in int_chat
268
260
  ],
@@ -283,6 +275,27 @@ def capture_media_from_exec(execution: Execution) -> List[str]:
283
275
  return images
284
276
 
285
277
 
278
+ def convert_message_to_agentmessage(
279
+ input: Union[str, List[Message]],
280
+ media: Optional[Union[str, Path]] = None,
281
+ ) -> List[AgentMessage]:
282
+ if isinstance(input, str):
283
+ input_msg = [
284
+ AgentMessage(
285
+ role="user",
286
+ content=input,
287
+ media=([media] if media is not None else None),
288
+ )
289
+ ]
290
+ else:
291
+ input_msg = [
292
+ AgentMessage(role=msg["role"], content=msg["content"], media=None)
293
+ for msg in input
294
+ ]
295
+ input_msg[0].media = [media] if media is not None else None
296
+ return input_msg
297
+
298
+
286
299
  def strip_function_calls( # noqa: C901
287
300
  code: str, exclusions: Optional[List[str]] = None
288
301
  ) -> str:
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+ from typing import List, Literal, Optional, Union
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from vision_agent.utils.execute import Execution
7
+
8
+
9
+ class AgentMessage(BaseModel):
10
+ """AgentMessage encompasses messages sent to the entire Agentic system, which includes
11
+ both LMMs and sub-agents.
12
+
13
+ user: The user's message.
14
+ assistant: The assistant's message.
15
+ observation: An observation made after conducting an action, either by the user or
16
+ assistant.
17
+ interaction: An interaction between the user and the assistant. For example if the
18
+ assistant wants to ask the user for help on a task, it could send an
19
+ interaction message.
20
+ conversation: Messages coming from the conversation agent, this is a type of
21
+ assistant messages.
22
+ planner: Messages coming from the planner agent, this is a type of assistant
23
+ messages.
24
+ coder: Messages coming from the coder agent, this is a type of assistant messages.
25
+
26
+ """
27
+
28
+ role: Union[
29
+ Literal["user"],
30
+ Literal["assistant"], # planner, coder and conversation are of type assistant
31
+ Literal["observation"],
32
+ Literal["interaction"],
33
+ Literal["conversation"],
34
+ Literal["planner"],
35
+ Literal["coder"],
36
+ ]
37
+ content: str
38
+ media: Optional[List[Union[str, Path]]] = None
39
+
40
+
41
+ class PlanContext(BaseModel):
42
+ plan: str
43
+ instructions: List[str]
44
+ code: str
45
+
46
+
47
+ class CodeContext(BaseModel):
48
+ code: str
49
+ test: str
50
+ success: bool
51
+ test_result: Execution
@@ -36,14 +36,10 @@ class BoilerplateCode:
36
36
  pre_code = [
37
37
  "from typing import *",
38
38
  "from vision_agent.utils.execute import CodeInterpreter",
39
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
40
- "artifacts = Artifacts('{remote_path}', '{remote_path}')",
41
- "artifacts.load('{remote_path}')",
42
- ]
43
- post_code = [
44
- "capture_files_into_artifacts(artifacts)",
45
- "artifacts.save()",
39
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts",
40
+ "artifacts = Artifacts('{cwd}')",
46
41
  ]
42
+ post_code: List[str] = []
47
43
 
48
44
  @staticmethod
49
45
  def add_boilerplate(code: str, **format: Any) -> str:
@@ -149,9 +145,7 @@ def execute_code_action(
149
145
  code_interpreter: CodeInterpreter,
150
146
  ) -> Tuple[Execution, str]:
151
147
  result = code_interpreter.exec_isolation(
152
- BoilerplateCode.add_boilerplate(
153
- code, remote_path=str(artifacts.remote_save_path)
154
- )
148
+ BoilerplateCode.add_boilerplate(code, cwd=str(artifacts.cwd))
155
149
  )
156
150
 
157
151
  obs = str(result.logs)
@@ -212,19 +206,6 @@ def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
212
206
  return response
213
207
 
214
208
 
215
- def setup_artifacts() -> Artifacts:
216
- # this is setting remote artifacts path
217
- sandbox = os.environ.get("CODE_SANDBOX_RUNTIME", None)
218
- if sandbox is None or sandbox == "local":
219
- remote = WORKSPACE / "artifacts.pkl"
220
- elif sandbox == "e2b":
221
- remote = Path("/home/user/artifacts.pkl")
222
- else:
223
- raise ValueError(f"Unknown code sandbox runtime {sandbox}")
224
- artifacts = Artifacts(remote, Path(os.getcwd()) / "artifacts.pkl")
225
- return artifacts
226
-
227
-
228
209
  def new_format_to_old_format(new_format: Dict[str, Any]) -> Dict[str, Any]:
229
210
  thoughts = new_format["thinking"] if new_format["thinking"] is not None else ""
230
211
  response = new_format["response"] if new_format["response"] is not None else ""
@@ -297,9 +278,10 @@ class VisionAgent(Agent):
297
278
  def __init__(
298
279
  self,
299
280
  agent: Optional[LMM] = None,
281
+ cwd: Optional[Union[Path, str]] = None,
300
282
  verbosity: int = 0,
301
283
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
302
- code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
284
+ code_sandbox_runtime: Optional[str] = None,
303
285
  ) -> None:
304
286
  """Initialize the VisionAgent.
305
287
 
@@ -317,9 +299,10 @@ class VisionAgent(Agent):
317
299
 
318
300
  self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
319
301
  self.max_iterations = 12
302
+ self.cwd = Path(cwd) if cwd is not None else Path.cwd()
320
303
  self.verbosity = verbosity
321
- self.code_interpreter = code_interpreter
322
304
  self.callback_message = callback_message
305
+ self.code_sandbox_runtime = code_sandbox_runtime
323
306
  if self.verbosity >= 1:
324
307
  _LOGGER.setLevel(logging.INFO)
325
308
 
@@ -397,40 +380,21 @@ class VisionAgent(Agent):
397
380
  raise ValueError("chat cannot be empty")
398
381
 
399
382
  if not artifacts:
400
- artifacts = setup_artifacts()
401
-
402
- # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
403
- code_interpreter = (
404
- self.code_interpreter
405
- if self.code_interpreter is not None
406
- and not isinstance(self.code_interpreter, str)
407
- else CodeInterpreterFactory.new_instance(
408
- code_sandbox_runtime=self.code_interpreter,
409
- remote_path=artifacts.remote_save_path.parent,
410
- )
411
- )
383
+ artifacts = Artifacts(self.cwd)
412
384
 
413
- if code_interpreter.remote_path != artifacts.remote_save_path.parent:
414
- raise ValueError(
415
- f"Code interpreter remote path {code_interpreter.remote_path} does not match artifacts remote path {artifacts.remote_save_path.parent}"
416
- )
417
-
418
- with code_interpreter:
385
+ with CodeInterpreterFactory.new_instance(
386
+ code_sandbox_runtime=self.code_sandbox_runtime,
387
+ remote_path=self.cwd,
388
+ ) as code_interpreter:
419
389
  orig_chat = copy.deepcopy(chat)
420
390
  int_chat = copy.deepcopy(chat)
421
391
  last_user_message = chat[-1]
422
- media_list = []
423
392
  for chat_i in int_chat:
424
393
  if "media" in chat_i:
425
394
  for media in chat_i["media"]:
426
395
  media = cast(str, media)
427
- artifacts.artifacts[Path(media).name] = open(media, "rb").read()
428
-
429
- media_remote_path = (
430
- Path(artifacts.remote_save_path.parent) / Path(media).name
431
- )
396
+ media_remote_path = Path(artifacts.cwd) / Path(media).name
432
397
  chat_i["content"] += f" Media name {media_remote_path}" # type: ignore
433
- media_list.append(media_remote_path)
434
398
 
435
399
  int_chat = cast(
436
400
  List[Message],
@@ -452,15 +416,10 @@ class VisionAgent(Agent):
452
416
  iterations = 0
453
417
  last_response = None
454
418
 
455
- # Save the current state of artifacts, will include any images the user
456
- # passed in.
457
- artifacts.save()
458
-
459
419
  # Upload artifacts to remote location and show where they are going
460
420
  # to be loaded to. The actual loading happens in BoilerplateCode as
461
421
  # part of the pre_code.
462
- code_interpreter.upload_file(artifacts.local_save_path)
463
- artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
422
+ artifacts_loaded = artifacts.show()
464
423
  int_chat.append({"role": "observation", "content": artifacts_loaded})
465
424
  orig_chat.append({"role": "observation", "content": artifacts_loaded})
466
425
  self.streaming_message({"role": "observation", "content": artifacts_loaded})
@@ -487,10 +446,6 @@ class VisionAgent(Agent):
487
446
  )
488
447
 
489
448
  while not finished and iterations < self.max_iterations:
490
- # ensure we upload the artifacts before each turn, so any local
491
- # modifications we made to it will be reflected in the remote
492
- code_interpreter.upload_file(artifacts.local_save_path)
493
-
494
449
  response = run_conversation(self.agent, int_chat)
495
450
  if self.verbosity >= 1:
496
451
  _LOGGER.info(response)
@@ -555,11 +510,8 @@ class VisionAgent(Agent):
555
510
  obs_chat_elt: Message = {"role": "observation", "content": obs}
556
511
  media_obs = check_and_load_image(code_action)
557
512
  if media_obs and result.success:
558
- # media paths will be under the local_save_path when we download
559
- # them after each turn
560
513
  obs_chat_elt["media"] = [
561
- artifacts.local_save_path.parent / media_ob
562
- for media_ob in media_obs
514
+ artifacts.cwd / media_ob for media_ob in media_obs
563
515
  ]
564
516
 
565
517
  if self.verbosity >= 1:
@@ -581,15 +533,6 @@ class VisionAgent(Agent):
581
533
  iterations += 1
582
534
  last_response = response
583
535
 
584
- # after each turn, download the artifacts locally
585
- code_interpreter.download_file(
586
- str(artifacts.remote_save_path.name),
587
- str(artifacts.local_save_path),
588
- )
589
- artifacts.load(
590
- artifacts.local_save_path, artifacts.local_save_path.parent
591
- )
592
-
593
536
  return orig_chat, artifacts
594
537
 
595
538
  def streaming_message(self, message: Dict[str, Any]) -> None:
@@ -604,9 +547,9 @@ class OpenAIVisionAgent(VisionAgent):
604
547
  def __init__(
605
548
  self,
606
549
  agent: Optional[LMM] = None,
550
+ cwd: Optional[Union[Path, str]] = None,
607
551
  verbosity: int = 0,
608
552
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
609
- code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
610
553
  ) -> None:
611
554
  """Initialize the VisionAgent using OpenAI LMMs.
612
555
 
@@ -625,9 +568,9 @@ class OpenAIVisionAgent(VisionAgent):
625
568
  agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
626
569
  super().__init__(
627
570
  agent,
571
+ cwd,
628
572
  verbosity,
629
573
  callback_message,
630
- code_interpreter,
631
574
  )
632
575
 
633
576
 
@@ -635,9 +578,9 @@ class AnthropicVisionAgent(VisionAgent):
635
578
  def __init__(
636
579
  self,
637
580
  agent: Optional[LMM] = None,
581
+ cwd: Optional[Union[Path, str]] = None,
638
582
  verbosity: int = 0,
639
583
  callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
640
- code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
641
584
  ) -> None:
642
585
  """Initialize the VisionAgent using Anthropic LMMs.
643
586
 
@@ -656,7 +599,7 @@ class AnthropicVisionAgent(VisionAgent):
656
599
  agent = AnthropicLMM(temperature=0.0) if agent is None else agent
657
600
  super().__init__(
658
601
  agent,
602
+ cwd,
659
603
  verbosity,
660
604
  callback_message,
661
- code_interpreter,
662
605
  )
@@ -450,12 +450,6 @@ class VisionAgentCoder(Agent):
450
450
  for chat_i in chat:
451
451
  if "media" in chat_i:
452
452
  for media in chat_i["media"]:
453
- media = (
454
- media
455
- if type(media) is str
456
- and media.startswith(("http", "https"))
457
- else code_interpreter.upload_file(cast(str, media))
458
- )
459
453
  chat_i["content"] += f" Media name {media}" # type: ignore
460
454
  media_list.append(str(media))
461
455