vision-agent 0.2.40__tar.gz → 0.2.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {vision_agent-0.2.40 → vision_agent-0.2.42}/PKG-INFO +1 -1
  2. {vision_agent-0.2.40 → vision_agent-0.2.42}/pyproject.toml +1 -1
  3. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/vision_agent.py +126 -112
  4. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/tools/tools.py +6 -6
  5. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/execute.py +24 -17
  6. {vision_agent-0.2.40 → vision_agent-0.2.42}/LICENSE +0 -0
  7. {vision_agent-0.2.40 → vision_agent-0.2.42}/README.md +0 -0
  8. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/__init__.py +0 -0
  9. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/__init__.py +0 -0
  10. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/agent.py +0 -0
  11. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/agent_coder.py +0 -0
  12. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/agent_coder_prompts.py +0 -0
  13. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/data_interpreter.py +0 -0
  14. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/data_interpreter_prompts.py +0 -0
  15. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/easytool.py +0 -0
  16. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/easytool_prompts.py +0 -0
  17. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/easytool_v2.py +0 -0
  18. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/easytool_v2_prompts.py +0 -0
  19. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/reflexion.py +0 -0
  20. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/reflexion_prompts.py +0 -0
  21. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/agent/vision_agent_prompts.py +0 -0
  22. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/fonts/__init__.py +0 -0
  23. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  24. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/llm/__init__.py +0 -0
  25. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/llm/llm.py +0 -0
  26. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/lmm/__init__.py +0 -0
  27. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/lmm/lmm.py +0 -0
  28. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/tools/__init__.py +0 -0
  29. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/tools/easytool_tools.py +0 -0
  30. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/tools/prompts.py +0 -0
  31. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/tools/tool_utils.py +0 -0
  32. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/__init__.py +0 -0
  33. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/image_utils.py +0 -0
  34. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/sim.py +0 -0
  35. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/type_defs.py +0 -0
  36. {vision_agent-0.2.40 → vision_agent-0.2.42}/vision_agent/utils/video.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.40
3
+ Version: 0.2.42
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "vision-agent"
7
- version = "0.2.40"
7
+ version = "0.2.42"
8
8
  description = "Toolset for Vision Agent"
9
9
  authors = ["Landing AI <dev@landing.ai>"]
10
10
  readme = "README.md"
@@ -25,12 +25,13 @@ from vision_agent.agent.vision_agent_prompts import (
25
25
  from vision_agent.llm import LLM, OpenAILLM
26
26
  from vision_agent.lmm import LMM, OpenAILMM
27
27
  from vision_agent.utils import CodeInterpreterFactory, Execution
28
+ from vision_agent.utils.execute import CodeInterpreter
29
+ from vision_agent.utils.image_utils import b64_to_pil
28
30
  from vision_agent.utils.sim import Sim
29
31
 
30
32
  logging.basicConfig(stream=sys.stdout)
31
33
  _LOGGER = logging.getLogger(__name__)
32
34
  _MAX_TABULATE_COL_WIDTH = 80
33
- _EXECUTE = CodeInterpreterFactory.get_default_instance()
34
35
  _CONSOLE = Console()
35
36
  _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
36
37
 
@@ -122,6 +123,7 @@ def write_and_test_code(
122
123
  coder: LLM,
123
124
  tester: LLM,
124
125
  debugger: LLM,
126
+ code_interpreter: CodeInterpreter,
125
127
  log_progress: Callable[[Dict[str, Any]], None],
126
128
  verbosity: int = 0,
127
129
  max_retries: int = 3,
@@ -158,7 +160,7 @@ def write_and_test_code(
158
160
  },
159
161
  }
160
162
  )
161
- result = _EXECUTE.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
163
+ result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
162
164
  log_progress(
163
165
  {
164
166
  "type": "code",
@@ -173,7 +175,7 @@ def write_and_test_code(
173
175
  if verbosity == 2:
174
176
  _print_code("Initial code and tests:", code, test)
175
177
  _LOGGER.info(
176
- f"Initial code execution result:\n{result.text(include_logs=False)}"
178
+ f"Initial code execution result:\n{result.text(include_logs=True)}"
177
179
  )
178
180
 
179
181
  count = 0
@@ -210,7 +212,7 @@ def write_and_test_code(
210
212
  {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
211
213
  )
212
214
 
213
- result = _EXECUTE.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
215
+ result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
214
216
  log_progress(
215
217
  {
216
218
  "type": "code",
@@ -228,7 +230,7 @@ def write_and_test_code(
228
230
  )
229
231
  _print_code("Code and test after attempted fix:", code, test)
230
232
  _LOGGER.info(
231
- f"Code execution result after attempted fix: {result.text(include_logs=False)}"
233
+ f"Code execution result after attempted fix: {result.text(include_logs=True)}"
232
234
  )
233
235
  count += 1
234
236
 
@@ -377,6 +379,7 @@ class VisionAgent(Agent):
377
379
  chat: List[Dict[str, str]],
378
380
  media: Optional[Union[str, Path]] = None,
379
381
  self_reflection: bool = False,
382
+ display_visualization: bool = False,
380
383
  ) -> Dict[str, Any]:
381
384
  """Chat with Vision Agent and return intermediate information regarding the task.
382
385
 
@@ -385,6 +388,7 @@ class VisionAgent(Agent):
385
388
  [{"role": "user", "content": "describe your task here..."}].
386
389
  media (Optional[Union[str, Path]]): The media file to be used in the task.
387
390
  self_reflection (bool): Whether to reflect on the task and debug the code.
391
+ display_visualization (bool): If True, opens a new window locally to show the image(s) created by the visualization code (if any).
388
392
 
389
393
  Returns:
390
394
  Dict[str, Any]: A dictionary containing the code, test, test result, plan,
@@ -394,127 +398,137 @@ class VisionAgent(Agent):
394
398
  if not chat:
395
399
  raise ValueError("Chat cannot be empty.")
396
400
 
397
- if media is not None:
398
- media = _EXECUTE.upload_file(media)
399
- for chat_i in chat:
400
- if chat_i["role"] == "user":
401
- chat_i["content"] += f" Image name {media}"
402
-
403
- # re-grab custom tools
404
- global _DEFAULT_IMPORT
405
- _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
406
-
407
- code = ""
408
- test = ""
409
- working_memory: List[Dict[str, str]] = []
410
- results = {"code": "", "test": "", "plan": []}
411
- plan = []
412
- success = False
413
- retries = 0
414
-
415
- while not success and retries < self.max_retries:
416
- self.log_progress(
417
- {
418
- "type": "plans",
419
- "status": "started",
420
- }
421
- )
422
- plan_i = write_plan(
423
- chat,
424
- T.TOOL_DESCRIPTIONS,
425
- format_memory(working_memory),
426
- self.planner,
427
- media=[media] if media else None,
428
- )
429
- plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
430
-
431
- self.log_progress(
432
- {
433
- "type": "plans",
434
- "status": "completed",
435
- "payload": plan_i,
436
- }
437
- )
438
- if self.verbosity >= 1:
439
-
440
- _LOGGER.info(
441
- f"""
442
- {tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
443
- )
444
-
445
- tool_info = retrieve_tools(
446
- plan_i,
447
- self.tool_recommender,
448
- self.log_progress,
449
- self.verbosity,
450
- )
451
- results = write_and_test_code(
452
- FULL_TASK.format(user_request=chat[0]["content"], subtasks=plan_i_str),
453
- tool_info,
454
- T.UTILITIES_DOCSTRING,
455
- format_memory(working_memory),
456
- self.coder,
457
- self.tester,
458
- self.debugger,
459
- self.log_progress,
460
- verbosity=self.verbosity,
461
- input_media=media,
462
- )
463
- success = cast(bool, results["success"])
464
- code = cast(str, results["code"])
465
- test = cast(str, results["test"])
466
- working_memory.extend(results["working_memory"]) # type: ignore
467
- plan.append({"code": code, "test": test, "plan": plan_i})
468
-
469
- if self_reflection:
401
+ # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
402
+ with CodeInterpreterFactory.new_instance() as code_interpreter:
403
+ if media is not None:
404
+ media = code_interpreter.upload_file(media)
405
+ for chat_i in chat:
406
+ if chat_i["role"] == "user":
407
+ chat_i["content"] += f" Image name {media}"
408
+
409
+ # re-grab custom tools
410
+ global _DEFAULT_IMPORT
411
+ _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
412
+
413
+ code = ""
414
+ test = ""
415
+ working_memory: List[Dict[str, str]] = []
416
+ results = {"code": "", "test": "", "plan": []}
417
+ plan = []
418
+ success = False
419
+ retries = 0
420
+
421
+ while not success and retries < self.max_retries:
470
422
  self.log_progress(
471
423
  {
472
- "type": "self_reflection",
424
+ "type": "plans",
473
425
  "status": "started",
474
426
  }
475
427
  )
476
- reflection = reflect(
428
+ plan_i = write_plan(
477
429
  chat,
478
- FULL_TASK.format(
479
- user_request=chat[0]["content"], subtasks=plan_i_str
480
- ),
481
- code,
430
+ T.TOOL_DESCRIPTIONS,
431
+ format_memory(working_memory),
482
432
  self.planner,
433
+ media=[media] if media else None,
483
434
  )
484
- if self.verbosity > 0:
485
- _LOGGER.info(f"Reflection: {reflection}")
486
- feedback = cast(str, reflection["feedback"])
487
- success = cast(bool, reflection["success"])
435
+ plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
436
+
488
437
  self.log_progress(
489
438
  {
490
- "type": "self_reflection",
491
- "status": "completed" if success else "failed",
492
- "payload": reflection,
439
+ "type": "plans",
440
+ "status": "completed",
441
+ "payload": plan_i,
493
442
  }
494
443
  )
495
- working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
496
-
497
- retries += 1
444
+ if self.verbosity >= 1:
445
+ _LOGGER.info(
446
+ f"\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
447
+ )
448
+
449
+ tool_info = retrieve_tools(
450
+ plan_i,
451
+ self.tool_recommender,
452
+ self.log_progress,
453
+ self.verbosity,
454
+ )
455
+ results = write_and_test_code(
456
+ task=FULL_TASK.format(
457
+ user_request=chat[0]["content"], subtasks=plan_i_str
458
+ ),
459
+ tool_info=tool_info,
460
+ tool_utils=T.UTILITIES_DOCSTRING,
461
+ working_memory=format_memory(working_memory),
462
+ coder=self.coder,
463
+ tester=self.tester,
464
+ debugger=self.debugger,
465
+ code_interpreter=code_interpreter,
466
+ log_progress=self.log_progress,
467
+ verbosity=self.verbosity,
468
+ input_media=media,
469
+ )
470
+ success = cast(bool, results["success"])
471
+ code = cast(str, results["code"])
472
+ test = cast(str, results["test"])
473
+ working_memory.extend(results["working_memory"]) # type: ignore
474
+ plan.append({"code": code, "test": test, "plan": plan_i})
475
+
476
+ if self_reflection:
477
+ self.log_progress(
478
+ {
479
+ "type": "self_reflection",
480
+ "status": "started",
481
+ }
482
+ )
483
+ reflection = reflect(
484
+ chat,
485
+ FULL_TASK.format(
486
+ user_request=chat[0]["content"], subtasks=plan_i_str
487
+ ),
488
+ code,
489
+ self.planner,
490
+ )
491
+ if self.verbosity > 0:
492
+ _LOGGER.info(f"Reflection: {reflection}")
493
+ feedback = cast(str, reflection["feedback"])
494
+ success = cast(bool, reflection["success"])
495
+ self.log_progress(
496
+ {
497
+ "type": "self_reflection",
498
+ "status": "completed" if success else "failed",
499
+ "payload": reflection,
500
+ }
501
+ )
502
+ working_memory.append(
503
+ {"code": f"{code}\n{test}", "feedback": feedback}
504
+ )
505
+
506
+ retries += 1
507
+
508
+ execution_result = cast(Execution, results["test_result"])
509
+ self.log_progress(
510
+ {
511
+ "type": "final_code",
512
+ "status": "completed" if success else "failed",
513
+ "payload": {
514
+ "code": code,
515
+ "test": test,
516
+ "result": execution_result.to_json(),
517
+ },
518
+ }
519
+ )
498
520
 
499
- self.log_progress(
500
- {
501
- "type": "final_code",
502
- "status": "completed" if success else "failed",
503
- "payload": {
504
- "code": code,
505
- "test": test,
506
- "result": cast(Execution, results["test_result"]).to_json(),
507
- },
521
+ if display_visualization:
522
+ for res in execution_result.results:
523
+ if res.png:
524
+ b64_to_pil(res.png).show()
525
+ return {
526
+ "code": code,
527
+ "test": test,
528
+ "test_result": execution_result,
529
+ "plan": plan,
530
+ "working_memory": working_memory,
508
531
  }
509
- )
510
-
511
- return {
512
- "code": code,
513
- "test": test,
514
- "test_result": results["test_result"],
515
- "plan": plan,
516
- "working_memory": working_memory,
517
- }
518
532
 
519
533
  def log_progress(self, data: Dict[str, Any]) -> None:
520
534
  if self.report_progress_callback is not None:
@@ -524,7 +524,7 @@ def save_image(image: np.ndarray) -> str:
524
524
  def overlay_bounding_boxes(
525
525
  image: np.ndarray, bboxes: List[Dict[str, Any]]
526
526
  ) -> np.ndarray:
527
- """'display_bounding_boxes' is a utility function that displays bounding boxes on
527
+ """'overlay_bounding_boxes' is a utility function that displays bounding boxes on
528
528
  an image.
529
529
 
530
530
  Parameters:
@@ -537,7 +537,7 @@ def overlay_bounding_boxes(
537
537
 
538
538
  Example
539
539
  -------
540
- >>> image_with_bboxes = display_bounding_boxes(
540
+ >>> image_with_bboxes = overlay_bounding_boxes(
541
541
  image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
542
542
  )
543
543
  """
@@ -583,7 +583,7 @@ def overlay_bounding_boxes(
583
583
  def overlay_segmentation_masks(
584
584
  image: np.ndarray, masks: List[Dict[str, Any]]
585
585
  ) -> np.ndarray:
586
- """'display_segmentation_masks' is a utility function that displays segmentation
586
+ """'overlay_segmentation_masks' is a utility function that displays segmentation
587
587
  masks.
588
588
 
589
589
  Parameters:
@@ -595,7 +595,7 @@ def overlay_segmentation_masks(
595
595
 
596
596
  Example
597
597
  -------
598
- >>> image_with_masks = display_segmentation_masks(
598
+ >>> image_with_masks = overlay_segmentation_masks(
599
599
  image,
600
600
  [{
601
601
  'score': 0.99,
@@ -633,7 +633,7 @@ def overlay_segmentation_masks(
633
633
  def overlay_heat_map(
634
634
  image: np.ndarray, heat_map: Dict[str, Any], alpha: float = 0.8
635
635
  ) -> np.ndarray:
636
- """'display_heat_map' is a utility function that displays a heat map on an image.
636
+ """'overlay_heat_map' is a utility function that displays a heat map on an image.
637
637
 
638
638
  Parameters:
639
639
  image (np.ndarray): The image to display the heat map on.
@@ -646,7 +646,7 @@ def overlay_heat_map(
646
646
 
647
647
  Example
648
648
  -------
649
- >>> image_with_heat_map = display_heat_map(
649
+ >>> image_with_heat_map = overlay_heat_map(
650
650
  image,
651
651
  {
652
652
  'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
@@ -8,6 +8,7 @@ import re
8
8
  import sys
9
9
  import tempfile
10
10
  import traceback
11
+ import warnings
11
12
  from enum import Enum
12
13
  from io import IOBase
13
14
  from pathlib import Path
@@ -218,7 +219,7 @@ class Logs(BaseModel):
218
219
  stdout_str = "\n".join(self.stdout)
219
220
  stderr_str = "\n".join(self.stderr)
220
221
  return _remove_escape_and_color_codes(
221
- f"stdout:\n{stdout_str}\nstderr:\n{stderr_str}"
222
+ f"----- stdout -----\n{stdout_str}\n----- stderr -----\n{stderr_str}"
222
223
  )
223
224
 
224
225
 
@@ -263,21 +264,19 @@ class Execution(BaseModel):
263
264
  """
264
265
  Returns the text representation of this object, i.e. including the main result or the error traceback, optionally along with the logs (stdout, stderr).
265
266
  """
266
- prefix = (
267
- "\n".join(self.logs.stdout) + "\n".join(self.logs.stderr)
268
- if include_logs
269
- else ""
270
- )
267
+ prefix = str(self.logs) if include_logs else ""
271
268
  if self.error:
272
- return prefix + "\n" + self.error.traceback
273
- return next(
269
+ return prefix + "\n----- Error -----\n" + self.error.traceback
270
+
271
+ result_str = [
274
272
  (
275
- prefix + "\n" + (res.text or "")
276
- for res in self.results
273
+ f"----- Final output -----\n{res.text}"
277
274
  if res.is_main_result
278
- ),
279
- prefix,
280
- )
275
+ else f"----- Intermediate output-----\n{res.text}"
276
+ )
277
+ for res in self.results
278
+ ]
279
+ return prefix + "\n" + "\n".join(result_str)
281
280
 
282
281
  @property
283
282
  def success(self) -> bool:
@@ -404,7 +403,7 @@ print(f"Vision Agent version: {va_version}")"""
404
403
  self.interpreter.notebook.restart_kernel()
405
404
 
406
405
  def exec_cell(self, code: str) -> Execution:
407
- execution = self.interpreter.notebook.exec_cell(code)
406
+ execution = self.interpreter.notebook.exec_cell(code, timeout=self.timeout)
408
407
  return Execution.from_e2b_execution(execution)
409
408
 
410
409
  def upload_file(self, file: Union[str, Path, IO]) -> str:
@@ -508,16 +507,24 @@ class CodeInterpreterFactory:
508
507
 
509
508
  @staticmethod
510
509
  def get_default_instance() -> CodeInterpreter:
510
+ warnings.warn(
511
+ "Use new_instance() instead for production usage, get_default_instance() is for testing and will be removed in the future."
512
+ )
511
513
  inst_map = CodeInterpreterFactory._instance_map
512
514
  instance = inst_map.get(CodeInterpreterFactory._default_key)
513
515
  if instance:
514
516
  return instance
517
+ instance = CodeInterpreterFactory.new_instance()
518
+ inst_map[CodeInterpreterFactory._default_key] = instance
519
+ return instance
520
+
521
+ @staticmethod
522
+ def new_instance() -> CodeInterpreter:
515
523
  if os.getenv("CODE_SANDBOX_RUNTIME") == "e2b":
516
- instance = E2BCodeInterpreter(timeout=600)
517
- atexit.register(instance.close)
524
+ instance: CodeInterpreter = E2BCodeInterpreter(timeout=600)
518
525
  else:
519
526
  instance = LocalCodeInterpreter(timeout=600)
520
- inst_map[CodeInterpreterFactory._default_key] = instance
527
+ atexit.register(instance.close)
521
528
  return instance
522
529
 
523
530
 
File without changes
File without changes