vision-agent 0.2.82__py3-none-any.whl → 0.2.84__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -176,6 +176,7 @@ def pick_plan(
176
176
  model: LMM,
177
177
  code_interpreter: CodeInterpreter,
178
178
  verbosity: int = 0,
179
+ max_retries: int = 3,
179
180
  ) -> Tuple[str, str]:
180
181
  chat = copy.deepcopy(chat)
181
182
  if chat[-1]["role"] != "user":
@@ -192,13 +193,13 @@ def pick_plan(
192
193
  if len(tool_output.logs.stdout) > 0:
193
194
  tool_output_str = tool_output.logs.stdout[0]
194
195
 
195
- if verbosity >= 1:
196
+ if verbosity == 2:
196
197
  _print_code("Initial code and tests:", code)
197
198
  _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
198
199
 
199
200
  # retry if the tool output is empty or code fails
200
- count = 1
201
- while (not tool_output.success or tool_output_str == "") and count < 3:
201
+ count = 0
202
+ while (not tool_output.success or tool_output_str == "") and count < max_retries:
202
203
  prompt = TEST_PLANS.format(
203
204
  docstring=tool_info,
204
205
  plans=plan_str,
@@ -214,12 +215,15 @@ def pick_plan(
214
215
  if len(tool_output.logs.stdout) > 0:
215
216
  tool_output_str = tool_output.logs.stdout[0]
216
217
 
217
- if verbosity == 1:
218
+ if verbosity == 2:
218
219
  _print_code("Code and test after attempted fix:", code)
219
220
  _LOGGER.info(f"Code execution result after attempte {count}")
220
221
 
221
222
  count += 1
222
223
 
224
+ if verbosity >= 1:
225
+ _print_code("Final code:", code)
226
+
223
227
  user_req = chat[-1]["content"]
224
228
  context = USER_REQ.format(user_request=user_req)
225
229
  # because the tool picker model gets the image as well, we have to be careful with
@@ -408,7 +412,7 @@ def debug_code(
408
412
  FIX_BUG.format(
409
413
  code=code,
410
414
  tests=test,
411
- result="\n".join(result.text().splitlines()[-50:]),
415
+ result="\n".join(result.text().splitlines()[-100:]),
412
416
  feedback=format_memory(working_memory + new_working_memory),
413
417
  )
414
418
  )
@@ -673,92 +677,85 @@ class VisionAgent(Agent):
673
677
  working_memory: List[Dict[str, str]] = []
674
678
  results = {"code": "", "test": "", "plan": []}
675
679
  plan = []
676
- success = False
677
- retries = 0
678
-
679
- while not success and retries < self.max_retries:
680
- self.log_progress(
681
- {
682
- "type": "plans",
683
- "status": "started",
684
- }
685
- )
686
- plans = write_plans(
687
- int_chat,
688
- T.TOOL_DESCRIPTIONS,
689
- format_memory(working_memory),
690
- self.planner,
691
- )
692
680
 
693
- if self.verbosity >= 1:
694
- for p in plans:
695
- _LOGGER.info(
696
- f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
697
- )
698
-
699
- tool_infos = retrieve_tools(
700
- plans,
701
- self.tool_recommender,
702
- self.log_progress,
703
- self.verbosity,
704
- )
705
- best_plan, tool_output_str = pick_plan(
706
- int_chat,
707
- plans,
708
- tool_infos["all"],
709
- self.coder,
710
- code_interpreter,
711
- verbosity=self.verbosity,
712
- )
681
+ self.log_progress(
682
+ {
683
+ "type": "plans",
684
+ "status": "started",
685
+ }
686
+ )
687
+ plans = write_plans(
688
+ int_chat,
689
+ T.TOOL_DESCRIPTIONS,
690
+ format_memory(working_memory),
691
+ self.planner,
692
+ )
713
693
 
714
- if best_plan in plans and best_plan in tool_infos:
715
- plan_i = plans[best_plan]
716
- tool_info = tool_infos[best_plan]
717
- else:
718
- if self.verbosity >= 1:
719
- _LOGGER.warning(
720
- f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
721
- )
722
- k = list(plans.keys())[0]
723
- plan_i = plans[k]
724
- tool_info = tool_infos[k]
725
-
726
- self.log_progress(
727
- {
728
- "type": "plans",
729
- "status": "completed",
730
- "payload": plan_i,
731
- }
732
- )
733
- if self.verbosity >= 1:
694
+ if self.verbosity >= 1:
695
+ for p in plans:
734
696
  _LOGGER.info(
735
- f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
697
+ f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
736
698
  )
737
699
 
738
- results = write_and_test_code(
739
- chat=[
740
- {"role": c["role"], "content": c["content"]} for c in int_chat
741
- ],
742
- plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
743
- tool_info=tool_info,
744
- tool_output=tool_output_str,
745
- tool_utils=T.UTILITIES_DOCSTRING,
746
- working_memory=working_memory,
747
- coder=self.coder,
748
- tester=self.tester,
749
- debugger=self.debugger,
750
- code_interpreter=code_interpreter,
751
- log_progress=self.log_progress,
752
- verbosity=self.verbosity,
753
- media=media_list,
700
+ tool_infos = retrieve_tools(
701
+ plans,
702
+ self.tool_recommender,
703
+ self.log_progress,
704
+ self.verbosity,
705
+ )
706
+ best_plan, tool_output_str = pick_plan(
707
+ int_chat,
708
+ plans,
709
+ tool_infos["all"],
710
+ self.coder,
711
+ code_interpreter,
712
+ verbosity=self.verbosity,
713
+ )
714
+
715
+ if best_plan in plans and best_plan in tool_infos:
716
+ plan_i = plans[best_plan]
717
+ tool_info = tool_infos[best_plan]
718
+ else:
719
+ if self.verbosity >= 1:
720
+ _LOGGER.warning(
721
+ f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
722
+ )
723
+ k = list(plans.keys())[0]
724
+ plan_i = plans[k]
725
+ tool_info = tool_infos[k]
726
+
727
+ self.log_progress(
728
+ {
729
+ "type": "plans",
730
+ "status": "completed",
731
+ "payload": plan_i,
732
+ }
733
+ )
734
+ if self.verbosity >= 1:
735
+ _LOGGER.info(
736
+ f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
754
737
  )
755
- success = cast(bool, results["success"])
756
- code = cast(str, results["code"])
757
- test = cast(str, results["test"])
758
- working_memory.extend(results["working_memory"]) # type: ignore
759
- plan.append({"code": code, "test": test, "plan": plan_i})
760
738
 
761
- retries += 1
739
+ results = write_and_test_code(
740
+ chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
741
+ plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
742
+ tool_info=tool_info,
743
+ tool_output=tool_output_str,
744
+ tool_utils=T.UTILITIES_DOCSTRING,
745
+ working_memory=working_memory,
746
+ coder=self.coder,
747
+ tester=self.tester,
748
+ debugger=self.debugger,
749
+ code_interpreter=code_interpreter,
750
+ log_progress=self.log_progress,
751
+ verbosity=self.verbosity,
752
+ media=media_list,
753
+ )
754
+ success = cast(bool, results["success"])
755
+ code = cast(str, results["code"])
756
+ test = cast(str, results["test"])
757
+ working_memory.extend(results["working_memory"]) # type: ignore
758
+ plan.append({"code": code, "test": test, "plan": plan_i})
762
759
 
763
760
  execution_result = cast(Execution, results["test_result"])
764
761
  self.log_progress(
@@ -1 +1 @@
1
- from .lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
1
+ from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, Message, OllamaLMM, OpenAILMM
vision_agent/lmm/lmm.py CHANGED
@@ -7,7 +7,9 @@ from abc import ABC, abstractmethod
7
7
  from pathlib import Path
8
8
  from typing import Any, Callable, Dict, List, Optional, Union, cast
9
9
 
10
+ import anthropic
10
11
  import requests
12
+ from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
11
13
  from openai import AzureOpenAI, OpenAI
12
14
  from PIL import Image
13
15
 
@@ -375,3 +377,92 @@ class OllamaLMM(LMM):
375
377
 
376
378
  response = response.json()
377
379
  return response["response"] # type: ignore
380
+
381
+
382
+ class ClaudeSonnetLMM(LMM):
383
+ r"""An LMM class for Anthropic's Claude Sonnet model."""
384
+
385
+ def __init__(
386
+ self,
387
+ api_key: Optional[str] = None,
388
+ model_name: str = "claude-3-sonnet-20240229",
389
+ max_tokens: int = 4096,
390
+ temperature: float = 0.7,
391
+ **kwargs: Any,
392
+ ):
393
+ self.client = anthropic.Anthropic(api_key=api_key)
394
+ self.model_name = model_name
395
+ self.max_tokens = max_tokens
396
+ self.temperature = temperature
397
+ self.kwargs = kwargs
398
+
399
+ def __call__(
400
+ self,
401
+ input: Union[str, List[Dict[str, Any]]],
402
+ ) -> str:
403
+ if isinstance(input, str):
404
+ return self.generate(input)
405
+ return self.chat(input)
406
+
407
+ def chat(
408
+ self,
409
+ chat: List[Dict[str, Any]],
410
+ ) -> str:
411
+ messages: List[MessageParam] = []
412
+ for msg in chat:
413
+ content: List[Union[TextBlockParam, ImageBlockParam]] = [
414
+ TextBlockParam(type="text", text=msg["content"])
415
+ ]
416
+ if "media" in msg:
417
+ for media_path in msg["media"]:
418
+ encoded_media = encode_media(media_path)
419
+ content.append(
420
+ ImageBlockParam(
421
+ type="image",
422
+ source={
423
+ "type": "base64",
424
+ "media_type": "image/png",
425
+ "data": encoded_media,
426
+ },
427
+ )
428
+ )
429
+ messages.append({"role": msg["role"], "content": content})
430
+
431
+ response = self.client.messages.create(
432
+ model=self.model_name,
433
+ max_tokens=self.max_tokens,
434
+ temperature=self.temperature,
435
+ messages=messages,
436
+ **self.kwargs,
437
+ )
438
+ return cast(str, response.content[0].text)
439
+
440
+ def generate(
441
+ self,
442
+ prompt: str,
443
+ media: Optional[List[Union[str, Path]]] = None,
444
+ ) -> str:
445
+ content: List[Union[TextBlockParam, ImageBlockParam]] = [
446
+ TextBlockParam(type="text", text=prompt)
447
+ ]
448
+ if media:
449
+ for m in media:
450
+ encoded_media = encode_media(m)
451
+ content.append(
452
+ ImageBlockParam(
453
+ type="image",
454
+ source={
455
+ "type": "base64",
456
+ "media_type": "image/png",
457
+ "data": encoded_media,
458
+ },
459
+ )
460
+ )
461
+ response = self.client.messages.create(
462
+ model=self.model_name,
463
+ max_tokens=self.max_tokens,
464
+ temperature=self.temperature,
465
+ messages=[{"role": "user", "content": content}],
466
+ **self.kwargs,
467
+ )
468
+ return cast(str, response.content[0].text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.82
3
+ Version: 0.2.84
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -9,6 +9,7 @@ Classifier: Programming Language :: Python :: 3
9
9
  Classifier: Programming Language :: Python :: 3.9
10
10
  Classifier: Programming Language :: Python :: 3.10
11
11
  Classifier: Programming Language :: Python :: 3.11
12
+ Requires-Dist: anthropic (>=0.31.0,<0.32.0)
12
13
  Requires-Dist: e2b (>=0.17.1,<0.18.0)
13
14
  Requires-Dist: e2b-code-interpreter (==0.0.11a2)
14
15
  Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
@@ -1,12 +1,12 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
2
  vision_agent/agent/__init__.py,sha256=IUwfbPMcT8X_rnXMLmI8gJ4ltsHy_XSs9eLiKURJxeY,81
3
3
  vision_agent/agent/agent.py,sha256=ZK-5lOtd9-eD9aWcXssJpnOyvZuO7_5hAmnb-6sWVe8,569
4
- vision_agent/agent/vision_agent.py,sha256=2yQcwYoGF4-NsjD6OY1_XjisYJxr2K1871mnwyWioKo,29148
4
+ vision_agent/agent/vision_agent.py,sha256=fLCkqYJzk9SNtu8TzKBk0TLZrXDMTCqgI3FI-zkc-qs,28768
5
5
  vision_agent/agent/vision_agent_prompts.py,sha256=brBV-SmzyzTG5M9nfV3R5xdYT_BUYOKzxNFmTa2Sp-o,11049
6
6
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
8
- vision_agent/lmm/__init__.py,sha256=bw24xyQJHGzmph5e-bKCiTh9AX6tRFI2OUd0mofxjZI,68
9
- vision_agent/lmm/lmm.py,sha256=UtUl3k2TiN4gbdlqE16rexQ72WFE7FGru0yguyJ4jAE,12129
8
+ vision_agent/lmm/__init__.py,sha256=j9mQsIXQOYfW6nFd47uTwuBe1ranpEbwW308qLfCWN0,85
9
+ vision_agent/lmm/lmm.py,sha256=035uONyp6_jD3PVdNdSg2PMHOG1voqnpsn2IyybUENs,15147
10
10
  vision_agent/tools/__init__.py,sha256=k69hvcy2FWjDqVA0klzybKeoToOH_bom5NTVSliA0Og,1838
11
11
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
12
12
  vision_agent/tools/tool_utils.py,sha256=6z0jrvUnesJEFqDHZoAvbXPic8rzh0KfILL07tu0uRo,2205
@@ -18,7 +18,7 @@ vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOk
18
18
  vision_agent/utils/sim.py,sha256=1HTaiVaBiKeyXIy21IYGXlPw0TipOyw9FPOJDfyLI94,4409
19
19
  vision_agent/utils/type_defs.py,sha256=QeQRRIlklZMWzxROcCn5ELxP89nYdXGydy1rAiSpZZw,1384
20
20
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
21
- vision_agent-0.2.82.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
- vision_agent-0.2.82.dist-info/METADATA,sha256=hvZlgdZ55jCzin2ZHECYtMLH6n6yTa3yhnXDU8Nvjcc,9433
23
- vision_agent-0.2.82.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
24
- vision_agent-0.2.82.dist-info/RECORD,,
21
+ vision_agent-0.2.84.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
22
+ vision_agent-0.2.84.dist-info/METADATA,sha256=mZM17x03oCnI8tp4g7psZzonwNlS0fqN0f78dWbob-o,9477
23
+ vision_agent-0.2.84.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
24
+ vision_agent-0.2.84.dist-info/RECORD,,