vision-agent 0.2.82__py3-none-any.whl → 0.2.84__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +81 -84
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +91 -0
- {vision_agent-0.2.82.dist-info → vision_agent-0.2.84.dist-info}/METADATA +2 -1
- {vision_agent-0.2.82.dist-info → vision_agent-0.2.84.dist-info}/RECORD +7 -7
- {vision_agent-0.2.82.dist-info → vision_agent-0.2.84.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.82.dist-info → vision_agent-0.2.84.dist-info}/WHEEL +0 -0
@@ -176,6 +176,7 @@ def pick_plan(
|
|
176
176
|
model: LMM,
|
177
177
|
code_interpreter: CodeInterpreter,
|
178
178
|
verbosity: int = 0,
|
179
|
+
max_retries: int = 3,
|
179
180
|
) -> Tuple[str, str]:
|
180
181
|
chat = copy.deepcopy(chat)
|
181
182
|
if chat[-1]["role"] != "user":
|
@@ -192,13 +193,13 @@ def pick_plan(
|
|
192
193
|
if len(tool_output.logs.stdout) > 0:
|
193
194
|
tool_output_str = tool_output.logs.stdout[0]
|
194
195
|
|
195
|
-
if verbosity
|
196
|
+
if verbosity == 2:
|
196
197
|
_print_code("Initial code and tests:", code)
|
197
198
|
_LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
|
198
199
|
|
199
200
|
# retry if the tool output is empty or code fails
|
200
|
-
count =
|
201
|
-
while (not tool_output.success or tool_output_str == "") and count <
|
201
|
+
count = 0
|
202
|
+
while (not tool_output.success or tool_output_str == "") and count < max_retries:
|
202
203
|
prompt = TEST_PLANS.format(
|
203
204
|
docstring=tool_info,
|
204
205
|
plans=plan_str,
|
@@ -214,12 +215,15 @@ def pick_plan(
|
|
214
215
|
if len(tool_output.logs.stdout) > 0:
|
215
216
|
tool_output_str = tool_output.logs.stdout[0]
|
216
217
|
|
217
|
-
if verbosity ==
|
218
|
+
if verbosity == 2:
|
218
219
|
_print_code("Code and test after attempted fix:", code)
|
219
220
|
_LOGGER.info(f"Code execution result after attempte {count}")
|
220
221
|
|
221
222
|
count += 1
|
222
223
|
|
224
|
+
if verbosity >= 1:
|
225
|
+
_print_code("Final code:", code)
|
226
|
+
|
223
227
|
user_req = chat[-1]["content"]
|
224
228
|
context = USER_REQ.format(user_request=user_req)
|
225
229
|
# because the tool picker model gets the image as well, we have to be careful with
|
@@ -408,7 +412,7 @@ def debug_code(
|
|
408
412
|
FIX_BUG.format(
|
409
413
|
code=code,
|
410
414
|
tests=test,
|
411
|
-
result="\n".join(result.text().splitlines()[-
|
415
|
+
result="\n".join(result.text().splitlines()[-100:]),
|
412
416
|
feedback=format_memory(working_memory + new_working_memory),
|
413
417
|
)
|
414
418
|
)
|
@@ -673,92 +677,85 @@ class VisionAgent(Agent):
|
|
673
677
|
working_memory: List[Dict[str, str]] = []
|
674
678
|
results = {"code": "", "test": "", "plan": []}
|
675
679
|
plan = []
|
676
|
-
success = False
|
677
|
-
retries = 0
|
678
|
-
|
679
|
-
while not success and retries < self.max_retries:
|
680
|
-
self.log_progress(
|
681
|
-
{
|
682
|
-
"type": "plans",
|
683
|
-
"status": "started",
|
684
|
-
}
|
685
|
-
)
|
686
|
-
plans = write_plans(
|
687
|
-
int_chat,
|
688
|
-
T.TOOL_DESCRIPTIONS,
|
689
|
-
format_memory(working_memory),
|
690
|
-
self.planner,
|
691
|
-
)
|
692
680
|
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
best_plan, tool_output_str = pick_plan(
|
706
|
-
int_chat,
|
707
|
-
plans,
|
708
|
-
tool_infos["all"],
|
709
|
-
self.coder,
|
710
|
-
code_interpreter,
|
711
|
-
verbosity=self.verbosity,
|
712
|
-
)
|
681
|
+
self.log_progress(
|
682
|
+
{
|
683
|
+
"type": "plans",
|
684
|
+
"status": "started",
|
685
|
+
}
|
686
|
+
)
|
687
|
+
plans = write_plans(
|
688
|
+
int_chat,
|
689
|
+
T.TOOL_DESCRIPTIONS,
|
690
|
+
format_memory(working_memory),
|
691
|
+
self.planner,
|
692
|
+
)
|
713
693
|
|
714
|
-
|
715
|
-
|
716
|
-
tool_info = tool_infos[best_plan]
|
717
|
-
else:
|
718
|
-
if self.verbosity >= 1:
|
719
|
-
_LOGGER.warning(
|
720
|
-
f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
|
721
|
-
)
|
722
|
-
k = list(plans.keys())[0]
|
723
|
-
plan_i = plans[k]
|
724
|
-
tool_info = tool_infos[k]
|
725
|
-
|
726
|
-
self.log_progress(
|
727
|
-
{
|
728
|
-
"type": "plans",
|
729
|
-
"status": "completed",
|
730
|
-
"payload": plan_i,
|
731
|
-
}
|
732
|
-
)
|
733
|
-
if self.verbosity >= 1:
|
694
|
+
if self.verbosity >= 1:
|
695
|
+
for p in plans:
|
734
696
|
_LOGGER.info(
|
735
|
-
f"
|
697
|
+
f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
736
698
|
)
|
737
699
|
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
700
|
+
tool_infos = retrieve_tools(
|
701
|
+
plans,
|
702
|
+
self.tool_recommender,
|
703
|
+
self.log_progress,
|
704
|
+
self.verbosity,
|
705
|
+
)
|
706
|
+
best_plan, tool_output_str = pick_plan(
|
707
|
+
int_chat,
|
708
|
+
plans,
|
709
|
+
tool_infos["all"],
|
710
|
+
self.coder,
|
711
|
+
code_interpreter,
|
712
|
+
verbosity=self.verbosity,
|
713
|
+
)
|
714
|
+
|
715
|
+
if best_plan in plans and best_plan in tool_infos:
|
716
|
+
plan_i = plans[best_plan]
|
717
|
+
tool_info = tool_infos[best_plan]
|
718
|
+
else:
|
719
|
+
if self.verbosity >= 1:
|
720
|
+
_LOGGER.warning(
|
721
|
+
f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
|
722
|
+
)
|
723
|
+
k = list(plans.keys())[0]
|
724
|
+
plan_i = plans[k]
|
725
|
+
tool_info = tool_infos[k]
|
726
|
+
|
727
|
+
self.log_progress(
|
728
|
+
{
|
729
|
+
"type": "plans",
|
730
|
+
"status": "completed",
|
731
|
+
"payload": plan_i,
|
732
|
+
}
|
733
|
+
)
|
734
|
+
if self.verbosity >= 1:
|
735
|
+
_LOGGER.info(
|
736
|
+
f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
|
754
737
|
)
|
755
|
-
success = cast(bool, results["success"])
|
756
|
-
code = cast(str, results["code"])
|
757
|
-
test = cast(str, results["test"])
|
758
|
-
working_memory.extend(results["working_memory"]) # type: ignore
|
759
|
-
plan.append({"code": code, "test": test, "plan": plan_i})
|
760
738
|
|
761
|
-
|
739
|
+
results = write_and_test_code(
|
740
|
+
chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
|
741
|
+
plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
|
742
|
+
tool_info=tool_info,
|
743
|
+
tool_output=tool_output_str,
|
744
|
+
tool_utils=T.UTILITIES_DOCSTRING,
|
745
|
+
working_memory=working_memory,
|
746
|
+
coder=self.coder,
|
747
|
+
tester=self.tester,
|
748
|
+
debugger=self.debugger,
|
749
|
+
code_interpreter=code_interpreter,
|
750
|
+
log_progress=self.log_progress,
|
751
|
+
verbosity=self.verbosity,
|
752
|
+
media=media_list,
|
753
|
+
)
|
754
|
+
success = cast(bool, results["success"])
|
755
|
+
code = cast(str, results["code"])
|
756
|
+
test = cast(str, results["test"])
|
757
|
+
working_memory.extend(results["working_memory"]) # type: ignore
|
758
|
+
plan.append({"code": code, "test": test, "plan": plan_i})
|
762
759
|
|
763
760
|
execution_result = cast(Execution, results["test_result"])
|
764
761
|
self.log_progress(
|
vision_agent/lmm/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
from .lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
|
1
|
+
from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, Message, OllamaLMM, OpenAILMM
|
vision_agent/lmm/lmm.py
CHANGED
@@ -7,7 +7,9 @@ from abc import ABC, abstractmethod
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
9
9
|
|
10
|
+
import anthropic
|
10
11
|
import requests
|
12
|
+
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
|
11
13
|
from openai import AzureOpenAI, OpenAI
|
12
14
|
from PIL import Image
|
13
15
|
|
@@ -375,3 +377,92 @@ class OllamaLMM(LMM):
|
|
375
377
|
|
376
378
|
response = response.json()
|
377
379
|
return response["response"] # type: ignore
|
380
|
+
|
381
|
+
|
382
|
+
class ClaudeSonnetLMM(LMM):
|
383
|
+
r"""An LMM class for Anthropic's Claude Sonnet model."""
|
384
|
+
|
385
|
+
def __init__(
|
386
|
+
self,
|
387
|
+
api_key: Optional[str] = None,
|
388
|
+
model_name: str = "claude-3-sonnet-20240229",
|
389
|
+
max_tokens: int = 4096,
|
390
|
+
temperature: float = 0.7,
|
391
|
+
**kwargs: Any,
|
392
|
+
):
|
393
|
+
self.client = anthropic.Anthropic(api_key=api_key)
|
394
|
+
self.model_name = model_name
|
395
|
+
self.max_tokens = max_tokens
|
396
|
+
self.temperature = temperature
|
397
|
+
self.kwargs = kwargs
|
398
|
+
|
399
|
+
def __call__(
|
400
|
+
self,
|
401
|
+
input: Union[str, List[Dict[str, Any]]],
|
402
|
+
) -> str:
|
403
|
+
if isinstance(input, str):
|
404
|
+
return self.generate(input)
|
405
|
+
return self.chat(input)
|
406
|
+
|
407
|
+
def chat(
|
408
|
+
self,
|
409
|
+
chat: List[Dict[str, Any]],
|
410
|
+
) -> str:
|
411
|
+
messages: List[MessageParam] = []
|
412
|
+
for msg in chat:
|
413
|
+
content: List[Union[TextBlockParam, ImageBlockParam]] = [
|
414
|
+
TextBlockParam(type="text", text=msg["content"])
|
415
|
+
]
|
416
|
+
if "media" in msg:
|
417
|
+
for media_path in msg["media"]:
|
418
|
+
encoded_media = encode_media(media_path)
|
419
|
+
content.append(
|
420
|
+
ImageBlockParam(
|
421
|
+
type="image",
|
422
|
+
source={
|
423
|
+
"type": "base64",
|
424
|
+
"media_type": "image/png",
|
425
|
+
"data": encoded_media,
|
426
|
+
},
|
427
|
+
)
|
428
|
+
)
|
429
|
+
messages.append({"role": msg["role"], "content": content})
|
430
|
+
|
431
|
+
response = self.client.messages.create(
|
432
|
+
model=self.model_name,
|
433
|
+
max_tokens=self.max_tokens,
|
434
|
+
temperature=self.temperature,
|
435
|
+
messages=messages,
|
436
|
+
**self.kwargs,
|
437
|
+
)
|
438
|
+
return cast(str, response.content[0].text)
|
439
|
+
|
440
|
+
def generate(
|
441
|
+
self,
|
442
|
+
prompt: str,
|
443
|
+
media: Optional[List[Union[str, Path]]] = None,
|
444
|
+
) -> str:
|
445
|
+
content: List[Union[TextBlockParam, ImageBlockParam]] = [
|
446
|
+
TextBlockParam(type="text", text=prompt)
|
447
|
+
]
|
448
|
+
if media:
|
449
|
+
for m in media:
|
450
|
+
encoded_media = encode_media(m)
|
451
|
+
content.append(
|
452
|
+
ImageBlockParam(
|
453
|
+
type="image",
|
454
|
+
source={
|
455
|
+
"type": "base64",
|
456
|
+
"media_type": "image/png",
|
457
|
+
"data": encoded_media,
|
458
|
+
},
|
459
|
+
)
|
460
|
+
)
|
461
|
+
response = self.client.messages.create(
|
462
|
+
model=self.model_name,
|
463
|
+
max_tokens=self.max_tokens,
|
464
|
+
temperature=self.temperature,
|
465
|
+
messages=[{"role": "user", "content": content}],
|
466
|
+
**self.kwargs,
|
467
|
+
)
|
468
|
+
return cast(str, response.content[0].text)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.82
|
3
|
+
Version: 0.2.84
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -9,6 +9,7 @@ Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: Programming Language :: Python :: 3.9
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
11
11
|
Classifier: Programming Language :: Python :: 3.11
|
12
|
+
Requires-Dist: anthropic (>=0.31.0,<0.32.0)
|
12
13
|
Requires-Dist: e2b (>=0.17.1,<0.18.0)
|
13
14
|
Requires-Dist: e2b-code-interpreter (==0.0.11a2)
|
14
15
|
Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
|
@@ -1,12 +1,12 @@
|
|
1
1
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
2
2
|
vision_agent/agent/__init__.py,sha256=IUwfbPMcT8X_rnXMLmI8gJ4ltsHy_XSs9eLiKURJxeY,81
|
3
3
|
vision_agent/agent/agent.py,sha256=ZK-5lOtd9-eD9aWcXssJpnOyvZuO7_5hAmnb-6sWVe8,569
|
4
|
-
vision_agent/agent/vision_agent.py,sha256=
|
4
|
+
vision_agent/agent/vision_agent.py,sha256=fLCkqYJzk9SNtu8TzKBk0TLZrXDMTCqgI3FI-zkc-qs,28768
|
5
5
|
vision_agent/agent/vision_agent_prompts.py,sha256=brBV-SmzyzTG5M9nfV3R5xdYT_BUYOKzxNFmTa2Sp-o,11049
|
6
6
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
8
|
-
vision_agent/lmm/__init__.py,sha256=
|
9
|
-
vision_agent/lmm/lmm.py,sha256=
|
8
|
+
vision_agent/lmm/__init__.py,sha256=j9mQsIXQOYfW6nFd47uTwuBe1ranpEbwW308qLfCWN0,85
|
9
|
+
vision_agent/lmm/lmm.py,sha256=035uONyp6_jD3PVdNdSg2PMHOG1voqnpsn2IyybUENs,15147
|
10
10
|
vision_agent/tools/__init__.py,sha256=k69hvcy2FWjDqVA0klzybKeoToOH_bom5NTVSliA0Og,1838
|
11
11
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
12
12
|
vision_agent/tools/tool_utils.py,sha256=6z0jrvUnesJEFqDHZoAvbXPic8rzh0KfILL07tu0uRo,2205
|
@@ -18,7 +18,7 @@ vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOk
|
|
18
18
|
vision_agent/utils/sim.py,sha256=1HTaiVaBiKeyXIy21IYGXlPw0TipOyw9FPOJDfyLI94,4409
|
19
19
|
vision_agent/utils/type_defs.py,sha256=QeQRRIlklZMWzxROcCn5ELxP89nYdXGydy1rAiSpZZw,1384
|
20
20
|
vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
|
21
|
-
vision_agent-0.2.
|
22
|
-
vision_agent-0.2.
|
23
|
-
vision_agent-0.2.
|
24
|
-
vision_agent-0.2.
|
21
|
+
vision_agent-0.2.84.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
22
|
+
vision_agent-0.2.84.dist-info/METADATA,sha256=mZM17x03oCnI8tp4g7psZzonwNlS0fqN0f78dWbob-o,9477
|
23
|
+
vision_agent-0.2.84.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
24
|
+
vision_agent-0.2.84.dist-info/RECORD,,
|
File without changes
|
File without changes
|