vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +98 -2
- vision_agent/agent/vision_agent.py +54 -22
- vision_agent/agent/vision_agent_coder.py +222 -512
- vision_agent/agent/vision_agent_coder_prompts.py +12 -221
- vision_agent/agent/vision_agent_planner.py +583 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +107 -35
- vision_agent/tools/tools.py +2 -2
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA +8 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD +14 -12
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL +0 -0
@@ -0,0 +1,199 @@
|
|
1
|
+
USER_REQ = """
|
2
|
+
## User Request
|
3
|
+
{user_request}
|
4
|
+
"""
|
5
|
+
|
6
|
+
PLAN = """
|
7
|
+
**Context**:
|
8
|
+
{context}
|
9
|
+
|
10
|
+
**Tools Available**:
|
11
|
+
{tool_desc}
|
12
|
+
|
13
|
+
**Previous Feedback**:
|
14
|
+
{feedback}
|
15
|
+
|
16
|
+
**Instructions**:
|
17
|
+
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
18
|
+
2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
|
19
|
+
3. Output three different plans, each utilizing a different strategy or set of tools, ordering them from most likely to least likely to succeed.
|
20
|
+
|
21
|
+
Output a list of jsons in the following format:
|
22
|
+
|
23
|
+
```json
|
24
|
+
{{
|
25
|
+
"plan1":
|
26
|
+
{{
|
27
|
+
"thoughts": str # your thought process for choosing this plan
|
28
|
+
"instructions": [
|
29
|
+
str # what you should do in this task associated with a tool
|
30
|
+
]
|
31
|
+
}},
|
32
|
+
"plan2": ...,
|
33
|
+
"plan3": ...
|
34
|
+
}}
|
35
|
+
```
|
36
|
+
"""
|
37
|
+
|
38
|
+
TEST_PLANS = """
|
39
|
+
**Role**: You are a software programmer responsible for testing different tools.
|
40
|
+
|
41
|
+
**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
|
42
|
+
|
43
|
+
**Documentation**:
|
44
|
+
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
|
45
|
+
|
46
|
+
{docstring}
|
47
|
+
|
48
|
+
**Plans**:
|
49
|
+
{plans}
|
50
|
+
|
51
|
+
**Previous Attempts**:
|
52
|
+
{previous_attempts}
|
53
|
+
|
54
|
+
**Examples**:
|
55
|
+
--- EXAMPLE1 ---
|
56
|
+
plan1:
|
57
|
+
- Load the image from the provided file path 'image.jpg'.
|
58
|
+
- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
59
|
+
plan2:
|
60
|
+
- Load the image from the provided file path 'image.jpg'.
|
61
|
+
- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
|
62
|
+
- Count the number of detected objects labeled as 'person'.
|
63
|
+
plan3:
|
64
|
+
- Load the image from the provided file path 'image.jpg'.
|
65
|
+
- Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
|
66
|
+
|
67
|
+
```python
|
68
|
+
from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
|
69
|
+
image = load_image("image.jpg")
|
70
|
+
owl_v2_out = owl_v2_image("person", image)
|
71
|
+
|
72
|
+
f2s2_out = florence2_sam2_image("person", image)
|
73
|
+
# strip out the masks from the output because they don't provide useful information when printed
|
74
|
+
f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
|
75
|
+
|
76
|
+
cgd_out = countgd_counting(image)
|
77
|
+
|
78
|
+
final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
|
79
|
+
print(final_out)
|
80
|
+
--- END EXAMPLE1 ---
|
81
|
+
|
82
|
+
--- EXAMPLE2 ---
|
83
|
+
plan1:
|
84
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
85
|
+
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
|
86
|
+
plan2:
|
87
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
88
|
+
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
89
|
+
plan3:
|
90
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
91
|
+
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
|
92
|
+
|
93
|
+
|
94
|
+
```python
|
95
|
+
import numpy as np
|
96
|
+
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
|
97
|
+
|
98
|
+
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
99
|
+
frames = extract_frames_and_timestamps("video.mp4", 1)
|
100
|
+
frames = [f["frame"] for f in frames][:10]
|
101
|
+
|
102
|
+
# strip arrays from the output to make it easier to read
|
103
|
+
def remove_arrays(o):
|
104
|
+
if isinstance(o, list):
|
105
|
+
return [remove_arrays(e) for e in o]
|
106
|
+
elif isinstance(o, dict):
|
107
|
+
return {{k: remove_arrays(v) for k, v in o.items()}}
|
108
|
+
elif isinstance(o, np.ndarray):
|
109
|
+
return "array: " + str(o.shape)
|
110
|
+
else:
|
111
|
+
return o
|
112
|
+
|
113
|
+
# return the counts of each label per frame to help determine the stability of the model results
|
114
|
+
def get_counts(preds):
|
115
|
+
counts = {{}}
|
116
|
+
for i, pred_frame in enumerate(preds):
|
117
|
+
counts_i = {{}}
|
118
|
+
for pred in pred_frame:
|
119
|
+
label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
|
120
|
+
counts_i[label] = counts_i.get(label, 0) + 1
|
121
|
+
counts[f"frame_{{i}}"] = counts_i
|
122
|
+
return counts
|
123
|
+
|
124
|
+
|
125
|
+
# plan1
|
126
|
+
owl_v2_out = owl_v2_video("person", frames)
|
127
|
+
owl_v2_counts = get_counts(owl_v2_out)
|
128
|
+
|
129
|
+
# plan2
|
130
|
+
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
131
|
+
florence2_counts = get_counts(florence2_out)
|
132
|
+
|
133
|
+
# plan3
|
134
|
+
f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
|
135
|
+
remove_arrays(f2s2_tracking_out)
|
136
|
+
f2s2_counts = get_counts(f2s2_tracking_out)
|
137
|
+
|
138
|
+
final_out = {{
|
139
|
+
"owl_v2_video": owl_v2_out,
|
140
|
+
"florence2_phrase_grounding": florence2_out,
|
141
|
+
"florence2_sam2_video_tracking": f2s2_tracking_out,
|
142
|
+
}}
|
143
|
+
|
144
|
+
counts = {{
|
145
|
+
"owl_v2_video": owl_v2_counts,
|
146
|
+
"florence2_phrase_grounding": florence2_counts,
|
147
|
+
"florence2_sam2_video_tracking": f2s2_counts,
|
148
|
+
}}
|
149
|
+
|
150
|
+
print(final_out)
|
151
|
+
print(labels_and_scores)
|
152
|
+
print(counts)
|
153
|
+
```
|
154
|
+
--- END EXAMPLE2 ---
|
155
|
+
|
156
|
+
**Instructions**:
|
157
|
+
1. Write a program to load the media and call each tool and print its output along with other relevant information.
|
158
|
+
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
159
|
+
3. Your test case MUST run only on the given images which are {media}
|
160
|
+
4. Print this final dictionary.
|
161
|
+
5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
|
162
|
+
"""
|
163
|
+
|
164
|
+
PREVIOUS_FAILED = """
|
165
|
+
**Previous Failed Attempts**:
|
166
|
+
You previously ran this code:
|
167
|
+
```python
|
168
|
+
{code}
|
169
|
+
```
|
170
|
+
|
171
|
+
But got the following error or no stdout:
|
172
|
+
{error}
|
173
|
+
"""
|
174
|
+
|
175
|
+
PICK_PLAN = """
|
176
|
+
**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
|
177
|
+
|
178
|
+
**Task**: Your responsibility is to pick the best plan from the three plans provided.
|
179
|
+
|
180
|
+
**Context**:
|
181
|
+
{context}
|
182
|
+
|
183
|
+
**Plans**:
|
184
|
+
{plans}
|
185
|
+
|
186
|
+
**Tool Output**:
|
187
|
+
{tool_output}
|
188
|
+
|
189
|
+
**Instructions**:
|
190
|
+
1. Re-read the user request, plans, tool outputs and examine the image.
|
191
|
+
2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
|
192
|
+
3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
|
193
|
+
4. Output a JSON object with the following format:
|
194
|
+
{{
|
195
|
+
"predicted_answer": str # the answer you would expect from the best plan
|
196
|
+
"thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
|
197
|
+
"best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
|
198
|
+
}}
|
199
|
+
"""
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -11,9 +11,12 @@ from typing import Any, Dict, List, Optional, Union
|
|
11
11
|
|
12
12
|
import numpy as np
|
13
13
|
from IPython.display import display
|
14
|
+
from redbaron import RedBaron # type: ignore
|
14
15
|
|
15
16
|
import vision_agent as va
|
17
|
+
from vision_agent.agent.agent_utils import extract_json
|
16
18
|
from vision_agent.clients.landing_public_api import LandingPublicAPI
|
19
|
+
from vision_agent.lmm import AnthropicLMM
|
17
20
|
from vision_agent.lmm.types import Message
|
18
21
|
from vision_agent.tools.tool_utils import get_tool_documentation
|
19
22
|
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
|
@@ -22,8 +25,6 @@ from vision_agent.utils.execute import Execution, MimeType
|
|
22
25
|
from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
|
23
26
|
from vision_agent.utils.video import frames_to_bytes
|
24
27
|
|
25
|
-
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
26
|
-
|
27
28
|
CURRENT_FILE = None
|
28
29
|
CURRENT_LINE = 0
|
29
30
|
DEFAULT_WINDOW_SIZE = 100
|
@@ -152,6 +153,9 @@ class Artifacts:
|
|
152
153
|
return name in self.artifacts
|
153
154
|
|
154
155
|
|
156
|
+
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
|
157
|
+
|
158
|
+
|
155
159
|
def format_lines(lines: List[str], start_idx: int) -> str:
|
156
160
|
output = ""
|
157
161
|
for i, line in enumerate(lines):
|
@@ -338,6 +342,85 @@ def edit_code_artifact(
|
|
338
342
|
return open_code_artifact(artifacts, name, cur_line)
|
339
343
|
|
340
344
|
|
345
|
+
def generate_vision_plan(
|
346
|
+
artifacts: Artifacts,
|
347
|
+
name: str,
|
348
|
+
chat: str,
|
349
|
+
media: List[str],
|
350
|
+
test_multi_plan: bool = True,
|
351
|
+
custom_tool_names: Optional[List[str]] = None,
|
352
|
+
) -> str:
|
353
|
+
"""Generates a plan to solve vision based tasks.
|
354
|
+
|
355
|
+
Parameters:
|
356
|
+
artifacts (Artifacts): The artifacts object to save the plan to.
|
357
|
+
name (str): The name of the artifact to save the plan context to.
|
358
|
+
chat (str): The chat message from the user.
|
359
|
+
media (List[str]): The media files to use.
|
360
|
+
test_multi_plan (bool): Do not change this parameter.
|
361
|
+
custom_tool_names (Optional[List[str]]): Do not change this parameter.
|
362
|
+
|
363
|
+
Returns:
|
364
|
+
str: The generated plan.
|
365
|
+
|
366
|
+
Examples
|
367
|
+
--------
|
368
|
+
>>> generate_vision_plan(artifacts, "plan.json", "Can you detect the dogs in this image?", ["image.jpg"])
|
369
|
+
[Start Plan Context]
|
370
|
+
plan1: This is a plan to detect dogs in an image
|
371
|
+
-load image
|
372
|
+
-detect dogs
|
373
|
+
-return detections
|
374
|
+
[End Plan Context]
|
375
|
+
"""
|
376
|
+
|
377
|
+
if ZMQ_PORT is not None:
|
378
|
+
agent = va.agent.VisionAgentPlanner(
|
379
|
+
report_progress_callback=lambda inp: report_progress_callback(
|
380
|
+
int(ZMQ_PORT), inp
|
381
|
+
)
|
382
|
+
)
|
383
|
+
else:
|
384
|
+
agent = va.agent.VisionAgentPlanner()
|
385
|
+
|
386
|
+
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
387
|
+
response = agent.generate_plan(
|
388
|
+
fixed_chat,
|
389
|
+
test_multi_plan=test_multi_plan,
|
390
|
+
custom_tool_names=custom_tool_names,
|
391
|
+
)
|
392
|
+
if response.test_results is not None:
|
393
|
+
redisplay_results(response.test_results)
|
394
|
+
response.test_results = None
|
395
|
+
artifacts[name] = response.model_dump_json()
|
396
|
+
media_names = extract_json(
|
397
|
+
AnthropicLMM()( # type: ignore
|
398
|
+
f"""Extract any media file names from this output in the following JSON format:
|
399
|
+
{{"media": ["image1.jpg", "image2.jpg"]}}
|
400
|
+
|
401
|
+
{artifacts[name]}"""
|
402
|
+
)
|
403
|
+
)
|
404
|
+
if "media" in media_names and isinstance(media_names, dict):
|
405
|
+
for media in media_names["media"]:
|
406
|
+
if isinstance(media, str):
|
407
|
+
with open(media, "rb") as f:
|
408
|
+
artifacts[media] = f.read()
|
409
|
+
|
410
|
+
output_str = f"[Start Plan Context, saved at {name}]"
|
411
|
+
for plan in response.plans.keys():
|
412
|
+
output_str += f"\n{plan}: {response.plans[plan]['thoughts'].strip()}\n" # type: ignore
|
413
|
+
output_str += " -" + "\n -".join(
|
414
|
+
e.strip() for e in response.plans[plan]["instructions"]
|
415
|
+
)
|
416
|
+
|
417
|
+
output_str += f"\nbest plan: {response.best_plan}\n"
|
418
|
+
output_str += "thoughts: " + response.plan_thoughts.strip() + "\n"
|
419
|
+
output_str += "[End Plan Context]"
|
420
|
+
print(output_str)
|
421
|
+
return output_str
|
422
|
+
|
423
|
+
|
341
424
|
def generate_vision_code(
|
342
425
|
artifacts: Artifacts,
|
343
426
|
name: str,
|
@@ -368,7 +451,6 @@ def generate_vision_code(
|
|
368
451
|
dogs = owl_v2("dog", image)
|
369
452
|
return dogs
|
370
453
|
"""
|
371
|
-
|
372
454
|
if ZMQ_PORT is not None:
|
373
455
|
agent = va.agent.VisionAgentCoder(
|
374
456
|
report_progress_callback=lambda inp: report_progress_callback(
|
@@ -379,7 +461,7 @@ def generate_vision_code(
|
|
379
461
|
agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
|
380
462
|
|
381
463
|
fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
|
382
|
-
response = agent.
|
464
|
+
response = agent.generate_code(
|
383
465
|
fixed_chat,
|
384
466
|
test_multi_plan=test_multi_plan,
|
385
467
|
custom_tool_names=custom_tool_names,
|
@@ -411,7 +493,7 @@ def edit_vision_code(
|
|
411
493
|
name: str,
|
412
494
|
chat_history: List[str],
|
413
495
|
media: List[str],
|
414
|
-
|
496
|
+
custom_tool_names: Optional[List[str]] = None,
|
415
497
|
) -> str:
|
416
498
|
"""Edits python code to solve a vision based task.
|
417
499
|
|
@@ -419,7 +501,7 @@ def edit_vision_code(
|
|
419
501
|
artifacts (Artifacts): The artifacts object to save the code to.
|
420
502
|
name (str): The file path to the code.
|
421
503
|
chat_history (List[str]): The chat history to used to generate the code.
|
422
|
-
|
504
|
+
custom_tool_names (Optional[List[str]]): Do not change this parameter.
|
423
505
|
|
424
506
|
Returns:
|
425
507
|
str: The edited code.
|
@@ -459,10 +541,10 @@ def edit_vision_code(
|
|
459
541
|
fixed_chat_history.append({"role": "assistant", "content": code})
|
460
542
|
fixed_chat_history.append({"role": "user", "content": chat})
|
461
543
|
|
462
|
-
response = agent.
|
544
|
+
response = agent.generate_code(
|
463
545
|
fixed_chat_history,
|
464
546
|
test_multi_plan=False,
|
465
|
-
custom_tool_names=
|
547
|
+
custom_tool_names=custom_tool_names,
|
466
548
|
)
|
467
549
|
redisplay_results(response["test_result"])
|
468
550
|
code = response["code"]
|
@@ -625,7 +707,7 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
|
|
625
707
|
def use_extra_vision_agent_args(
|
626
708
|
code: str,
|
627
709
|
test_multi_plan: bool = True,
|
628
|
-
|
710
|
+
custom_tool_names: Optional[List[str]] = None,
|
629
711
|
) -> str:
|
630
712
|
"""This is for forcing arguments passed by the user to VisionAgent into the
|
631
713
|
VisionAgentCoder call.
|
@@ -633,36 +715,25 @@ def use_extra_vision_agent_args(
|
|
633
715
|
Parameters:
|
634
716
|
code (str): The code to edit.
|
635
717
|
test_multi_plan (bool): Do not change this parameter.
|
636
|
-
|
718
|
+
custom_tool_names (Optional[List[str]]): Do not change this parameter.
|
637
719
|
|
638
720
|
Returns:
|
639
721
|
str: The edited code.
|
640
722
|
"""
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
arg = match.group(1)
|
656
|
-
out_str = f"edit_vision_code({arg}"
|
657
|
-
if customized_tool_names is not None:
|
658
|
-
out_str += f", custom_tool_names={customized_tool_names})"
|
659
|
-
else:
|
660
|
-
out_str += ")"
|
661
|
-
return out_str
|
662
|
-
|
663
|
-
new_code = re.sub(generate_pattern, generate_replacer, code)
|
664
|
-
new_code = re.sub(edit_pattern, edit_replacer, new_code)
|
665
|
-
return new_code
|
723
|
+
red = RedBaron(code)
|
724
|
+
for node in red:
|
725
|
+
# seems to always be atomtrailers not call type
|
726
|
+
if node.type == "atomtrailers":
|
727
|
+
if (
|
728
|
+
node.name.value == "generate_vision_code"
|
729
|
+
or node.name.value == "edit_vision_code"
|
730
|
+
):
|
731
|
+
node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
|
732
|
+
|
733
|
+
if custom_tool_names is not None:
|
734
|
+
node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
|
735
|
+
cleaned_code = red.dumps().strip()
|
736
|
+
return cleaned_code if isinstance(cleaned_code, str) else code
|
666
737
|
|
667
738
|
|
668
739
|
def use_object_detection_fine_tuning(
|
@@ -748,6 +819,7 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
748
819
|
open_code_artifact,
|
749
820
|
create_code_artifact,
|
750
821
|
edit_code_artifact,
|
822
|
+
generate_vision_plan,
|
751
823
|
generate_vision_code,
|
752
824
|
edit_vision_code,
|
753
825
|
write_media_artifact,
|
vision_agent/tools/tools.py
CHANGED
@@ -1923,7 +1923,7 @@ def overlay_bounding_boxes(
|
|
1923
1923
|
bboxes = bbox_int[i]
|
1924
1924
|
bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
|
1925
1925
|
|
1926
|
-
if len(bboxes) >
|
1926
|
+
if len(bboxes) > 40:
|
1927
1927
|
pil_image = _plot_counting(pil_image, bboxes, color)
|
1928
1928
|
else:
|
1929
1929
|
width, height = pil_image.size
|
@@ -2117,7 +2117,7 @@ def _plot_counting(
|
|
2117
2117
|
colors: Dict[str, Tuple[int, int, int]],
|
2118
2118
|
) -> Image.Image:
|
2119
2119
|
width, height = image.size
|
2120
|
-
fontsize = max(
|
2120
|
+
fontsize = max(12, int(min(width, height) / 40))
|
2121
2121
|
draw = ImageDraw.Draw(image)
|
2122
2122
|
font = ImageFont.truetype(
|
2123
2123
|
str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.
|
3
|
+
Version: 0.2.163
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -27,6 +27,7 @@ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
|
|
27
27
|
Requires-Dist: pydantic (==2.7.4)
|
28
28
|
Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
|
29
29
|
Requires-Dist: pytube (==15.0.0)
|
30
|
+
Requires-Dist: redbaron (>=0.9.2,<0.10.0)
|
30
31
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
31
32
|
Requires-Dist: rich (>=13.7.1,<14.0.0)
|
32
33
|
Requires-Dist: scipy (>=1.13.0,<1.14.0)
|
@@ -142,7 +143,7 @@ continuing, for example it may want to execute code and look at the output befor
|
|
142
143
|
letting the user respond.
|
143
144
|
|
144
145
|
### Chatting and Artifacts
|
145
|
-
If you run `
|
146
|
+
If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
|
146
147
|
are a way to sync files between local and remote environments. The agent will read and
|
147
148
|
write to the artifact object, which is just a pickle object, when it wants to save or
|
148
149
|
load files.
|
@@ -159,7 +160,7 @@ with open("image.png", "rb") as f:
|
|
159
160
|
artifacts["image.png"] = f.read()
|
160
161
|
|
161
162
|
agent = va.agent.VisionAgent()
|
162
|
-
response, artifacts = agent.
|
163
|
+
response, artifacts = agent.chat_with_artifacts(
|
163
164
|
[
|
164
165
|
{
|
165
166
|
"role": "user",
|
@@ -339,11 +340,11 @@ mode by passing in the verbose argument:
|
|
339
340
|
```
|
340
341
|
|
341
342
|
### Detailed Usage
|
342
|
-
You can also have it return more information by calling `
|
343
|
+
You can also have it return more information by calling `generate_code`. The format
|
343
344
|
of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
|
344
345
|
|
345
346
|
```python
|
346
|
-
>>> results = agent.
|
347
|
+
>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
|
347
348
|
>>> print(results)
|
348
349
|
{
|
349
350
|
"code": "from vision_agent.tools import ..."
|
@@ -372,7 +373,7 @@ conv = [
|
|
372
373
|
"media": ["workers.png"],
|
373
374
|
}
|
374
375
|
]
|
375
|
-
result = agent.
|
376
|
+
result = agent.generate_code(conv)
|
376
377
|
code = result["code"]
|
377
378
|
conv.append({"role": "assistant", "content": code})
|
378
379
|
conv.append(
|
@@ -381,7 +382,7 @@ conv.append(
|
|
381
382
|
"content": "Can you also return the number of workers wearing safety gear?",
|
382
383
|
}
|
383
384
|
)
|
384
|
-
result = agent.
|
385
|
+
result = agent.generate_code(conv)
|
385
386
|
```
|
386
387
|
|
387
388
|
|
@@ -1,10 +1,12 @@
|
|
1
1
|
vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
|
2
|
-
vision_agent/agent/__init__.py,sha256=
|
2
|
+
vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
|
3
3
|
vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
|
4
|
-
vision_agent/agent/agent_utils.py,sha256=
|
5
|
-
vision_agent/agent/vision_agent.py,sha256=
|
6
|
-
vision_agent/agent/vision_agent_coder.py,sha256=
|
7
|
-
vision_agent/agent/vision_agent_coder_prompts.py,sha256=
|
4
|
+
vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
|
5
|
+
vision_agent/agent/vision_agent.py,sha256=MUigVufYML2sYn9Hsngswa77XxlZBgCwQyBfK8tlsio,22551
|
6
|
+
vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
|
7
|
+
vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
|
8
|
+
vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
|
9
|
+
vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
|
8
10
|
vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
|
9
11
|
vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
12
|
vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
|
@@ -14,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
|
|
14
16
|
vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
|
15
17
|
vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
|
16
18
|
vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
|
17
|
-
vision_agent/tools/__init__.py,sha256=
|
18
|
-
vision_agent/tools/meta_tools.py,sha256=
|
19
|
+
vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
|
20
|
+
vision_agent/tools/meta_tools.py,sha256=MULJrZiTODOAN20TGceLdXcwoSGMNaE7bQbywySITnA,28458
|
19
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
20
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=uWyR4pebTezXx9IWCKX4SL5sB9u_7LdRP0-KWU52zsU,78524
|
22
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
23
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
24
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -27,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
|
|
27
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
28
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
29
31
|
vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
|
30
|
-
vision_agent-0.2.
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
32
|
+
vision_agent-0.2.163.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.163.dist-info/METADATA,sha256=grneiMhM3Lwzi9ex9JL8A0R5cmpqyOHaaTcLGRfLwWs,17785
|
34
|
+
vision_agent-0.2.163.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.163.dist-info/RECORD,,
|
File without changes
|
File without changes
|