vision-agent 0.2.161__py3-none-any.whl → 0.2.163__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +8 -0
- vision_agent/agent/agent_utils.py +98 -2
- vision_agent/agent/vision_agent.py +54 -22
- vision_agent/agent/vision_agent_coder.py +222 -512
- vision_agent/agent/vision_agent_coder_prompts.py +12 -221
- vision_agent/agent/vision_agent_planner.py +583 -0
- vision_agent/agent/vision_agent_planner_prompts.py +199 -0
- vision_agent/tools/__init__.py +0 -1
- vision_agent/tools/meta_tools.py +107 -35
- vision_agent/tools/tools.py +2 -2
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA +8 -7
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD +14 -12
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_planner_prompts.py ADDED
````diff
@@ -0,0 +1,199 @@
+USER_REQ = """
+## User Request
+{user_request}
+"""
+
+PLAN = """
+**Context**:
+{context}
+
+**Tools Available**:
+{tool_desc}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
+2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
+3. Output three different plans, each utilizing a different strategy or set of tools, ordering them from most likely to least likely to succeed.
+
+Output a list of JSON objects in the following format:
+
+```json
+{{
+    "plan1":
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
+    "plan2": ...,
+    "plan3": ...
+}}
+```
+"""
+
+TEST_PLANS = """
+**Role**: You are a software programmer responsible for testing different tools.
+
+**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
+**Plans**:
+{plans}
+
+**Previous Attempts**:
+{previous_attempts}
+
+**Examples**:
+--- EXAMPLE1 ---
+plan1:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+plan2:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+- Count the number of detected objects labeled as 'person'.
+plan3:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+
+```python
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+image = load_image("image.jpg")
+owl_v2_out = owl_v2_image("person", image)
+
+f2s2_out = florence2_sam2_image("person", image)
+# strip out the masks from the output because they don't provide useful information when printed
+f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
+
+cgd_out = countgd_counting(image)
+
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
+print(final_out)
+```
+--- END EXAMPLE1 ---
+
+--- EXAMPLE2 ---
+plan1:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
+plan2:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+plan3:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
+
+```python
+import numpy as np
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+
+# sample at 1 FPS and use the first 10 frames to reduce processing time
+frames = extract_frames_and_timestamps("video.mp4", 1)
+frames = [f["frame"] for f in frames][:10]
+
+# strip arrays from the output to make it easier to read
+def remove_arrays(o):
+    if isinstance(o, list):
+        return [remove_arrays(e) for e in o]
+    elif isinstance(o, dict):
+        return {{k: remove_arrays(v) for k, v in o.items()}}
+    elif isinstance(o, np.ndarray):
+        return "array: " + str(o.shape)
+    else:
+        return o
+
+# return the counts of each label per frame to help determine the stability of the model results
+def get_counts(preds):
+    counts = {{}}
+    for i, pred_frame in enumerate(preds):
+        counts_i = {{}}
+        for pred in pred_frame:
+            label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
+            counts_i[label] = counts_i.get(label, 0) + 1
+        counts[f"frame_{{i}}"] = counts_i
+    return counts
+
+
+# plan1
+owl_v2_out = owl_v2_video("person", frames)
+owl_v2_counts = get_counts(owl_v2_out)
+
+# plan2
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+florence2_counts = get_counts(florence2_out)
+
+# plan3
+f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
+remove_arrays(f2s2_tracking_out)
+f2s2_counts = get_counts(f2s2_tracking_out)
+
+final_out = {{
+    "owl_v2_video": owl_v2_out,
+    "florence2_phrase_grounding": florence2_out,
+    "florence2_sam2_video_tracking": f2s2_tracking_out,
+}}
+
+counts = {{
+    "owl_v2_video": owl_v2_counts,
+    "florence2_phrase_grounding": florence2_counts,
+    "florence2_sam2_video_tracking": f2s2_counts,
+}}
+
+print(final_out)
+print(counts)
+```
+--- END EXAMPLE2 ---
+
+**Instructions**:
+1. Write a program to load the media and call each tool and print its output along with other relevant information.
+2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
+3. Your test case MUST run only on the given images, which are {media}
+4. Print this final dictionary.
+5. For video input, sample at 1 FPS and use only the first 10 frames to reduce processing time.
+"""
+
+PREVIOUS_FAILED = """
+**Previous Failed Attempts**:
+You previously ran this code:
+```python
+{code}
+```
+
+But got the following error or no stdout:
+{error}
+"""
+
+PICK_PLAN = """
+**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
+
+**Task**: Your responsibility is to pick the best plan from the three plans provided.
+
+**Context**:
+{context}
+
+**Plans**:
+{plans}
+
+**Tool Output**:
+{tool_output}
+
+**Instructions**:
+1. Re-read the user request, plans, tool outputs and examine the image.
+2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
+3. Add modifications to improve the plan, including: changing a tool, adding thresholds, string matching.
+4. Output a JSON object with the following format:
+{{
+    "predicted_answer": str # the answer you would expect from the best plan
+    "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
+    "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
+}}
+"""
````
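A note on the templates above: the doubled braces are `str.format` escapes, so the JSON skeletons reach the model as literal braces while single-brace placeholders such as `{context}` are substituted. A minimal sketch of that mechanic (the template text and fill value here are invented for illustration):

```python
# Minimal sketch of the brace escaping used by the templates above; the
# template text and fill value are invented for illustration.
TEMPLATE = '**Context**:\n{context}\n\nOutput:\n{{"plan1": {{"thoughts": "..."}}}}'

filled = TEMPLATE.format(context="Count the dogs in image.jpg")
# Doubled braces survive formatting as literal JSON braces:
assert filled.endswith('{"plan1": {"thoughts": "..."}}')
```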
vision_agent/tools/__init__.py CHANGED
vision_agent/tools/meta_tools.py CHANGED
````diff
@@ -11,9 +11,12 @@ from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 from IPython.display import display
+from redbaron import RedBaron  # type: ignore
 
 import vision_agent as va
+from vision_agent.agent.agent_utils import extract_json
 from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.lmm import AnthropicLMM
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -22,8 +25,6 @@ from vision_agent.utils.execute import Execution, MimeType
 from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
 from vision_agent.utils.video import frames_to_bytes
 
-# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
-
 CURRENT_FILE = None
 CURRENT_LINE = 0
 DEFAULT_WINDOW_SIZE = 100
@@ -152,6 +153,9 @@ class Artifacts:
         return name in self.artifacts
 
 
+# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
+
+
 def format_lines(lines: List[str], start_idx: int) -> str:
     output = ""
     for i, line in enumerate(lines):
````
````diff
@@ -338,6 +342,85 @@ def edit_code_artifact(
     return open_code_artifact(artifacts, name, cur_line)
 
 
+def generate_vision_plan(
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    custom_tool_names: Optional[List[str]] = None,
+) -> str:
+    """Generates a plan to solve vision based tasks.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the plan to.
+        name (str): The name of the artifact to save the plan context to.
+        chat (str): The chat message from the user.
+        media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        custom_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The generated plan.
+
+    Examples
+    --------
+        >>> generate_vision_plan(artifacts, "plan.json", "Can you detect the dogs in this image?", ["image.jpg"])
+        [Start Plan Context]
+        plan1: This is a plan to detect dogs in an image
+            -load image
+            -detect dogs
+            -return detections
+        [End Plan Context]
+    """
+
+    if ZMQ_PORT is not None:
+        agent = va.agent.VisionAgentPlanner(
+            report_progress_callback=lambda inp: report_progress_callback(
+                int(ZMQ_PORT), inp
+            )
+        )
+    else:
+        agent = va.agent.VisionAgentPlanner()
+
+    fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
+    response = agent.generate_plan(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        custom_tool_names=custom_tool_names,
+    )
+    if response.test_results is not None:
+        redisplay_results(response.test_results)
+    response.test_results = None
+    artifacts[name] = response.model_dump_json()
+    media_names = extract_json(
+        AnthropicLMM()(  # type: ignore
+            f"""Extract any media file names from this output in the following JSON format:
+{{"media": ["image1.jpg", "image2.jpg"]}}
+
+{artifacts[name]}"""
+        )
+    )
+    if "media" in media_names and isinstance(media_names, dict):
+        for media in media_names["media"]:
+            if isinstance(media, str):
+                with open(media, "rb") as f:
+                    artifacts[media] = f.read()
+
+    output_str = f"[Start Plan Context, saved at {name}]"
+    for plan in response.plans.keys():
+        output_str += f"\n{plan}: {response.plans[plan]['thoughts'].strip()}\n"  # type: ignore
+        output_str += " -" + "\n -".join(
+            e.strip() for e in response.plans[plan]["instructions"]
+        )
+
+    output_str += f"\nbest plan: {response.best_plan}\n"
+    output_str += "thoughts: " + response.plan_thoughts.strip() + "\n"
+    output_str += "[End Plan Context]"
+    print(output_str)
+    return output_str
+
+
 def generate_vision_code(
     artifacts: Artifacts,
     name: str,
````
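For orientation, this is roughly how the new meta tool is driven, following its own docstring; the `Artifacts` constructor argument and the file names below are assumptions, and the call requires configured model API keys:

```python
# Illustrative sketch based on the docstring above; the Artifacts constructor
# argument and the file names are assumptions, and model API keys must be set.
from vision_agent.tools.meta_tools import Artifacts, generate_vision_plan

artifacts = Artifacts("artifacts.pkl")
plan_context = generate_vision_plan(
    artifacts,
    "plan.json",
    "Can you detect the dogs in this image?",
    ["image.jpg"],
)
# Prints a "[Start Plan Context] ... [End Plan Context]" summary and stores the
# full plan JSON under artifacts["plan.json"].
```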
````diff
@@ -368,7 +451,6 @@ def generate_vision_code(
         dogs = owl_v2("dog", image)
         return dogs
     """
-
     if ZMQ_PORT is not None:
         agent = va.agent.VisionAgentCoder(
             report_progress_callback=lambda inp: report_progress_callback(
@@ -379,7 +461,7 @@ def generate_vision_code(
         agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.
+    response = agent.generate_code(
         fixed_chat,
         test_multi_plan=test_multi_plan,
         custom_tool_names=custom_tool_names,
@@ -411,7 +493,7 @@ def edit_vision_code(
     name: str,
     chat_history: List[str],
     media: List[str],
-
+    custom_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.
 
@@ -419,7 +501,7 @@ def edit_vision_code(
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history used to generate the code.
-
+        custom_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The edited code.
@@ -459,10 +541,10 @@ def edit_vision_code(
     fixed_chat_history.append({"role": "assistant", "content": code})
     fixed_chat_history.append({"role": "user", "content": chat})
 
-    response = agent.
+    response = agent.generate_code(
         fixed_chat_history,
         test_multi_plan=False,
-        custom_tool_names=
+        custom_tool_names=custom_tool_names,
     )
     redisplay_results(response["test_result"])
     code = response["code"]
````
````diff
@@ -625,7 +707,7 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
 def use_extra_vision_agent_args(
     code: str,
     test_multi_plan: bool = True,
-
+    custom_tool_names: Optional[List[str]] = None,
 ) -> str:
     """This is for forcing arguments passed by the user to VisionAgent into the
     VisionAgentCoder call.
@@ -633,36 +715,25 @@ def use_extra_vision_agent_args(
     Parameters:
         code (str): The code to edit.
         test_multi_plan (bool): Do not change this parameter.
-
+        custom_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The edited code.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        arg = match.group(1)
-        out_str = f"edit_vision_code({arg}"
-        if customized_tool_names is not None:
-            out_str += f", custom_tool_names={customized_tool_names})"
-        else:
-            out_str += ")"
-        return out_str
-
-    new_code = re.sub(generate_pattern, generate_replacer, code)
-    new_code = re.sub(edit_pattern, edit_replacer, new_code)
-    return new_code
+    red = RedBaron(code)
+    for node in red:
+        # seems to always be atomtrailers not call type
+        if node.type == "atomtrailers":
+            if (
+                node.name.value == "generate_vision_code"
+                or node.name.value == "edit_vision_code"
+            ):
+                node.value[1].value.append(f"test_multi_plan={test_multi_plan}")
+
+                if custom_tool_names is not None:
+                    node.value[1].value.append(f"custom_tool_names={custom_tool_names}")
+    cleaned_code = red.dumps().strip()
+    return cleaned_code if isinstance(cleaned_code, str) else code
 
 
 def use_object_detection_fine_tuning(
````
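The old regex-based rewriting (most of its removed lines are elided by the diff viewer above) is replaced by a RedBaron pass that parses the code and appends keyword arguments to any `generate_vision_code` or `edit_vision_code` call. A sketch of the intended before/after, as I read the hunk; the input snippet is invented:

```python
# Sketch of the intended effect of the RedBaron rewrite above; the input
# snippet is invented for illustration.
from vision_agent.tools.meta_tools import use_extra_vision_agent_args

code = 'generate_vision_code(artifacts, "dog.py", "detect dogs", ["image.jpg"])'
print(use_extra_vision_agent_args(code, test_multi_plan=False))
# generate_vision_code(artifacts, "dog.py", "detect dogs", ["image.jpg"], test_multi_plan=False)
```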
````diff
@@ -748,6 +819,7 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         open_code_artifact,
         create_code_artifact,
         edit_code_artifact,
+        generate_vision_plan,
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
````
vision_agent/tools/tools.py CHANGED
````diff
@@ -1923,7 +1923,7 @@ def overlay_bounding_boxes(
         bboxes = bbox_int[i]
         bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        if len(bboxes) >
+        if len(bboxes) > 40:
             pil_image = _plot_counting(pil_image, bboxes, color)
         else:
             width, height = pil_image.size
@@ -2117,7 +2117,7 @@ def _plot_counting(
     colors: Dict[str, Tuple[int, int, int]],
 ) -> Image.Image:
     width, height = image.size
-    fontsize = max(
+    fontsize = max(12, int(min(width, height) / 40))
     draw = ImageDraw.Draw(image)
     font = ImageFont.truetype(
         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
````
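On the new side, both truncated conditions are now explicit: the counting-style overlay is used once more than 40 boxes are present, and the label font scales with the shorter image side with a floor of 12. A quick check of the font-size formula:

```python
# Quick check of the font-size heuristic in _plot_counting above.
for width, height in [(640, 480), (1920, 1080), (4000, 3000)]:
    fontsize = max(12, int(min(width, height) / 40))
    print((width, height), fontsize)
# (640, 480) 12
# (1920, 1080) 27
# (4000, 3000) 75
```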
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/METADATA CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.
+Version: 0.2.163
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -27,6 +27,7 @@ Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic (==2.7.4)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: pytube (==15.0.0)
+Requires-Dist: redbaron (>=0.9.2,<0.10.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
@@ -142,7 +143,7 @@ continuing, for example it may want to execute code and look at the output befor
 letting the user respond.
 
 ### Chatting and Artifacts
-If you run `
+If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
 are a way to sync files between local and remote environments. The agent will read and
 write to the artifact object, which is just a pickle object, when it wants to save or
 load files.
@@ -159,7 +160,7 @@ with open("image.png", "rb") as f:
     artifacts["image.png"] = f.read()
 
 agent = va.agent.VisionAgent()
-response, artifacts = agent.
+response, artifacts = agent.chat_with_artifacts(
     [
         {
             "role": "user",
@@ -339,11 +340,11 @@ mode by passing in the verbose argument:
 ```
 
 ### Detailed Usage
-You can also have it return more information by calling `
+You can also have it return more information by calling `generate_code`. The format
 of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
 
 ```python
->>> results = agent.
+>>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
 >>> print(results)
 {
     "code": "from vision_agent.tools import ..."
@@ -372,7 +373,7 @@ conv = [
         "media": ["workers.png"],
     }
 ]
-result = agent.
+result = agent.generate_code(conv)
 code = result["code"]
 conv.append({"role": "assistant", "content": code})
 conv.append(
@@ -381,7 +382,7 @@ conv.append(
         "role": "user",
         "content": "Can you also return the number of workers wearing safety gear?",
     }
 )
-result = agent.
+result = agent.generate_code(conv)
 ```
````
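The README hunks above document this release's two entry points: conversational use goes through `VisionAgent.chat_with_artifacts`, and detailed code generation through `VisionAgentCoder.generate_code`. Condensed from the snippets above ("jar.jpg" is illustrative and the call needs configured model API keys):

```python
# Condensed from the README snippets above; "jar.jpg" is illustrative and the
# call requires configured model API keys.
import vision_agent as va

agent = va.agent.VisionAgentCoder()
results = agent.generate_code(
    [
        {
            "role": "user",
            "content": "What percentage of the area of the jar is filled with coffee beans?",
            "media": ["jar.jpg"],
        }
    ]
)
print(results["code"])
```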
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/RECORD CHANGED
````diff
@@ -1,10 +1,12 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=
+vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_coder.py,sha256=
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
+vision_agent/agent/agent_utils.py,sha256=eSgg8CwWylX_erLTqTg2pVhEEgVkMLRrQfYRyJzI3so,5443
+vision_agent/agent/vision_agent.py,sha256=MUigVufYML2sYn9Hsngswa77XxlZBgCwQyBfK8tlsio,22551
+vision_agent/agent/vision_agent_coder.py,sha256=aVkl0b9LKvy-auuHGYSag-ixYnue0iRQqD1PYLPBR-s,29312
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
+vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
+vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
 vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -14,11 +16,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
+vision_agent/tools/meta_tools.py,sha256=MULJrZiTODOAN20TGceLdXcwoSGMNaE7bQbywySITnA,28458
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=uWyR4pebTezXx9IWCKX4SL5sB9u_7LdRP0-KWU52zsU,78524
 vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -27,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.163.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.163.dist-info/METADATA,sha256=grneiMhM3Lwzi9ex9JL8A0R5cmpqyOHaaTcLGRfLwWs,17785
+vision_agent-0.2.163.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.163.dist-info/RECORD,,
````
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/LICENSE: file without changes
{vision_agent-0.2.161.dist-info → vision_agent-0.2.163.dist-info}/WHEEL: file without changes