vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl

@@ -0,0 +1,199 @@
1
+ USER_REQ = """
2
+ ## User Request
3
+ {user_request}
4
+ """
5
+
6
+ PLAN = """
7
+ **Context**:
8
+ {context}
9
+
10
+ **Tools Available**:
11
+ {tool_desc}
12
+
13
+ **Previous Feedback**:
14
+ {feedback}
15
+
16
+ **Instructions**:
17
+ 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
18
+ 2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
19
+ 3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
20
+
21
+ Output a JSON object in the following format:
22
+
23
+ ```json
24
+ {{
25
+ "plan1":
26
+ {{
27
+ "thoughts": str # your thought process for choosing this plan
28
+ "instructions": [
29
+ str # what you should do in this task associated with a tool
30
+ ]
31
+ }},
32
+ "plan2": ...,
33
+ "plan3": ...
34
+ }}
35
+ ```
36
+ """
37
+
38
+ TEST_PLANS = """
39
+ **Role**: You are a software programmer responsible for testing different tools.
40
+
41
+ **Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
42
+
43
+ **Documentation**:
44
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
45
+
46
+ {docstring}
47
+
48
+ **Plans**:
49
+ {plans}
50
+
51
+ **Previous Attempts**:
52
+ {previous_attempts}
53
+
54
+ **Examples**:
55
+ --- EXAMPLE1 ---
56
+ plan1:
57
+ - Load the image from the provided file path 'image.jpg'.
58
+ - Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
59
+ plan2:
60
+ - Load the image from the provided file path 'image.jpg'.
61
+ - Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
62
+ - Count the number of detected objects labeled as 'person'.
63
+ plan3:
64
+ - Load the image from the provided file path 'image.jpg'.
65
+ - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
66
+
67
+ ```python
68
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
69
+ image = load_image("image.jpg")
70
+ owl_v2_out = owl_v2_image("person", image)
71
+
72
+ f2s2_out = florence2_sam2_image("person", image)
73
+ # strip out the masks from the output because they don't provide useful information when printed
74
+ f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
75
+
76
+ cgd_out = countgd_counting(image)
77
+
78
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
79
+ print(final_out)
+ ```
80
+ --- END EXAMPLE1 ---
81
+
82
+ --- EXAMPLE2 ---
83
+ plan1:
84
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
85
+ - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
86
+ plan2:
87
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
88
+ - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
89
+ plan3:
90
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
91
+ - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
92
+
93
+
94
+ ```python
95
+ import numpy as np
96
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
97
+
98
+ # sample at 1 FPS and use the first 10 frames to reduce processing time
99
+ frames = extract_frames_and_timestamps("video.mp4", 1)
100
+ frames = [f["frame"] for f in frames][:10]
101
+
102
+ # strip arrays from the output to make it easier to read
103
+ def remove_arrays(o):
104
+ if isinstance(o, list):
105
+ return [remove_arrays(e) for e in o]
106
+ elif isinstance(o, dict):
107
+ return {{k: remove_arrays(v) for k, v in o.items()}}
108
+ elif isinstance(o, np.ndarray):
109
+ return "array: " + str(o.shape)
110
+ else:
111
+ return o
112
+
113
+ # return the counts of each label per frame to help determine the stability of the model results
114
+ def get_counts(preds):
115
+ counts = {{}}
116
+ for i, pred_frame in enumerate(preds):
117
+ counts_i = {{}}
118
+ for pred in pred_frame:
119
+ label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
120
+ counts_i[label] = counts_i.get(label, 0) + 1
121
+ counts[f"frame_{{i}}"] = counts_i
122
+ return counts
123
+
124
+
125
+ # plan1
126
+ owl_v2_out = owl_v2_video("person", frames)
127
+ owl_v2_counts = get_counts(owl_v2_out)
128
+
129
+ # plan2
130
+ florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
131
+ florence2_counts = get_counts(florence2_out)
132
+
133
+ # plan3
134
+ f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
135
+ f2s2_tracking_out = remove_arrays(f2s2_tracking_out)
136
+ f2s2_counts = get_counts(f2s2_tracking_out)
137
+
138
+ final_out = {{
139
+ "owl_v2_video": owl_v2_out,
140
+ "florence2_phrase_grounding": florence2_out,
141
+ "florence2_sam2_video_tracking": f2s2_out,
142
+ }}
143
+
144
+ counts = {{
145
+ "owl_v2_video": owl_v2_counts,
146
+ "florence2_phrase_grounding": florence2_counts,
147
+ "florence2_sam2_video_tracking": f2s2_counts,
148
+ }}
149
+
150
+ print(final_out)
152
+ print(counts)
153
+ ```
154
+ --- END EXAMPLE2 ---
155
+
156
+ **Instructions**:
157
+ 1. Write a program to load the media, call each tool, and print its output along with other relevant information.
158
+ 2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
159
+ 3. Your test case MUST run only on the given images, which are {media}.
160
+ 4. Print this final dictionary.
161
+ 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
162
+ """
163
+
164
+ PREVIOUS_FAILED = """
165
+ **Previous Failed Attempts**:
166
+ You previously ran this code:
167
+ ```python
168
+ {code}
169
+ ```
170
+
171
+ But got the following error or no stdout:
172
+ {error}
173
+ """
174
+
175
+ PICK_PLAN = """
176
+ **Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
177
+
178
+ **Task**: Your responsibility is to pick the best plan from the three plans provided.
179
+
180
+ **Context**:
181
+ {context}
182
+
183
+ **Plans**:
184
+ {plans}
185
+
186
+ **Tool Output**:
187
+ {tool_output}
188
+
189
+ **Instructions**:
190
+ 1. Re-read the user request, plans, and tool outputs, and examine the image.
191
+ 2. Solve the problem yourself given the image and pick the plan that most accurately matches your solution.
192
+ 3. Add modifications to improve the plan, including changing a tool, adding thresholds, or string matching.
193
+ 4. Output a JSON object with the following format:
194
+ {{
195
+ "predicted_answer": str # the answer you would expect from the best plan
196
+ "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
197
+ "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
198
+ }}
199
+ """
@@ -1,6 +1,5 @@
1
1
  from typing import Callable, List, Optional
2
2
 
3
- from .meta_tools import META_TOOL_DOCSTRING, Artifacts
4
3
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
5
4
  from .tool_utils import get_tool_descriptions_by_names
6
5
  from .tools import (
@@ -13,7 +13,9 @@ import numpy as np
13
13
  from IPython.display import display
14
14
 
15
15
  import vision_agent as va
16
+ from vision_agent.agent.agent_utils import extract_json
16
17
  from vision_agent.clients.landing_public_api import LandingPublicAPI
18
+ from vision_agent.lmm import AnthropicLMM
17
19
  from vision_agent.lmm.types import Message
18
20
  from vision_agent.tools.tool_utils import get_tool_documentation
19
21
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -338,6 +340,85 @@ def edit_code_artifact(
338
340
  return open_code_artifact(artifacts, name, cur_line)
339
341
 
340
342
 
343
+ def generate_vision_plan(
344
+ artifacts: Artifacts,
345
+ name: str,
346
+ chat: str,
347
+ media: List[str],
348
+ test_multi_plan: bool = True,
349
+ custom_tool_names: Optional[List[str]] = None,
350
+ ) -> str:
351
+ """Generates a plan to solve vision based tasks.
352
+
353
+ Parameters:
354
+ artifacts (Artifacts): The artifacts object to save the plan to.
355
+ name (str): The name of the artifact to save the plan context to.
356
+ chat (str): The chat message from the user.
357
+ media (List[str]): The media files to use.
358
+ test_multi_plan (bool): Do not change this parameter.
359
+ custom_tool_names (Optional[List[str]]): Do not change this parameter.
360
+
361
+ Returns:
362
+ str: The generated plan.
363
+
364
+ Examples
365
+ --------
366
+ >>> generate_vision_plan(artifacts, "plan.json", "Can you detect the dogs in this image?", ["image.jpg"])
367
+ [Start Plan Context]
368
+ plan1: This is a plan to detect dogs in an image
369
+ -load image
370
+ -detect dogs
371
+ -return detections
372
+ [End Plan Context]
373
+ """
374
+
375
+ if ZMQ_PORT is not None:
376
+ agent = va.agent.VisionAgentPlanner(
377
+ report_progress_callback=lambda inp: report_progress_callback(
378
+ int(ZMQ_PORT), inp
379
+ )
380
+ )
381
+ else:
382
+ agent = va.agent.VisionAgentPlanner()
383
+
384
+ fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
385
+ response = agent.generate_plan(
386
+ fixed_chat,
387
+ test_multi_plan=test_multi_plan,
388
+ custom_tool_names=custom_tool_names,
389
+ )
390
+ if response.test_results is not None:
391
+ redisplay_results(response.test_results)
392
+ response.test_results = None
393
+ artifacts[name] = response.model_dump_json()
394
+ media_names = extract_json(
395
+ AnthropicLMM()( # type: ignore
396
+ f"""Extract any media file names from this output in the following JSON format:
397
+ {{"media": ["image1.jpg", "image2.jpg"]}}
398
+
399
+ {artifacts[name]}"""
400
+ )
401
+ )
402
+ if "media" in media_names and isinstance(media_names, dict):
403
+ for media in media_names["media"]:
404
+ if isinstance(media, str):
405
+ with open(media, "rb") as f:
406
+ artifacts[media] = f.read()
407
+
408
+ output_str = f"[Start Plan Context, saved at {name}]"
409
+ for plan in response.plans.keys():
410
+ output_str += f"\n{plan}: {response.plans[plan]['thoughts'].strip()}\n" # type: ignore
411
+ output_str += " -" + "\n -".join(
412
+ e.strip() for e in response.plans[plan]["instructions"]
413
+ )
414
+
415
+ output_str += f"\nbest plan: {response.best_plan}\n"
416
+ output_str += "thoughts: " + response.plan_thoughts.strip() + "\n"
417
+ output_str += "[End Plan Context]"
418
+ print(output_str)
419
+ return output_str
420
+
421
+
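For orientation, a hedged usage sketch of the new meta tool (not part of the diff); the `Artifacts` constructor arguments shown here are an assumption and should be checked against the real class:

```python
from vision_agent.tools.meta_tools import Artifacts, generate_vision_plan

# Assumed setup: create an Artifacts store and seed it with the image the
# user is asking about. The constructor signature is an assumption.
artifacts = Artifacts("artifacts.pkl")
with open("image.jpg", "rb") as f:
    artifacts["image.jpg"] = f.read()

# Ask the planner for plan candidates; the returned string is the same
# "[Start Plan Context] ... [End Plan Context]" block printed above.
plan_context = generate_vision_plan(
    artifacts,
    "dog_plan.json",
    "Can you detect the dogs in this image?",
    ["image.jpg"],
)
print(plan_context)
```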
341
422
  def generate_vision_code(
342
423
  artifacts: Artifacts,
343
424
  name: str,
@@ -368,7 +449,6 @@ def generate_vision_code(
368
449
  dogs = owl_v2("dog", image)
369
450
  return dogs
370
451
  """
371
-
372
452
  if ZMQ_PORT is not None:
373
453
  agent = va.agent.VisionAgentCoder(
374
454
  report_progress_callback=lambda inp: report_progress_callback(
@@ -379,7 +459,7 @@ def generate_vision_code(
379
459
  agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
380
460
 
381
461
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
382
- response = agent.chat_with_workflow(
462
+ response = agent.generate_code(
383
463
  fixed_chat,
384
464
  test_multi_plan=test_multi_plan,
385
465
  custom_tool_names=custom_tool_names,
@@ -459,7 +539,7 @@ def edit_vision_code(
459
539
  fixed_chat_history.append({"role": "assistant", "content": code})
460
540
  fixed_chat_history.append({"role": "user", "content": chat})
461
541
 
462
- response = agent.chat_with_workflow(
542
+ response = agent.generate_code(
463
543
  fixed_chat_history,
464
544
  test_multi_plan=False,
465
545
  custom_tool_names=customized_tool_names,
@@ -748,6 +828,7 @@ META_TOOL_DOCSTRING = get_tool_documentation(
748
828
  open_code_artifact,
749
829
  create_code_artifact,
750
830
  edit_code_artifact,
831
+ generate_vision_plan,
751
832
  generate_vision_code,
752
833
  edit_vision_code,
753
834
  write_media_artifact,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.161
3
+ Version: 0.2.162
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -142,7 +142,7 @@ continuing, for example it may want to execute code and look at the output befor
142
142
  letting the user respond.
143
143
 
144
144
  ### Chatting and Artifacts
145
- If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
145
+ If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`'s
146
146
  are a way to sync files between local and remote environments. The agent will read and
147
147
  write to the artifact object, which is just a pickle object, when it wants to save or
148
148
  load files.
@@ -159,7 +159,7 @@ with open("image.png", "rb") as f:
159
159
  artifacts["image.png"] = f.read()
160
160
 
161
161
  agent = va.agent.VisionAgent()
162
- response, artifacts = agent.chat_with_code(
162
+ response, artifacts = agent.chat_with_artifacts(
163
163
  [
164
164
  {
165
165
  "role": "user",
@@ -339,11 +339,11 @@ mode by passing in the verbose argument:
339
339
  ```
340
340
 
341
341
  ### Detailed Usage
342
- You can also have it return more information by calling `chat_with_workflow`. The format
342
+ You can also have it return more information by calling `generate_code`. The format
343
343
  of the input is a list of dictionaries with the keys `role`, `content`, and `media`:
344
344
 
345
345
  ```python
346
- >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
346
+ >>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
347
347
  >>> print(results)
348
348
  {
349
349
  "code": "from vision_agent.tools import ..."
@@ -372,7 +372,7 @@ conv = [
372
372
  "media": ["workers.png"],
373
373
  }
374
374
  ]
375
- result = agent.chat_with_workflow(conv)
375
+ result = agent.generate_code(conv)
376
376
  code = result["code"]
377
377
  conv.append({"role": "assistant", "content": code})
378
378
  conv.append(
@@ -381,7 +381,7 @@ conv.append(
381
381
  "content": "Can you also return the number of workers wearing safety gear?",
382
382
  }
383
383
  )
384
- result = agent.chat_with_workflow(conv)
384
+ result = agent.generate_code(conv)
385
385
  ```
386
386
 
387
387
 
@@ -1,10 +1,12 @@
1
1
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
2
- vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
2
+ vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
- vision_agent/agent/agent_utils.py,sha256=eIpLz2NunEqEsBBrECJaD34-2uY0bsFNnW-XKfqqohs,2518
5
- vision_agent/agent/vision_agent.py,sha256=x0oSuQk-rcERUUdZp29FB77Ua-eyQD3fLi_UfmsBTV0,20761
6
- vision_agent/agent/vision_agent_coder.py,sha256=2ZoGikn2nakGDfs20XRshZjQUyvbw6l47UhExJAYkqI,38515
7
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
4
+ vision_agent/agent/agent_utils.py,sha256=AAIqi8U3Cc58RH_AJ6grxIHpgdu9AjctmapokLp88pQ,4766
5
+ vision_agent/agent/vision_agent.py,sha256=ycPuFzUh043ltdR9F3oNeTcud5-lO0ydrsVzuIAKWcY,22567
6
+ vision_agent/agent/vision_agent_coder.py,sha256=ISw85eqVRpFdBMs974Rg8VqjTfbcn2XiwBfPe-u9pNI,28214
7
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=zQJ2PxX7xWA0RsZrCT8F59s-5F4WpPAqMRdmUA5_UW4,12215
8
+ vision_agent/agent/vision_agent_planner.py,sha256=xsE9W2DJsUygmyD_aIYp-4o4LSzfe8mRVGrPTe0b3CI,17975
9
+ vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
8
10
  vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
9
11
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
12
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -14,8 +16,8 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
14
16
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
15
17
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
16
18
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
17
- vision_agent/tools/__init__.py,sha256=PLVbfTMjKxQlHIRWnq9b785W9a52AXQS_tOa0tkQ0ZY,2420
18
- vision_agent/tools/meta_tools.py,sha256=BF5-fVshLhhpck5lJErKxnfPu9YxudBhR7ar_qA9Mjo,25889
19
+ vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
20
+ vision_agent/tools/meta_tools.py,sha256=T6M-O0uymeVltOggC-Qr1EzSpSnXmy0HTD7j2ZElY6s,28732
19
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
22
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
21
23
  vision_agent/tools/tools.py,sha256=vS1yCk3Fza9eYOTHPFwwroo_ULdw2ztMQMb81x1U5f8,78524
@@ -27,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
27
29
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
28
30
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
31
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
30
- vision_agent-0.2.161.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.161.dist-info/METADATA,sha256=81mQ74IJal478wgLMjkneAVg0kE89VVjX9Wa_hL0lMo,17753
32
- vision_agent-0.2.161.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.161.dist-info/RECORD,,
32
+ vision_agent-0.2.162.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
33
+ vision_agent-0.2.162.dist-info/METADATA,sha256=lc7obdExLHxIWS6zTG_wUQq_1PmVpwrlMkR4-fVj75M,17743
34
+ vision_agent-0.2.162.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
35
+ vision_agent-0.2.162.dist-info/RECORD,,