vision-agent 0.2.161__py3-none-any.whl → 0.2.162__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
@@ -0,0 +1,199 @@
+ USER_REQ = """
+ ## User Request
+ {user_request}
+ """
+
+ PLAN = """
+ **Context**:
+ {context}
+
+ **Tools Available**:
+ {tool_desc}
+
+ **Previous Feedback**:
+ {feedback}
+
+ **Instructions**:
+ 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
+ 2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
+ 3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
+
+ Output a JSON object in the following format:
+
+ ```json
+ {{
+     "plan1":
+         {{
+             "thoughts": str # your thought process for choosing this plan
+             "instructions": [
+                 str # what you should do in this task associated with a tool
+             ]
+         }},
+     "plan2": ...,
+     "plan3": ...
+ }}
+ ```
+ """
+
+ TEST_PLANS = """
+ **Role**: You are a software programmer responsible for testing different tools.
+
+ **Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
+
+ **Documentation**:
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+ {docstring}
+
+ **Plans**:
+ {plans}
+
+ **Previous Attempts**:
+ {previous_attempts}
+
+ **Examples**:
+ --- EXAMPLE1 ---
+ plan1:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+ plan2:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
+ - Count the number of detected objects labeled as 'person'.
+ plan3:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
+
+ ```python
+ from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
+ image = load_image("image.jpg")
+ owl_v2_out = owl_v2_image("person", image)
+
+ f2s2_out = florence2_sam2_image("person", image)
+ # strip out the masks from the output because they don't provide useful information when printed
+ f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
+
+ cgd_out = countgd_counting(image)
+
+ final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
+ print(final_out)
+ ```
+ --- END EXAMPLE1 ---
+
+ --- EXAMPLE2 ---
+ plan1:
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+ - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
+ plan2:
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+ - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+ plan3:
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+ - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
+
+
+ ```python
+ import numpy as np
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+
+ # sample at 1 FPS and use the first 10 frames to reduce processing time
+ frames = extract_frames_and_timestamps("video.mp4", 1)
+ frames = [f["frame"] for f in frames][:10]
+
+ # strip arrays from the output to make it easier to read
+ def remove_arrays(o):
+     if isinstance(o, list):
+         return [remove_arrays(e) for e in o]
+     elif isinstance(o, dict):
+         return {{k: remove_arrays(v) for k, v in o.items()}}
+     elif isinstance(o, np.ndarray):
+         return "array: " + str(o.shape)
+     else:
+         return o
+
+ # return the counts of each label per frame to help determine the stability of the model results
+ def get_counts(preds):
+     counts = {{}}
+     for i, pred_frame in enumerate(preds):
+         counts_i = {{}}
+         for pred in pred_frame:
+             label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
+             counts_i[label] = counts_i.get(label, 0) + 1
+         counts[f"frame_{{i}}"] = counts_i
+     return counts
+
+
+ # plan1
+ owl_v2_out = owl_v2_video("person", frames)
+ owl_v2_counts = get_counts(owl_v2_out)
+
+ # plan2
+ florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+ florence2_counts = get_counts(florence2_out)
+
+ # plan3
+ f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
+ f2s2_tracking_out = remove_arrays(f2s2_tracking_out)
+ f2s2_counts = get_counts(f2s2_tracking_out)
+
+ final_out = {{
+     "owl_v2_video": owl_v2_out,
+     "florence2_phrase_grounding": florence2_out,
+     "florence2_sam2_video_tracking": f2s2_tracking_out,
+ }}
+
+ counts = {{
+     "owl_v2_video": owl_v2_counts,
+     "florence2_phrase_grounding": florence2_counts,
+     "florence2_sam2_video_tracking": f2s2_counts,
+ }}
+
+ print(final_out)
+ print(counts)
+ ```
+ --- END EXAMPLE2 ---
+
+ **Instructions**:
+ 1. Write a program to load the media, call each tool, and print its output along with other relevant information.
+ 2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
+ 3. Your test case MUST run only on the given media files, which are {media}
+ 4. Print this final dictionary.
+ 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
+ """
+
+ PREVIOUS_FAILED = """
+ **Previous Failed Attempts**:
+ You previously ran this code:
+ ```python
+ {code}
+ ```
+
+ But got the following error or no stdout:
+ {error}
+ """
+
+ PICK_PLAN = """
+ **Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
+
+ **Task**: Your responsibility is to pick the best plan from the three plans provided.
+
+ **Context**:
+ {context}
+
+ **Plans**:
+ {plans}
+
+ **Tool Output**:
+ {tool_output}
+
+ **Instructions**:
+ 1. Re-read the user request, plans, and tool outputs, and examine the image.
+ 2. Solve the problem yourself given the image and pick the plan that most accurately matches your solution.
+ 3. Add modifications to improve the plan, including changing a tool, adding thresholds, or string matching.
+ 4. Output a JSON object with the following format:
+ {{
+     "predicted_answer": str # the answer you would expect from the best plan
+     "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
+     "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
+ }}
+ """
@@ -1,6 +1,5 @@
  from typing import Callable, List, Optional

- from .meta_tools import META_TOOL_DOCSTRING, Artifacts
  from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
  from .tool_utils import get_tool_descriptions_by_names
  from .tools import (
@@ -13,7 +13,9 @@ import numpy as np
  from IPython.display import display

  import vision_agent as va
+ from vision_agent.agent.agent_utils import extract_json
  from vision_agent.clients.landing_public_api import LandingPublicAPI
+ from vision_agent.lmm import AnthropicLMM
  from vision_agent.lmm.types import Message
  from vision_agent.tools.tool_utils import get_tool_documentation
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -338,6 +340,85 @@ def edit_code_artifact(
  return open_code_artifact(artifacts, name, cur_line)


+ def generate_vision_plan(
+     artifacts: Artifacts,
+     name: str,
+     chat: str,
+     media: List[str],
+     test_multi_plan: bool = True,
+     custom_tool_names: Optional[List[str]] = None,
+ ) -> str:
+     """Generates a plan to solve vision-based tasks.
+
+     Parameters:
+         artifacts (Artifacts): The artifacts object to save the plan to.
+         name (str): The name of the artifact to save the plan context to.
+         chat (str): The chat message from the user.
+         media (List[str]): The media files to use.
+         test_multi_plan (bool): Do not change this parameter.
+         custom_tool_names (Optional[List[str]]): Do not change this parameter.
+
+     Returns:
+         str: The generated plan.
+
+     Examples
+     --------
+     >>> generate_vision_plan(artifacts, "plan.json", "Can you detect the dogs in this image?", ["image.jpg"])
+     [Start Plan Context]
+     plan1: This is a plan to detect dogs in an image
+     -load image
+     -detect dogs
+     -return detections
+     [End Plan Context]
+     """
+
+     if ZMQ_PORT is not None:
+         agent = va.agent.VisionAgentPlanner(
+             report_progress_callback=lambda inp: report_progress_callback(
+                 int(ZMQ_PORT), inp
+             )
+         )
+     else:
+         agent = va.agent.VisionAgentPlanner()
+
+     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
+     response = agent.generate_plan(
+         fixed_chat,
+         test_multi_plan=test_multi_plan,
+         custom_tool_names=custom_tool_names,
+     )
+     if response.test_results is not None:
+         redisplay_results(response.test_results)
+     response.test_results = None
+     artifacts[name] = response.model_dump_json()
+     media_names = extract_json(
+         AnthropicLMM()(  # type: ignore
+             f"""Extract any media file names from this output in the following JSON format:
+ {{"media": ["image1.jpg", "image2.jpg"]}}
+
+ {artifacts[name]}"""
+         )
+     )
+     if isinstance(media_names, dict) and "media" in media_names:
+         for media in media_names["media"]:
+             if isinstance(media, str):
+                 with open(media, "rb") as f:
+                     artifacts[media] = f.read()
+
+     output_str = f"[Start Plan Context, saved at {name}]"
+     for plan in response.plans.keys():
+         output_str += f"\n{plan}: {response.plans[plan]['thoughts'].strip()}\n"  # type: ignore
+         output_str += " -" + "\n -".join(
+             e.strip() for e in response.plans[plan]["instructions"]
+         )
+
+     output_str += f"\nbest plan: {response.best_plan}\n"
+     output_str += "thoughts: " + response.plan_thoughts.strip() + "\n"
+     output_str += "[End Plan Context]"
+     print(output_str)
+     return output_str
+
+
  def generate_vision_code(
  artifacts: Artifacts,
  name: str,
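
For orientation, here is a hedged sketch of how the new meta tool might be called. Only the `generate_vision_plan` signature comes from the diff above; the `Artifacts` construction, file names, and request text are assumptions for illustration:

```python
from vision_agent.tools.meta_tools import Artifacts, generate_vision_plan

# Assumption: Artifacts is constructed from the path of the pickle file it syncs to,
# as in the project README; the exact constructor arguments may differ.
artifacts = Artifacts("artifacts.pkl")
with open("image.jpg", "rb") as f:  # hypothetical local image
    artifacts["image.jpg"] = f.read()

plan_context = generate_vision_plan(
    artifacts,
    "dog_plan.json",                           # artifact name for the saved plan context
    "Can you detect the dogs in this image?",  # user request
    ["image.jpg"],                             # media files to plan against
)
# The call prints and returns the plan context wrapped in
# [Start Plan Context, saved at dog_plan.json] ... [End Plan Context]
# and stores the full plan JSON in artifacts["dog_plan.json"].
```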
@@ -368,7 +449,6 @@ def generate_vision_code(
  dogs = owl_v2("dog", image)
  return dogs
  """
-
  if ZMQ_PORT is not None:
  agent = va.agent.VisionAgentCoder(
  report_progress_callback=lambda inp: report_progress_callback(
@@ -379,7 +459,7 @@ def generate_vision_code(
  agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))

  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
- response = agent.chat_with_workflow(
+ response = agent.generate_code(
  fixed_chat,
  test_multi_plan=test_multi_plan,
  custom_tool_names=custom_tool_names,
@@ -459,7 +539,7 @@ def edit_vision_code(
  fixed_chat_history.append({"role": "assistant", "content": code})
  fixed_chat_history.append({"role": "user", "content": chat})

- response = agent.chat_with_workflow(
+ response = agent.generate_code(
  fixed_chat_history,
  test_multi_plan=False,
  custom_tool_names=customized_tool_names,
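
Both hunks above reflect the rename of the coder agent's entry point from `chat_with_workflow` to `generate_code` (the same rename appears in the README changes below). A minimal sketch of calling it directly; the agent construction, prompt text, and file name are illustrative:

```python
import vision_agent as va

# Build the coder agent and ask it to generate code for a vision task.
agent = va.agent.VisionAgentCoder()
result = agent.generate_code(
    [
        {
            "role": "user",
            "content": "Count the number of workers in this image.",
            "media": ["workers.png"],  # hypothetical local image
        }
    ]
)
print(result["code"])  # generated code, per the README's Detailed Usage section
```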
@@ -748,6 +828,7 @@ META_TOOL_DOCSTRING = get_tool_documentation(
  open_code_artifact,
  create_code_artifact,
  edit_code_artifact,
+ generate_vision_plan,
  generate_vision_code,
  edit_vision_code,
  write_media_artifact,
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.161
+ Version: 0.2.162
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -142,7 +142,7 @@ continuing, for example it may want to execute code and look at the output befor
  letting the user respond.

  ### Chatting and Artifacts
- If you run `chat_with_code` you will also notice an `Artifact` object. `Artifact`'s
+ If you run `chat_with_artifacts` you will also notice an `Artifact` object. `Artifact`s
  are a way to sync files between local and remote environments. The agent will read and
  write to the artifact object, which is just a pickle object, when it wants to save or
  load files.
@@ -159,7 +159,7 @@ with open("image.png", "rb") as f:
  artifacts["image.png"] = f.read()

  agent = va.agent.VisionAgent()
- response, artifacts = agent.chat_with_code(
+ response, artifacts = agent.chat_with_artifacts(
  [
  {
  "role": "user",
@@ -339,11 +339,11 @@ mode by passing in the verbose argument:
  ```

  ### Detailed Usage
- You can also have it return more information by calling `chat_with_workflow`. The format
+ You can also have it return more information by calling `generate_code`. The format
  of the input is a list of dictionaries with the keys `role`, `content`, and `media`:

  ```python
- >>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
+ >>> results = agent.generate_code([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?", "media": ["jar.jpg"]}])
  >>> print(results)
  {
  "code": "from vision_agent.tools import ..."
@@ -372,7 +372,7 @@ conv = [
  "media": ["workers.png"],
  }
  ]
- result = agent.chat_with_workflow(conv)
+ result = agent.generate_code(conv)
  code = result["code"]
  conv.append({"role": "assistant", "content": code})
  conv.append(
@@ -381,7 +381,7 @@ conv.append(
  {
  "content": "Can you also return the number of workers wearing safety gear?",
  }
- result = agent.chat_with_workflow(conv)
+ result = agent.generate_code(conv)
  ```

@@ -1,10 +1,12 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
- vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
+ vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
- vision_agent/agent/agent_utils.py,sha256=eIpLz2NunEqEsBBrECJaD34-2uY0bsFNnW-XKfqqohs,2518
- vision_agent/agent/vision_agent.py,sha256=x0oSuQk-rcERUUdZp29FB77Ua-eyQD3fLi_UfmsBTV0,20761
- vision_agent/agent/vision_agent_coder.py,sha256=2ZoGikn2nakGDfs20XRshZjQUyvbw6l47UhExJAYkqI,38515
- vision_agent/agent/vision_agent_coder_prompts.py,sha256=BmbTMhth4v1qLexuoSeyo47QQ0kPQvL1pLbCJHMsWDw,18910
+ vision_agent/agent/agent_utils.py,sha256=AAIqi8U3Cc58RH_AJ6grxIHpgdu9AjctmapokLp88pQ,4766
+ vision_agent/agent/vision_agent.py,sha256=ycPuFzUh043ltdR9F3oNeTcud5-lO0ydrsVzuIAKWcY,22567
+ vision_agent/agent/vision_agent_coder.py,sha256=ISw85eqVRpFdBMs974Rg8VqjTfbcn2XiwBfPe-u9pNI,28214
+ vision_agent/agent/vision_agent_coder_prompts.py,sha256=zQJ2PxX7xWA0RsZrCT8F59s-5F4WpPAqMRdmUA5_UW4,12215
+ vision_agent/agent/vision_agent_planner.py,sha256=xsE9W2DJsUygmyD_aIYp-4o4LSzfe8mRVGrPTe0b3CI,17975
+ vision_agent/agent/vision_agent_planner_prompts.py,sha256=JDARUzko2HZdxkBtcy6wuP9DCCmbqhK_gnVgrjr6l1k,6691
  vision_agent/agent/vision_agent_prompts.py,sha256=LZ9Bnx7ZFkqbNOMqwfdiWZU4niND9Z1ArcFHNSn_jzA,11187
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -14,8 +16,8 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
  vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
  vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
- vision_agent/tools/__init__.py,sha256=PLVbfTMjKxQlHIRWnq9b785W9a52AXQS_tOa0tkQ0ZY,2420
- vision_agent/tools/meta_tools.py,sha256=BF5-fVshLhhpck5lJErKxnfPu9YxudBhR7ar_qA9Mjo,25889
+ vision_agent/tools/__init__.py,sha256=50wwisjudmZn7_SEwigTiiDxQ0HXbSIhVI4O8kvE9Es,2365
+ vision_agent/tools/meta_tools.py,sha256=T6M-O0uymeVltOggC-Qr1EzSpSnXmy0HTD7j2ZElY6s,28732
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
  vision_agent/tools/tools.py,sha256=vS1yCk3Fza9eYOTHPFwwroo_ULdw2ztMQMb81x1U5f8,78524
@@ -27,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
- vision_agent-0.2.161.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.161.dist-info/METADATA,sha256=81mQ74IJal478wgLMjkneAVg0kE89VVjX9Wa_hL0lMo,17753
- vision_agent-0.2.161.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.161.dist-info/RECORD,,
+ vision_agent-0.2.162.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.162.dist-info/METADATA,sha256=lc7obdExLHxIWS6zTG_wUQq_1PmVpwrlMkR4-fVj75M,17743
+ vision_agent-0.2.162.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.162.dist-info/RECORD,,