vision-agent 0.2.80__tar.gz → 0.2.82__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {vision_agent-0.2.80 → vision_agent-0.2.82}/PKG-INFO +1 -1
  2. {vision_agent-0.2.80 → vision_agent-0.2.82}/pyproject.toml +1 -1
  3. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/vision_agent.py +171 -86
  4. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/vision_agent_prompts.py +95 -7
  5. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/lmm/lmm.py +43 -20
  6. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/execute.py +9 -4
  7. {vision_agent-0.2.80 → vision_agent-0.2.82}/LICENSE +0 -0
  8. {vision_agent-0.2.80 → vision_agent-0.2.82}/README.md +0 -0
  9. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/__init__.py +0 -0
  10. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/__init__.py +0 -0
  11. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/agent.py +0 -0
  12. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/fonts/__init__.py +0 -0
  13. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  14. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/lmm/__init__.py +0 -0
  15. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/tools/__init__.py +6 -6
  16. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/tools/prompts.py +0 -0
  17. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/tools/tool_utils.py +0 -0
  18. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/tools/tools.py +0 -0
  19. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/__init__.py +0 -0
  20. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/exceptions.py +0 -0
  21. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/image_utils.py +0 -0
  22. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/sim.py +0 -0
  23. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/type_defs.py +0 -0
  24. {vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.80 → vision_agent-0.2.82}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.80
+ Version: 0.2.82
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
{vision_agent-0.2.80 → vision_agent-0.2.82}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.80"
+ version = "0.2.82"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
{vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/vision_agent.py
@@ -5,7 +5,7 @@ import logging
  import sys
  import tempfile
  from pathlib import Path
- from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

  from langsmith import traceable
  from PIL import Image
@@ -20,9 +20,11 @@ from vision_agent.agent.vision_agent_prompts import (
      CODE,
      FIX_BUG,
      FULL_TASK,
+     PICK_PLAN,
      PLAN,
-     REFLECT,
+     PREVIOUS_FAILED,
      SIMPLE_TEST,
+     TEST_PLANS,
      USER_REQ,
  )
  from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
@@ -80,6 +82,15 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
      return output_str


+ def format_plans(plans: Dict[str, Any]) -> str:
+     plan_str = ""
+     for k, v in plans.items():
+         plan_str += f"{k}:\n"
+         plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+
+     return plan_str
+
+
  def extract_code(code: str) -> str:
      if "\n```python" in code:
          start = "\n```python"
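For a concrete picture of the new helper, here is a minimal sketch of `format_plans` run on invented data (the dict mirrors the shape `write_plans` returns). Note that the join adds no trailing newline, so each following plan header runs on directly after the previous plan's last instruction:

```python
from typing import Any, Dict

def format_plans(plans: Dict[str, Any]) -> str:
    # same logic as the helper added above
    plan_str = ""
    for k, v in plans.items():
        plan_str += f"{k}:\n"
        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
    return plan_str

# invented example plans
plans = {
    "plan1": [{"instructions": "Load image.jpg"}, {"instructions": "Run owl_v2 with prompt 'person'"}],
    "plan2": [{"instructions": "Load image.jpg"}, {"instructions": "Run grounding_sam with prompt 'person'"}],
}
print(format_plans(plans))
# plan1:
# -Load image.jpg
# -Run owl_v2 with prompt 'person'plan2:
# -Load image.jpg
# -Run grounding_sam with prompt 'person'
```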
@@ -140,12 +151,12 @@ extract_image(


  @traceable
- def write_plan(
+ def write_plans(
      chat: List[Message],
      tool_desc: str,
      working_memory: str,
      model: LMM,
- ) -> List[Dict[str, str]]:
+ ) -> Dict[str, Any]:
      chat = copy.deepcopy(chat)
      if chat[-1]["role"] != "user":
          raise ValueError("Last chat message must be from the user.")
@@ -154,14 +165,84 @@ def write_plan(
      context = USER_REQ.format(user_request=user_request)
      prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
      chat[-1]["content"] = prompt
-     return extract_json(model.chat(chat))["plan"]  # type: ignore
+     return extract_json(model.chat(chat))
+
+
+ @traceable
+ def pick_plan(
+     chat: List[Message],
+     plans: Dict[str, Any],
+     tool_info: str,
+     model: LMM,
+     code_interpreter: CodeInterpreter,
+     verbosity: int = 0,
+ ) -> Tuple[str, str]:
+     chat = copy.deepcopy(chat)
+     if chat[-1]["role"] != "user":
+         raise ValueError("Last chat message must be from the user.")
+
+     plan_str = format_plans(plans)
+     prompt = TEST_PLANS.format(
+         docstring=tool_info, plans=plan_str, previous_attempts=""
+     )
+
+     code = extract_code(model(prompt))
+     tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
+     tool_output_str = ""
+     if len(tool_output.logs.stdout) > 0:
+         tool_output_str = tool_output.logs.stdout[0]
+
+     if verbosity >= 1:
+         _print_code("Initial code and tests:", code)
+         _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
+
+     # retry if the tool output is empty or code fails
+     count = 1
+     while (not tool_output.success or tool_output_str == "") and count < 3:
+         prompt = TEST_PLANS.format(
+             docstring=tool_info,
+             plans=plan_str,
+             previous_attempts=PREVIOUS_FAILED.format(
+                 code=code, error=tool_output.text()
+             ),
+         )
+         code = extract_code(model(prompt))
+         tool_output = code_interpreter.exec_isolation(
+             DefaultImports.prepend_imports(code)
+         )
+         tool_output_str = ""
+         if len(tool_output.logs.stdout) > 0:
+             tool_output_str = tool_output.logs.stdout[0]
+
+         if verbosity == 1:
+             _print_code("Code and test after attempted fix:", code)
+             _LOGGER.info(f"Code execution result after attempt {count}")
+
+         count += 1
+
+     user_req = chat[-1]["content"]
+     context = USER_REQ.format(user_request=user_req)
+     # because the tool picker model gets the image as well, we have to be careful with
+     # how much text we send it, so we truncate the tool output to 20,000 characters
+     prompt = PICK_PLAN.format(
+         context=context,
+         plans=format_plans(plans),
+         tool_output=tool_output_str[:20_000],
+     )
+     chat[-1]["content"] = prompt
+     best_plan = extract_json(model(chat))
+     if verbosity >= 1:
+         _LOGGER.info(f"Best plan:\n{best_plan}")
+     return best_plan["best_plan"], tool_output_str


  @traceable
  def write_code(
      coder: LMM,
      chat: List[Message],
+     plan: str,
      tool_info: str,
+     tool_output: str,
      feedback: str,
  ) -> str:
      chat = copy.deepcopy(chat)
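`pick_plan` follows a generate, execute, retry-with-error-feedback loop before asking the model to choose. A stripped-down sketch of that control flow, where `model` and `run` are hypothetical stand-ins for the LMM call and `CodeInterpreter.exec_isolation`:

```python
from typing import Callable, Tuple

def generate_and_retry(
    model: Callable[[str], str],             # hypothetical stand-in for the LMM call
    run: Callable[[str], Tuple[bool, str]],  # hypothetical stand-in for exec_isolation
    base_prompt: str,
    failed_template: str,                    # template with {code} and {error} slots
    max_attempts: int = 3,
) -> Tuple[str, str]:
    code = model(base_prompt)
    ok, out = run(code)
    count = 1
    # same stop condition as pick_plan: retry while execution failed or stdout was empty
    while (not ok or out == "") and count < max_attempts:
        code = model(base_prompt + failed_template.format(code=code, error=out))
        ok, out = run(code)
        count += 1
    return code, out
```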
@@ -171,7 +252,8 @@ def write_code(
      user_request = chat[-1]["content"]
      prompt = CODE.format(
          docstring=tool_info,
-         question=user_request,
+         question=FULL_TASK.format(user_request=user_request, subtasks=plan),
+         tool_output=tool_output,
          feedback=feedback,
      )
      chat[-1]["content"] = prompt
@@ -203,27 +285,11 @@ def write_test(
      return extract_code(tester(chat))


- @traceable
- def reflect(
-     chat: List[Message],
-     plan: str,
-     code: str,
-     model: LMM,
- ) -> Dict[str, Union[str, bool]]:
-     chat = copy.deepcopy(chat)
-     if chat[-1]["role"] != "user":
-         raise ValueError("Last chat message must be from the user.")
-
-     user_request = chat[-1]["content"]
-     context = USER_REQ.format(user_request=user_request)
-     prompt = REFLECT.format(context=context, plan=plan, code=code)
-     chat[-1]["content"] = prompt
-     return extract_json(model(chat))
-
-
  def write_and_test_code(
      chat: List[Message],
+     plan: str,
      tool_info: str,
+     tool_output: str,
      tool_utils: str,
      working_memory: List[Dict[str, str]],
      coder: LMM,
@@ -241,7 +307,14 @@
              "status": "started",
          }
      )
-     code = write_code(coder, chat, tool_info, format_memory(working_memory))
+     code = write_code(
+         coder,
+         chat,
+         plan,
+         tool_info,
+         tool_output,
+         format_memory(working_memory),
+     )
      test = write_test(
          tester, chat, tool_utils, code, format_memory(working_memory), media
      )
@@ -412,11 +485,11 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


  def retrieve_tools(
-     plan: List[Dict[str, str]],
+     plans: Dict[str, List[Dict[str, str]]],
      tool_recommender: Sim,
      log_progress: Callable[[Dict[str, Any]], None],
      verbosity: int = 0,
- ) -> str:
+ ) -> Dict[str, str]:
      log_progress(
          {
              "type": "tools",
@@ -425,27 +498,29 @@
      )
      tool_info = []
      tool_desc = []
-     tool_list: List[Dict[str, str]] = []
-     for task in plan:
-         tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
-         tool_info.extend([e["doc"] for e in tools])
-         tool_desc.extend([e["desc"] for e in tools])
-         tool_list.extend(
-             {"description": e["desc"], "documentation": e["doc"]} for e in tools
-         )
-     log_progress(
-         {
-             "type": "tools",
-             "status": "completed",
-             "payload": list({v["description"]: v for v in tool_list}.values()),
-         }
-     )
+     tool_lists: Dict[str, List[Dict[str, str]]] = {}
+     for k, plan in plans.items():
+         tool_lists[k] = []
+         for task in plan:
+             tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+             tool_info.extend([e["doc"] for e in tools])
+             tool_desc.extend([e["desc"] for e in tools])
+             tool_lists[k].extend(
+                 {"description": e["desc"], "documentation": e["doc"]} for e in tools
+             )

      if verbosity == 2:
          tool_desc_str = "\n".join(set(tool_desc))
          _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
+
+     tool_lists_unique = {}
+     for k in tool_lists:
+         tool_lists_unique[k] = "\n\n".join(
+             set(e["documentation"] for e in tool_lists[k])
+         )
+     all_tools = "\n\n".join(set(tool_info))
+     tool_lists_unique["all"] = all_tools
+     return tool_lists_unique


  class VisionAgent(Agent):
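`retrieve_tools` now returns one deduplicated documentation string per plan plus an "all" entry. A toy run of the final dedup step, with invented docs:

```python
from typing import Dict, List

# invented placeholder docs, shaped like tool_recommender.top_k results
tool_lists: Dict[str, List[Dict[str, str]]] = {
    "plan1": [
        {"description": "owl_v2", "documentation": "owl_v2(prompt, image) -> list"},
        {"description": "owl_v2", "documentation": "owl_v2(prompt, image) -> list"},
    ],
    "plan2": [
        {"description": "grounding_sam", "documentation": "grounding_sam(prompt, image) -> list"},
    ],
}
tool_info = [e["documentation"] for docs in tool_lists.values() for e in docs]

tool_lists_unique = {}
for k in tool_lists:
    # set() collapses the duplicate owl_v2 doc retrieved for plan1
    tool_lists_unique[k] = "\n\n".join(set(e["documentation"] for e in tool_lists[k]))
tool_lists_unique["all"] = "\n\n".join(set(tool_info))
# keys are now "plan1", "plan2", "all": the "all" entry feeds pick_plan,
# and the winning plan's entry feeds write_and_test_code
```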
@@ -543,7 +618,6 @@ class VisionAgent(Agent):
      def chat_with_workflow(
          self,
          chat: List[Message],
-         self_reflection: bool = False,
          display_visualization: bool = False,
      ) -> Dict[str, Any]:
          """Chat with Vision Agent and return intermediate information regarding the task.
@@ -554,7 +628,6 @@ class VisionAgent(Agent):
                  [{"role": "user", "content": "describe your task here..."}]
              or if it contains media files, it should be in the format of:
                  [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
-             self_reflection (bool): Whether to reflect on the task and debug the code.
              display_visualization (bool): If True, it opens a new window locally to
                  show the image(s) created by visualization code (if there is any).

@@ -581,7 +654,18 @@ class VisionAgent(Agent):

          int_chat = cast(
              List[Message],
-             [{"role": c["role"], "content": c["content"]} for c in chat],
+             [
+                 (
+                     {
+                         "role": c["role"],
+                         "content": c["content"],
+                         "media": c["media"],
+                     }
+                     if "media" in c
+                     else {"role": c["role"], "content": c["content"]}
+                 )
+                 for c in chat
+             ],
          )

          code = ""
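The net effect of the new comprehension is that a "media" key survives normalization instead of being dropped. A two-message toy example (file name invented):

```python
chat = [
    {"role": "user", "content": "count the people", "media": ["people.jpg"]},
    {"role": "assistant", "content": "ok"},
]
int_chat = [
    (
        {"role": c["role"], "content": c["content"], "media": c["media"]}
        if "media" in c
        else {"role": c["role"], "content": c["content"]}
    )
    for c in chat
]
# int_chat[0] still carries its "media" list; int_chat[1] gains no "media" key
```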
@@ -599,13 +683,45 @@ class VisionAgent(Agent):
                          "status": "started",
                      }
                  )
-                 plan_i = write_plan(
+                 plans = write_plans(
                      int_chat,
                      T.TOOL_DESCRIPTIONS,
                      format_memory(working_memory),
                      self.planner,
                  )
-                 plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
+
+                 if self.verbosity >= 1:
+                     for p in plans:
+                         _LOGGER.info(
+                             f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                         )
+
+                 tool_infos = retrieve_tools(
+                     plans,
+                     self.tool_recommender,
+                     self.log_progress,
+                     self.verbosity,
+                 )
+                 best_plan, tool_output_str = pick_plan(
+                     int_chat,
+                     plans,
+                     tool_infos["all"],
+                     self.coder,
+                     code_interpreter,
+                     verbosity=self.verbosity,
+                 )
+
+                 if best_plan in plans and best_plan in tool_infos:
+                     plan_i = plans[best_plan]
+                     tool_info = tool_infos[best_plan]
+                 else:
+                     if self.verbosity >= 1:
+                         _LOGGER.warning(
+                             f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
+                         )
+                     k = list(plans.keys())[0]
+                     plan_i = plans[k]
+                     tool_info = tool_infos[k]

                  self.log_progress(
                      {
@@ -616,18 +732,16 @@
                  )
                  if self.verbosity >= 1:
                      _LOGGER.info(
-                         f"\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                         f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                      )

-                 tool_info = retrieve_tools(
-                     plan_i,
-                     self.tool_recommender,
-                     self.log_progress,
-                     self.verbosity,
-                 )
                  results = write_and_test_code(
-                     chat=int_chat,
+                     chat=[
+                         {"role": c["role"], "content": c["content"]} for c in int_chat
+                     ],
+                     plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
                      tool_info=tool_info,
+                     tool_output=tool_output_str,
                      tool_utils=T.UTILITIES_DOCSTRING,
                      working_memory=working_memory,
                      coder=self.coder,
@@ -644,35 +758,6 @@
                  working_memory.extend(results["working_memory"])  # type: ignore
                  plan.append({"code": code, "test": test, "plan": plan_i})

-                 if not self_reflection:
-                     break
-
-                 self.log_progress(
-                     {
-                         "type": "self_reflection",
-                         "status": "started",
-                     }
-                 )
-                 reflection = reflect(
-                     int_chat,
-                     FULL_TASK.format(
-                         user_request=chat[0]["content"], subtasks=plan_i_str
-                     ),
-                     code,
-                     self.planner,
-                 )
-                 if self.verbosity > 0:
-                     _LOGGER.info(f"Reflection: {reflection}")
-                 feedback = cast(str, reflection["feedback"])
-                 success = cast(bool, reflection["success"])
-                 self.log_progress(
-                     {
-                         "type": "self_reflection",
-                         "status": "completed" if success else "failed",
-                         "payload": reflection,
-                     }
-                 )
-                 working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
                  retries += 1

              execution_result = cast(Execution, results["test_result"])
{vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/agent/vision_agent_prompts.py
@@ -19,7 +19,7 @@ FEEDBACK = """


  PLAN = """
- **Context**
+ **Context**:
  {context}

  **Tools Available**:
@@ -29,23 +29,110 @@ PLAN = """
  {feedback}

  **Instructions**:
- 1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
- 2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
+ 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
+ 2. Output three different plans, each utilizing a different strategy or tool.

  Output a list of jsons in the following format

  ```json
  {{
-     "plan":
+     "plan1":
          [
              {{
                  "instructions": str # what you should do in this task associated with a tool
              }}
-         ]
+         ],
+     "plan2": ...,
+     "plan3": ...
  }}
  ```
  """

+
+ TEST_PLANS = """
+ **Role**: You are a software programmer responsible for testing different tools.
+
+ **Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
+
+ **Documentation**:
+ This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+ {docstring}
+
+ **Plans**:
+ {plans}
+
+ {previous_attempts}
+
+ **Instructions**:
+ 1. Write a program to load the media, call each tool, and save its output.
+ 2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove any array types from the printed dictionary.
+ 3. Print this final dictionary.
+
+ **Example**:
+ plan1:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
+ plan2:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
+ - Count the number of detected objects labeled as 'person'.
+ plan3:
+ - Load the image from the provided file path 'image.jpg'.
+ - Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
+
+ ```python
+ from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
+ image = load_image("image.jpg")
+ owl_v2_out = owl_v2("person", image)
+
+ gsam_out = grounding_sam("person", image)
+ gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
+
+ loca_out = loca_zero_shot_counting(image)
+ loca_out = loca_out["count"]
+
+ final_out = {{"owl_v2": owl_v2_out, "grounding_sam": gsam_out, "loca_zero_shot_counting": loca_out}}
+ print(final_out)
+ ```
+ """
+
+
+ PREVIOUS_FAILED = """
+ **Previous Failed Attempts**:
+ You previously ran this code:
+ ```python
+ {code}
+ ```
+
+ But got the following error or no stdout:
+ {error}
+ """
+
+
+ PICK_PLAN = """
+ **Role**: You are a software programmer.
+
+ **Task**: Your responsibility is to pick the best plan from the three plans provided.
+
+ **Context**:
+ {context}
+
+ **Plans**:
+ {plans}
+
+ **Tool Output**:
+ {tool_output}
+
+ **Instructions**:
+ 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
+ 2. Output a JSON object with the following format:
+ {{
+     "thoughts": str # your thought process for choosing the best plan
+     "best_plan": str # the best plan you have chosen
+ }}
+ """
+
  CODE = """
  **Role**: You are a software programmer.

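PICK_PLAN asks the model for a JSON object, which the agent reads back via `extract_json` (not shown in this diff). A hedged sketch of what such a parser has to handle for a typical fenced reply; this is illustrative, not the package's actual implementation:

```python
import json

def parse_best_plan(reply: str) -> str:
    """Illustrative only: pull the JSON object out of a model reply that may
    wrap it in a ```json fence, then read the "best_plan" key."""
    text = reply.strip()
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    data = json.loads(text)
    return data["best_plan"]

print(parse_best_plan('```json\n{"thoughts": "plan1 found 3 people", "best_plan": "plan1"}\n```'))
# -> plan1
```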
@@ -64,6 +151,9 @@ This is the documentation for the functions you have access to. You may call any
  **User Instructions**:
  {question}

+ **Tool Output**:
+ {tool_output}
+
  **Previous Feedback**:
  {feedback}

@@ -72,7 +162,6 @@ This is the documentation for the functions you have access to. You may call any
  2. **Algorithm/Method Selection**: Decide on the most efficient way.
  3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
  4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
- 5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools import *`. Use a debug flag in the function parameters to toggle logging on and off.
  """

  TEST = """
@@ -147,7 +236,6 @@ print(found_text)
  ```
  """

-
  SIMPLE_TEST = """
  **Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.

{vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/lmm/lmm.py
@@ -1,4 +1,5 @@
  import base64
+ import io
  import json
  import logging
  import os
@@ -8,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast

  import requests
  from openai import AzureOpenAI, OpenAI
+ from PIL import Image

  import vision_agent.tools as T
  from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
@@ -15,12 +17,40 @@ from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
  _LOGGER = logging.getLogger(__name__)


- def encode_image(image: Union[str, Path]) -> str:
-     with open(image, "rb") as f:
-         encoded_image = base64.b64encode(f.read()).decode("utf-8")
+ def encode_image_bytes(image: bytes) -> str:
+     image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
+     buffer = io.BytesIO()
+     image.save(buffer, format="PNG")  # type: ignore
+     encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
      return encoded_image


+ def encode_media(media: Union[str, Path]) -> str:
+     extension = "png"
+     extension = Path(media).suffix
+     if extension.lower() not in {
+         ".jpg",
+         ".jpeg",
+         ".png",
+         ".webp",
+         ".bmp",
+         ".mp4",
+         ".mov",
+     }:
+         raise ValueError(f"Unsupported image extension: {extension}")
+
+     image_bytes = b""
+     if extension.lower() in {".mp4", ".mov"}:
+         frames = T.extract_frames(media)
+         image = frames[len(frames) // 2]
+         buffer = io.BytesIO()
+         Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
+         image_bytes = buffer.getvalue()
+     else:
+         image_bytes = open(media, "rb").read()
+     return encode_image_bytes(image_bytes)
+
+
  TextOrImage = Union[str, List[Union[str, Path]]]
  Message = Dict[str, TextOrImage]

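To illustrate how the new helpers feed the `data:image/png` URLs used below, here is a small usage sketch (the file name is invented; any supported still-image extension takes the same path):

```python
import base64
import io

from PIL import Image

def encode_image_bytes(image: bytes) -> str:
    # normalize any supported input to base64-encoded PNG
    img = Image.open(io.BytesIO(image)).convert("RGB")
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

# invented example file; .jpg/.png/.webp/.bmp are all re-encoded the same way
with open("people.jpg", "rb") as f:
    encoded = encode_image_bytes(f.read())
url = f"data:image/png;base64,{encoded}"  # matches the hardcoded image/png below
```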
@@ -54,7 +84,7 @@ class OpenAILMM(LMM):
          self,
          model_name: str = "gpt-4o",
          api_key: Optional[str] = None,
-         max_tokens: int = 1024,
+         max_tokens: int = 4096,
          json_mode: bool = False,
          **kwargs: Any,
      ):
@@ -97,20 +127,14 @@ class OpenAILMM(LMM):
              fixed_c = {"role": c["role"]}
              fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
              if "media" in c:
-                 for image in c["media"]:
-                     extension = Path(image).suffix
-                     if extension.lower() == ".jpeg" or extension.lower() == ".jpg":
-                         extension = "jpg"
-                     elif extension.lower() == ".png":
-                         extension = "png"
-                     else:
-                         raise ValueError(f"Unsupported image extension: {extension}")
-                     encoded_image = encode_image(image)
+                 for media in c["media"]:
+                     encoded_media = encode_media(media)
+
                      fixed_c["content"].append(  # type: ignore
                          {
                              "type": "image_url",
                              "image_url": {
-                                 "url": f"data:image/{extension};base64,{encoded_image}",  # type: ignore
+                                 "url": f"data:image/png;base64,{encoded_media}",  # type: ignore
                                  "detail": "low",
                              },
                          },
@@ -138,13 +162,12 @@ class OpenAILMM(LMM):
          ]
          if media and len(media) > 0:
              for m in media:
-                 extension = Path(m).suffix
-                 encoded_image = encode_image(m)
+                 encoded_media = encode_media(m)
                  message[0]["content"].append(
                      {
                          "type": "image_url",
                          "image_url": {
-                             "url": f"data:image/{extension};base64,{encoded_image}",
+                             "url": f"data:image/png;base64,{encoded_media}",
                              "detail": "low",
                          },
                      },
@@ -241,7 +264,7 @@ class AzureOpenAILMM(OpenAILMM):
          api_key: Optional[str] = None,
          api_version: str = "2024-02-01",
          azure_endpoint: Optional[str] = None,
-         max_tokens: int = 1024,
+         max_tokens: int = 4096,
          json_mode: bool = False,
          **kwargs: Any,
      ):
@@ -312,7 +335,7 @@ class OllamaLMM(LMM):
          fixed_chat = []
          for message in chat:
              if "media" in message:
-                 message["images"] = [encode_image(m) for m in message["media"]]
+                 message["images"] = [encode_media(m) for m in message["media"]]
                  del message["media"]
              fixed_chat.append(message)
          url = f"{self.url}/chat"
@@ -343,7 +366,7 @@ class OllamaLMM(LMM):
          json_data = json.dumps(data)
          if media and len(media) > 0:
              for m in media:
-                 data["images"].append(encode_image(m))  # type: ignore
+                 data["images"].append(encode_media(m))  # type: ignore

          response = requests.post(url, data=json_data)

{vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/utils/execute.py
@@ -362,8 +362,10 @@ class Execution(BaseModel):
          return Execution(
              error=Error(
                  name=exec.__class__.__name__,
-                 value=str(exec),
-                 traceback_raw=traceback_raw,
+                 value=_remove_escape_and_color_codes(str(exec)),
+                 traceback_raw=[
+                     _remove_escape_and_color_codes(line) for line in traceback_raw
+                 ],
              )
          )

@@ -378,8 +380,11 @@ class Execution(BaseModel):
              error=(
                  Error(
                      name=exec.error.name,
-                     value=exec.error.value,
-                     traceback_raw=exec.error.traceback_raw,
+                     value=_remove_escape_and_color_codes(exec.error.value),
+                     traceback_raw=[
+                         _remove_escape_and_color_codes(line)
+                         for line in exec.error.traceback_raw
+                     ],
                  )
                  if exec.error
                  else None
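`_remove_escape_and_color_codes` itself is not shown in this diff. A helper like this typically strips ANSI escape sequences with a regex, along these lines (an assumed implementation, not the package's exact code):

```python
import re

# matches ANSI CSI sequences such as color codes ("\x1b[31m") and resets ("\x1b[0m")
_ANSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]")

def remove_escape_and_color_codes(text: str) -> str:
    return _ANSI_RE.sub("", text)

print(remove_escape_and_color_codes("\x1b[31mTraceback\x1b[0m (most recent call last)"))
# -> Traceback (most recent call last)
```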
{vision_agent-0.2.80 → vision_agent-0.2.82}/vision_agent/tools/__init__.py
@@ -11,19 +11,19 @@ from .tools import (
      clip,
      closest_box_distance,
      closest_mask_distance,
+     depth_anything_v2,
+     detr_segmentation,
+     dpt_hybrid_midas,
      extract_frames,
      florencev2_image_caption,
-     get_tool_documentation,
      florencev2_object_detection,
-     detr_segmentation,
-     depth_anything_v2,
-     generate_soft_edge_image,
-     dpt_hybrid_midas,
+     florencev2_roberta_vqa,
      generate_pose_image,
+     generate_soft_edge_image,
+     get_tool_documentation,
      git_vqa_v2,
      grounding_dino,
      grounding_sam,
-     florencev2_roberta_vqa,
      load_image,
      loca_visual_prompt_counting,
      loca_zero_shot_counting,