vision-agent 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
1
  import copy
2
+ import difflib
2
3
  import json
3
4
  import logging
4
5
  import sys
6
+ import tempfile
5
7
  from pathlib import Path
6
- from typing import Any, Callable, Dict, List, Optional, Union, cast
8
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
7
9
 
10
+ from PIL import Image
8
11
  from rich.console import Console
9
12
  from rich.style import Style
10
13
  from rich.syntax import Syntax
@@ -14,7 +17,6 @@ import vision_agent.tools as T
14
17
  from vision_agent.agent import Agent
15
18
  from vision_agent.agent.vision_agent_prompts import (
16
19
  CODE,
17
- FEEDBACK,
18
20
  FIX_BUG,
19
21
  FULL_TASK,
20
22
  PLAN,
@@ -37,17 +39,27 @@ _CONSOLE = Console()
37
39
  _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
38
40
 
39
41
 
40
- def format_memory(memory: List[Dict[str, str]]) -> str:
41
- return FEEDBACK.format(
42
- feedback="\n".join(
43
- [
44
- f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
45
- for i, m in enumerate(memory)
46
- ]
42
+ def get_diff(before: str, after: str) -> str:
43
+ return "".join(
44
+ difflib.unified_diff(
45
+ before.splitlines(keepends=True), after.splitlines(keepends=True)
47
46
  )
48
47
  )
49
48
 
50
49
 
50
+ def format_memory(memory: List[Dict[str, str]]) -> str:
51
+ output_str = ""
52
+ for i, m in enumerate(memory):
53
+ output_str += f"### Feedback {i}:\n"
54
+ output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
55
+ output_str += f"Feedback {i}: {m['feedback']}\n\n"
56
+ if "edits" in m:
57
+ output_str += f"Edits {i}:\n{m['edits']}\n"
58
+ output_str += "\n"
59
+
60
+ return output_str
61
+
62
+
51
63
  def extract_code(code: str) -> str:
52
64
  if "\n```python" in code:
53
65
  start = "\n```python"
@@ -78,12 +90,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
78
90
  return json_dict # type: ignore
79
91
 
80
92
 
93
+ def extract_image(
94
+ media: Optional[Sequence[Union[str, Path]]]
95
+ ) -> Optional[Sequence[Union[str, Path]]]:
96
+ if media is None:
97
+ return None
98
+
99
+ new_media = []
100
+ for m in media:
101
+ m = Path(m)
102
+ extension = m.suffix
103
+ if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
104
+ new_media.append(m)
105
+ elif extension in [".mp4", ".mov"]:
106
+ frames = T.extract_frames(m)
107
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
108
+ if len(frames) > 0:
109
+ Image.fromarray(frames[0][0]).save(tmp.name)
110
+ new_media.append(Path(tmp.name))
111
+ if len(new_media) == 0:
112
+ return None
113
+ return new_media
114
+
115
+
81
116
  def write_plan(
82
117
  chat: List[Dict[str, str]],
83
118
  tool_desc: str,
84
119
  working_memory: str,
85
120
  model: Union[LLM, LMM],
86
- media: Optional[List[Union[str, Path]]] = None,
121
+ media: Optional[Sequence[Union[str, Path]]] = None,
87
122
  ) -> List[Dict[str, str]]:
88
123
  chat = copy.deepcopy(chat)
89
124
  if chat[-1]["role"] != "user":
@@ -94,6 +129,7 @@ def write_plan(
94
129
  prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
95
130
  chat[-1]["content"] = prompt
96
131
  if isinstance(model, OpenAILMM):
132
+ media = extract_image(media)
97
133
  return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
98
134
  else:
99
135
  return extract_json(model.chat(chat))["plan"] # type: ignore
@@ -103,7 +139,7 @@ def reflect(
103
139
  chat: List[Dict[str, str]],
104
140
  plan: str,
105
141
  code: str,
106
- model: LLM,
142
+ model: Union[LLM, LMM],
107
143
  ) -> Dict[str, Union[str, bool]]:
108
144
  chat = copy.deepcopy(chat)
109
145
  if chat[-1]["role"] != "user":
@@ -120,7 +156,7 @@ def write_and_test_code(
120
156
  task: str,
121
157
  tool_info: str,
122
158
  tool_utils: str,
123
- working_memory: str,
159
+ working_memory: List[Dict[str, str]],
124
160
  coder: LLM,
125
161
  tester: LLM,
126
162
  debugger: LLM,
@@ -137,7 +173,13 @@ def write_and_test_code(
137
173
  }
138
174
  )
139
175
  code = extract_code(
140
- coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
176
+ coder(
177
+ CODE.format(
178
+ docstring=tool_info,
179
+ question=task,
180
+ feedback=format_memory(working_memory),
181
+ )
182
+ )
141
183
  )
142
184
  test = extract_code(
143
185
  tester(
@@ -180,7 +222,7 @@ def write_and_test_code(
180
222
  )
181
223
 
182
224
  count = 0
183
- new_working_memory = []
225
+ new_working_memory: List[Dict[str, str]] = []
184
226
  while not result.success and count < max_retries:
185
227
  log_progress(
186
228
  {
@@ -191,14 +233,28 @@ def write_and_test_code(
191
233
  fixed_code_and_test = extract_json(
192
234
  debugger(
193
235
  FIX_BUG.format(
194
- code=code, tests=test, result=result.text(), feedback=working_memory
236
+ code=code,
237
+ tests=test,
238
+ result="\n".join(result.text().splitlines()[-50:]),
239
+ feedback=format_memory(working_memory + new_working_memory),
195
240
  )
196
241
  )
197
242
  )
243
+ old_code = code
244
+ old_test = test
245
+
198
246
  if fixed_code_and_test["code"].strip() != "":
199
247
  code = extract_code(fixed_code_and_test["code"])
200
248
  if fixed_code_and_test["test"].strip() != "":
201
249
  test = extract_code(fixed_code_and_test["test"])
250
+
251
+ new_working_memory.append(
252
+ {
253
+ "code": f"{code}\n{test}",
254
+ "feedback": fixed_code_and_test["reflections"],
255
+ "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
256
+ }
257
+ )
202
258
  log_progress(
203
259
  {
204
260
  "type": "code",
@@ -209,9 +265,6 @@ def write_and_test_code(
209
265
  },
210
266
  }
211
267
  )
212
- new_working_memory.append(
213
- {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
214
- )
215
268
 
216
269
  result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
217
270
  log_progress(
@@ -309,7 +362,7 @@ class VisionAgent(Agent):
309
362
 
310
363
  def __init__(
311
364
  self,
312
- planner: Optional[LLM] = None,
365
+ planner: Optional[Union[LLM, LMM]] = None,
313
366
  coder: Optional[LLM] = None,
314
367
  tester: Optional[LLM] = None,
315
368
  debugger: Optional[LLM] = None,
@@ -459,7 +512,7 @@ class VisionAgent(Agent):
459
512
  ),
460
513
  tool_info=tool_info,
461
514
  tool_utils=T.UTILITIES_DOCSTRING,
462
- working_memory=format_memory(working_memory),
515
+ working_memory=working_memory,
463
516
  coder=self.coder,
464
517
  tester=self.tester,
465
518
  debugger=self.debugger,
@@ -503,6 +556,8 @@ class VisionAgent(Agent):
503
556
  working_memory.append(
504
557
  {"code": f"{code}\n{test}", "feedback": feedback}
505
558
  )
559
+ else:
560
+ break
506
561
 
507
562
  retries += 1
508
563
 
@@ -29,14 +29,17 @@ PLAN = """
29
29
  {feedback}
30
30
 
31
31
  **Instructions**:
32
- Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
32
+ 1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
33
+ 2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
34
+
35
+ Output a list of jsons in the following format
33
36
 
34
37
  ```json
35
38
  {{
36
39
  "plan":
37
40
  [
38
41
  {{
39
- "instructions": str # what you should do in this task, one short phrase or sentence
42
+ "instructions": str # what you should do in this task associated with a tool
40
43
  }}
41
44
  ]
42
45
  }}
@@ -194,9 +197,7 @@ When we run this test code:
194
197
  ```
195
198
 
196
199
  It raises this error:
197
- ```python
198
200
  {result}
199
- ```
200
201
 
201
202
  This is previous feedback provided on the code:
202
203
  {feedback}
@@ -199,14 +199,15 @@ def extract_frames(
199
199
 
200
200
  def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
201
201
  """'ocr' extracts text from an image. It returns a list of detected text, bounding
202
- boxes, and confidence scores. The results are sorted from top-left to bottom right
202
+ boxes with normalized coordinates, and confidence scores. The results are sorted
203
+ from top-left to bottom right.
203
204
 
204
205
  Parameters:
205
206
  image (np.ndarray): The image to extract text from.
206
207
 
207
208
  Returns:
208
- List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
209
- and confidence score.
209
+ List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
210
+ with normalized coordinates, and confidence score.
210
211
 
211
212
  Example
212
213
  -------
@@ -607,6 +608,7 @@ def overlay_bounding_boxes(
607
608
  label: COLORS[i % len(COLORS)]
608
609
  for i, label in enumerate(set([box["label"] for box in bboxes]))
609
610
  }
611
+ bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
610
612
 
611
613
  width, height = pil_image.size
612
614
  fontsize = max(12, int(min(width, height) / 40))
@@ -679,6 +681,7 @@ def overlay_segmentation_masks(
679
681
  label: COLORS[i % len(COLORS)]
680
682
  for i, label in enumerate(set([mask["label"] for mask in masks]))
681
683
  }
684
+ masks = sorted(masks, key=lambda x: x["label"], reverse=True)
682
685
 
683
686
  for elt in masks:
684
687
  mask = elt["mask"]
@@ -2,8 +2,8 @@ import base64
2
2
  import logging
3
3
  import math
4
4
  import os
5
- from concurrent.futures import ProcessPoolExecutor, as_completed
6
5
  import tempfile
6
+ from concurrent.futures import ProcessPoolExecutor, as_completed
7
7
  from typing import List, Tuple, cast
8
8
 
9
9
  import cv2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.44
3
+ Version: 0.2.46
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
11
11
  vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
12
12
  vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
13
13
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
14
- vision_agent/agent/vision_agent.py,sha256=JtPDIiLINXm3jBR0LbqblfB9yCv-8M-B7XRx1EPDhFU,18749
15
- vision_agent/agent/vision_agent_prompts.py,sha256=FnIYF2Fe3joRvFnOJD9ZyWXMihMyL606nXxWJ0adTZ8,8314
14
+ vision_agent/agent/vision_agent.py,sha256=S0VJWsdr0NIYjikXvPrEX-njGMqOIA53r4Q4NYY0Lpo,20365
15
+ vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
16
16
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
18
18
  vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
23
23
  vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
24
24
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
25
25
  vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
26
- vision_agent/tools/tools.py,sha256=PhmJ0kQeZ-tSQ675HI8QnR49zlH6nJ_opt6QS4dNSVA,25889
26
+ vision_agent/tools/tools.py,sha256=SrNrIjyUKoTE3mCqGcy6nC-MeEzJ8uJCumlSkTvvPpg,26085
27
27
  vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
28
28
  vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
29
29
  vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
30
30
  vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
31
31
  vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
32
- vision_agent/utils/video.py,sha256=EuJJ7Owi3pIV-q3WcZ-LaaTrGAmmZ8YAA22rmEkY7GI,8885
33
- vision_agent-0.2.44.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
- vision_agent-0.2.44.dist-info/METADATA,sha256=EbnJiKZzbAgeCN30GRMYfMPN5w_wo9XBkuhWEP_0cN8,6817
35
- vision_agent-0.2.44.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
36
- vision_agent-0.2.44.dist-info/RECORD,,
32
+ vision_agent/utils/video.py,sha256=_u3UrEpcJzbclKyJYxF7SiDQGhE2gUc598diYYiEv34,8885
33
+ vision_agent-0.2.46.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
34
+ vision_agent-0.2.46.dist-info/METADATA,sha256=FOlKABAkLUX8oqtjeE2q9EO6j8yeoiwyw3lWUpIe0ow,6817
35
+ vision_agent-0.2.46.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
36
+ vision_agent-0.2.46.dist-info/RECORD,,