vision_agent-0.2.49-py3-none-any.whl → vision_agent-0.2.51-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
--- a/vision_agent/agent/easytool_v2.py
+++ b/vision_agent/agent/easytool_v2.py
@@ -428,12 +428,12 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
 
 
 class EasyToolV2(Agent):
-    r"""EasyToolV2 is an agent framework that utilizes tools as well as self
-    reflection to accomplish tasks, in particular vision tasks. EasyToolV2 is based
-    off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion
-    https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then
-    reflect on whether or not it was able to accomplish the task based off of the plan
-    and final results, if not it will redo the task with this newly added reflection.
+    """EasyToolV2 is an agent framework that utilizes tools as well as self reflection
+    to accomplish tasks, in particular vision tasks. EasyToolV2 is based off of EasyTool
+    https://arxiv.org/abs/2401.06201 and Reflexion https://arxiv.org/abs/2303.11366
+    where it will attempt to complete a task and then reflect on whether or not it was
+    able to accomplish the task based off of the plan and final results, if not it will
+    redo the task with this newly added reflection.
 
     Example
     -------
@@ -461,7 +461,10 @@ class EasyToolV2(Agent):
             reflect_model: the model to use for self reflection.
             max_retries: maximum number of retries to attempt to complete the task.
             verbose: whether to print more logs.
-            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple EasyToolV2 instances are running in parallel. This callback ensures that the progress are not mixed up.
+            report_progress_callback: a callback to report the progress of the agent.
+                This is useful for streaming logs in a web application where multiple
+                EasyToolV2 instances are running in parallel. This callback ensures
+                that the progress are not mixed up.
         """
         self.task_model = (
             OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
@@ -495,9 +498,10 @@ class EasyToolV2(Agent):
         """Invoke the vision agent.
 
         Parameters:
-            chat: A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
-            image: The input image referenced in the chat parameter.
+            input: A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                containing just the content.
+            media: The input media referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -549,7 +553,7 @@ class EasyToolV2(Agent):
         Parameters:
             chat: A conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-            image: The input image referenced in the chat parameter.
+            media: The media image referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -558,9 +562,8 @@ class EasyToolV2(Agent):
             self_reflection: boolean to enable and disable self reflection.
 
         Returns:
-            A tuple where the first item is the final answer and the second item is a
-            list of all the tool results. The last item in the tool results also
-            contains the visualized output.
+            Tuple[str, List[Dict]]: A tuple where the first item is the final answer
+                and the second item is a list of all the tool results.
         """
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
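Note: the EasyToolV2 docstring changes above track a signature change — `chat`/`image` become `input`/`media`, and `input` may now be a plain string instead of a chat list. A minimal usage sketch under those assumptions (import path taken from the RECORD at the bottom of this diff; constructor arguments are illustrative):

    from vision_agent.agent.easytool_v2 import EasyToolV2  # path per the RECORD below

    agent = EasyToolV2(verbose=True)

    # Both input forms should be accepted after this change:
    answer = agent("Count the cars in the image", media="cars.jpg")
    answer = agent(
        [{"role": "user", "content": "Count the cars in the image"}],
        media="cars.jpg",
    )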
--- a/vision_agent/agent/reflexion.py
+++ b/vision_agent/agent/reflexion.py
@@ -144,7 +144,7 @@ class Reflexion(Agent):
 
         Parameters:
             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
-            image: the input image referenced in the prompt parameter.
+            media: the input media referenced in the prompt parameter.
 
         Returns:
             A text response.
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -36,11 +36,25 @@ logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
 _MAX_TABULATE_COL_WIDTH = 80
 _CONSOLE = Console()
-_DEFAULT_IMPORT = "\n".join(T.__new_tools__) + "\n".join(
-    [
+
+
+class DefaultImports:
+    """Container for default imports used in the code execution."""
+
+    common_imports = [
         "from typing import *",
     ]
-)
+
+    @staticmethod
+    def to_code_string() -> str:
+        return "\n".join(DefaultImports.common_imports + T.__new_tools__)
+
+    @staticmethod
+    def prepend_imports(code: str) -> str:
+        """Run this method to prepend the default imports to the code.
+        NOTE: be sure to run this method after the custom tools have been registered.
+        """
+        return DefaultImports.to_code_string() + "\n\n" + code
 
 
 def get_diff(before: str, after: str) -> str:
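Note: the one-shot `_DEFAULT_IMPORT` string is replaced by a `DefaultImports` container whose `to_code_string` re-reads `T.__new_tools__` on every call, so tools registered after module import are picked up. A minimal standalone sketch of that behavior, with `T` stubbed out purely for illustration:

    from typing import List

    class T:  # stand-in for vision_agent.tools; __new_tools__ holds registered tool imports
        __new_tools__: List[str] = []

    class DefaultImports:
        common_imports = ["from typing import *"]

        @staticmethod
        def to_code_string() -> str:
            return "\n".join(DefaultImports.common_imports + T.__new_tools__)

        @staticmethod
        def prepend_imports(code: str) -> str:
            return DefaultImports.to_code_string() + "\n\n" + code

    print(DefaultImports.prepend_imports("x = 1"))
    # from typing import *
    #
    # x = 1

    T.__new_tools__.append("from my_pkg import my_tool")  # a tool registered later
    print(DefaultImports.to_code_string())
    # from typing import *
    # from my_pkg import my_tool

This is why the docstring warns to call `prepend_imports` only after custom tools have been registered, and why the `global _DEFAULT_IMPORT` re-grab further down could be deleted.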
@@ -202,18 +216,20 @@ def write_and_test_code(
             "type": "code",
             "status": "running",
             "payload": {
-                "code": code,
+                "code": DefaultImports.prepend_imports(code),
                 "test": test,
             },
         }
     )
-    result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
+    result = code_interpreter.exec_isolation(
+        f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+    )
     log_progress(
         {
             "type": "code",
             "status": "completed" if result.success else "failed",
             "payload": {
-                "code": code,
+                "code": DefaultImports.prepend_imports(code),
                 "test": test,
                 "result": result.to_json(),
             },
@@ -264,19 +280,21 @@ def write_and_test_code(
                 "type": "code",
                 "status": "running",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                 },
             }
         )
 
-        result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
+        result = code_interpreter.exec_isolation(
+            f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+        )
         log_progress(
             {
                 "type": "code",
                 "status": "completed" if result.success else "failed",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                     "result": result.to_json(),
                 },
@@ -307,7 +325,14 @@ def write_and_test_code(
 def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
     _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
     _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
-    _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
+    _CONSOLE.print(
+        Syntax(
+            DefaultImports.prepend_imports(code),
+            "python",
+            theme="gruvbox-dark",
+            line_numbers=True,
+        )
+    )
     if test:
         _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
         _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
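Note: all three call sites above now route the logged and printed code through `DefaultImports.prepend_imports`, and the executed string is built from the same parts. A sketch of the composition (illustrative variable names, not the library's literal code):

    to_execute = "\n".join([
        DefaultImports.to_code_string(),  # "from typing import *" plus registered tool imports
        code,                             # generated solution code
        test,                             # generated test code
    ])
    result = code_interpreter.exec_isolation(to_execute)

so what gets logged carries the same import header as what actually runs.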
@@ -417,10 +442,10 @@ class VisionAgent(Agent):
         """Chat with Vision Agent and return intermediate information regarding the task.
 
         Parameters:
-            chat (List[Dict[str, str]]): A conversation in the format of
-                [{"role": "user", "content": "describe your task here..."}].
+            input (Union[List[Dict[str, str]], str]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
-            self_reflection (bool): Whether to reflect on the task and debug the code.
 
         Returns:
             str: The code output by the Vision Agent.
@@ -446,7 +471,8 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}].
             media (Optional[Union[str, Path]]): The media file to be used in the task.
             self_reflection (bool): Whether to reflect on the task and debug the code.
-            show_visualization (bool): If True, it opens a new window locally to show the image(s) created by visualization code (if there is any).
+            display_visualization (bool): If True, it opens a new window locally to
+                show the image(s) created by visualization code (if there is any).
 
         Returns:
             Dict[str, Any]: A dictionary containing the code, test, test result, plan,
@@ -464,10 +490,6 @@ class VisionAgent(Agent):
             if chat_i["role"] == "user":
                 chat_i["content"] += f" Image name {media}"
 
-        # re-grab custom tools
-        global _DEFAULT_IMPORT
-        _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
-
         code = ""
         test = ""
         working_memory: List[Dict[str, str]] = []
@@ -531,38 +553,35 @@ class VisionAgent(Agent):
                 working_memory.extend(results["working_memory"])  # type: ignore
                 plan.append({"code": code, "test": test, "plan": plan_i})
 
-                if self_reflection:
-                    self.log_progress(
-                        {
-                            "type": "self_reflection",
-                            "status": "started",
-                        }
-                    )
-                    reflection = reflect(
-                        chat,
-                        FULL_TASK.format(
-                            user_request=chat[0]["content"], subtasks=plan_i_str
-                        ),
-                        code,
-                        self.planner,
-                    )
-                    if self.verbosity > 0:
-                        _LOGGER.info(f"Reflection: {reflection}")
-                    feedback = cast(str, reflection["feedback"])
-                    success = cast(bool, reflection["success"])
-                    self.log_progress(
-                        {
-                            "type": "self_reflection",
-                            "status": "completed" if success else "failed",
-                            "payload": reflection,
-                        }
-                    )
-                    working_memory.append(
-                        {"code": f"{code}\n{test}", "feedback": feedback}
-                    )
-                else:
+                if not self_reflection:
                     break
 
+                self.log_progress(
+                    {
+                        "type": "self_reflection",
+                        "status": "started",
+                    }
+                )
+                reflection = reflect(
+                    chat,
+                    FULL_TASK.format(
+                        user_request=chat[0]["content"], subtasks=plan_i_str
+                    ),
+                    code,
+                    self.planner,
+                )
+                if self.verbosity > 0:
+                    _LOGGER.info(f"Reflection: {reflection}")
+                feedback = cast(str, reflection["feedback"])
+                success = cast(bool, reflection["success"])
+                self.log_progress(
+                    {
+                        "type": "self_reflection",
+                        "status": "completed" if success else "failed",
+                        "payload": reflection,
+                    }
+                )
+                working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
                 retries += 1
 
             execution_result = cast(Execution, results["test_result"])
@@ -571,7 +590,7 @@ class VisionAgent(Agent):
                 "type": "final_code",
                 "status": "completed" if success else "failed",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                     "result": execution_result.to_json(),
                 },
@@ -586,7 +605,7 @@ class VisionAgent(Agent):
                     play_video(res.mp4)
 
         return {
-            "code": code,
+            "code": DefaultImports.prepend_imports(code),
             "test": test,
             "test_result": execution_result,
             "plan": plan,
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -75,17 +75,18 @@ def grounding_dino(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-        bounding box of the detected objects with normalized coordinates between 0 and 1
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
-        xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
 
     Example
     -------
-    >>> grounding_dino("car. dinosaur", image)
-    [
-        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-    ]
+        >>> grounding_dino("car. dinosaur", image)
+        [
+            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
@@ -129,27 +130,27 @@ def grounding_sam(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-        bounding box, and mask of the detected objects with normalized coordinates
-        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and
-        xmax and ymax are the coordinates of the bottom-right of the bounding box.
-        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
-        the background.
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
 
     Example
    -------
-    >>> grounding_sam("car. dinosaur", image)
-    [
-        {
-            'score': 0.99,
-            'label': 'dinosaur',
-            'bbox': [0.1, 0.11, 0.35, 0.4],
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        },
-    ]
+        >>> grounding_sam("car. dinosaur", image)
+        [
+            {
+                'score': 0.99,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
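Note: both grounding tools return normalized coordinates, so consumers have to scale by the image size before working in pixels. A small consumer sketch based on the documented return shape (the 0.5 cutoff and variable names are illustrative):

    dets = grounding_sam("car. dinosaur", image)
    height, width = image.shape[:2]
    for det in dets:
        if det["score"] < 0.5:  # illustrative confidence cutoff
            continue
        xmin, ymin, xmax, ymax = det["bbox"]
        pixel_box = (int(xmin * width), int(ymin * height),
                     int(xmax * width), int(ymax * height))
        pixel_area = int(det["mask"].sum())  # the mask is a binary 2D array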
@@ -187,12 +188,12 @@ def extract_frames(
 
     Returns:
         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-        and the timestamp in seconds.
+            as a numpy array and the timestamp in seconds.
 
     Example
     -------
-    >>> extract_frames("path/to/video.mp4")
-    [(frame1, 0.0), (frame2, 0.5), ...]
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 0.5), ...]
     """
 
     return extract_frames_from_video(str(video_uri), fps)
@@ -212,10 +213,10 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> ocr(image)
-    [
-        {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-    ]
+        >>> ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
     """
 
     pil_image = Image.fromarray(image).convert("RGB")
@@ -266,9 +267,8 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Example
     -------
-    >>> zero_shot_counting(image)
-    {'count': 45},
-
+        >>> zero_shot_counting(image)
+        {'count': 45},
     """
 
     image_b64 = convert_to_b64(image)
@@ -297,9 +297,8 @@ def visual_prompt_counting(
 
     Example
     -------
-    >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-    {'count': 45},
-
+        >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+        {'count': 45},
     """
 
     image_size = get_image_size(image)
@@ -332,9 +331,8 @@ def image_question_answering(image: np.ndarray, prompt: str) -> str:
 
     Example
     -------
-    >>> image_question_answering(image, 'What is the cat doing ?')
-    'drinking milk'
-
+        >>> image_question_answering(image, 'What is the cat doing ?')
+        'drinking milk'
     """
 
     image_b64 = convert_to_b64(image)
  image_b64 = convert_to_b64(image)
@@ -363,9 +361,8 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
363
361
 
364
362
  Example
365
363
  -------
366
- >>> clip(image, ['dog', 'cat', 'bird'])
367
- {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
368
-
364
+ >>> clip(image, ['dog', 'cat', 'bird'])
365
+ {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
369
366
  """
370
367
 
371
368
  image_b64 = convert_to_b64(image)
@@ -391,9 +388,8 @@ def image_caption(image: np.ndarray) -> str:
391
388
 
392
389
  Example
393
390
  -------
394
- >>> image_caption(image)
395
- 'This image contains a cat sitting on a table with a bowl of milk.'
396
-
391
+ >>> image_caption(image)
392
+ 'This image contains a cat sitting on a table with a bowl of milk.'
397
393
  """
398
394
 
399
395
  image_b64 = convert_to_b64(image)
@@ -418,8 +414,8 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
418
414
 
419
415
  Example
420
416
  -------
421
- >>> closest_mask_distance(mask1, mask2)
422
- 0.5
417
+ >>> closest_mask_distance(mask1, mask2)
418
+ 0.5
423
419
  """
424
420
 
425
421
  mask1 = np.clip(mask1, 0, 1)
@@ -474,8 +470,8 @@ def closest_box_distance(
474
470
 
475
471
  Example
476
472
  -------
477
- >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
478
- 141.42
473
+ >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
474
+ 141.42
479
475
  """
480
476
 
481
477
  x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
@@ -499,7 +495,7 @@ def save_json(data: Any, file_path: str) -> None:
499
495
 
500
496
  Example
501
497
  -------
502
- >>> save_json(data, "path/to/file.json")
498
+ >>> save_json(data, "path/to/file.json")
503
499
  """
504
500
 
505
501
  class NumpyEncoder(json.JSONEncoder):
@@ -515,7 +511,7 @@ def save_json(data: Any, file_path: str) -> None:
515
511
 
516
512
 
517
513
  def load_image(image_path: str) -> np.ndarray:
518
- """'load_image' is a utility function that loads an image from the given path.
514
+ """'load_image' is a utility function that loads an image from the given file path string.
519
515
 
520
516
  Parameters:
521
517
  image_path (str): The path to the image.
@@ -525,9 +521,11 @@ def load_image(image_path: str) -> np.ndarray:
525
521
 
526
522
  Example
527
523
  -------
528
- >>> load_image("path/to/image.jpg")
524
+ >>> load_image("path/to/image.jpg")
529
525
  """
530
-
526
+ # NOTE: sometimes the generated code pass in a NumPy array
527
+ if isinstance(image_path, np.ndarray):
528
+ return image_path
531
529
  image = Image.open(image_path).convert("RGB")
532
530
  return np.array(image)
533
531
 
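Note: the new guard makes `load_image` a pass-through when it already receives an array, which matters when generated code feeds one tool's output into another. The same logic restated standalone (annotations omitted; this mirrors the hunk above rather than adding behavior):

    import numpy as np
    from PIL import Image

    def load_image(image_path):
        # Generated code sometimes passes an already-loaded array; return it unchanged.
        if isinstance(image_path, np.ndarray):
            return image_path
        return np.array(Image.open(image_path).convert("RGB"))

    img = load_image("photo.jpg")  # loads from disk as an RGB array
    same = load_image(img)         # pass-through: `same` is `img`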
@@ -543,8 +541,8 @@ def save_image(image: np.ndarray) -> str:
 
     Example
     -------
-    >>> save_image(image)
-    "/tmp/tmpabc123.png"
+        >>> save_image(image)
+        "/tmp/tmpabc123.png"
     """
     from IPython.display import display
 
@@ -570,8 +568,8 @@ def save_video(
 
     Example
     -------
-    >>> save_video(frames)
-    "/tmp/tmpvideo123.mp4"
+        >>> save_video(frames)
+        "/tmp/tmpvideo123.mp4"
     """
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
@@ -617,9 +615,9 @@ def overlay_bounding_boxes(
 
     Example
     -------
-    >>> image_with_bboxes = overlay_bounding_boxes(
-        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
-    )
+        >>> image_with_bboxes = overlay_bounding_boxes(
+            image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
@@ -673,18 +671,18 @@ def overlay_segmentation_masks(
 
     Example
     -------
-    >>> image_with_masks = overlay_segmentation_masks(
-        image,
-        [{
-            'score': 0.99,
-            'label': 'dinosaur',
-            'mask': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
-        }],
-    )
+        >>> image_with_masks = overlay_segmentation_masks(
+            image,
+            [{
+                'score': 0.99,
+                'label': 'dinosaur',
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            }],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
 
@@ -725,16 +723,16 @@ def overlay_heat_map(
 
     Example
     -------
-    >>> image_with_heat_map = overlay_heat_map(
-        image,
-        {
-            'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 0, 0, 0],
-                ...,
-                [0, 0, 0, ..., 0, 0, 0],
-                [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
-        },
-    )
+        >>> image_with_heat_map = overlay_heat_map(
+            image,
+            {
+                'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
+            },
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
--- a/vision_agent/utils/video.py
+++ b/vision_agent/utils/video.py
@@ -63,9 +63,9 @@ def extract_frames_from_video(
 
     Returns:
         a list of tuples containing the extracted frame and the timestamp in seconds.
-    E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
-    from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
-    the video. The frames are sorted by the timestamp in ascending order.
+        E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+        from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+        the video. The frames are sorted by the timestamp in ascending order.
     """
     with VideoFileClip(video_uri) as video:
         video_duration: float = video.duration
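Note: because each element pairs a frame with its timestamp, callers can slice a video by time rather than by frame index. A short consumer sketch (the second argument is the sampling fps, per the `extract_frames` wrapper shown earlier):

    frames = extract_frames_from_video("path/to/video.mp4", 2)
    # Keep only frames from the first ten seconds of the video.
    first_ten = [frame for frame, ts in frames if ts <= 10.0]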
--- a/vision_agent-0.2.49.dist-info/METADATA
+++ b/vision_agent-0.2.51.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.49
+Version: 0.2.51
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
--- a/vision_agent-0.2.49.dist-info/RECORD
+++ b/vision_agent-0.2.51.dist-info/RECORD
@@ -7,11 +7,11 @@ vision_agent/agent/data_interpreter.py,sha256=YlCm3DVyhCM9T6wpccWxC5XHoIj9smsEsk
 vision_agent/agent/data_interpreter_prompts.py,sha256=RDJggOfXwGaEoIcTYGX41ZEayCgYei1AootDOc_SN2g,6134
 vision_agent/agent/easytool.py,sha256=wMa9-tpAaiC4E2ONbidxmMM9YvAOw4_Sypf5mGKco_w,11526
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
-vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFIypCA,27274
+vision_agent/agent/easytool_v2.py,sha256=LY2cqzjVHBr7QMn4WsrZ7AfpWrDN0LjJIrd5tMo2-PI,27323
 vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
+vision_agent/agent/reflexion.py,sha256=scck3YcME6DhX5Vs4Wr1rYb8S4wkBUkN9UksyazfrZg,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=X_LF2wRXVYAr8xMuJs3Omi8n06uVgLNgtF25sidKtfM,20424
+vision_agent/agent/vision_agent.py,sha256=wGGISg6pDVNseF2fIAN1jH66OX2qZk2nDhuobeSNGHk,20957
 vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=Sng6dChynJJCYWjraXXM0tep_VPdnYl3L9vb0HMy_P
 vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=IuTxw-08UodemQAmiIQWdwpqg_Cjf-opGuqtYHv8nuk,26583
+vision_agent/tools/tools.py,sha256=L1_umAVxk_BlrDYEmV2eyu2cJnpieTW-Ipb03VwKqWU,27062
 vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
 vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
-vision_agent/utils/video.py,sha256=BJ9fomy2giAl038JThQP1WQZ-u4J4J_nsZB7QEWvlcQ,8767
-vision_agent-0.2.49.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.49.dist-info/METADATA,sha256=J8uaMXfLvURGCOujviCSb0aaCYOWQnAphcZHjD1bjWw,6817
-vision_agent-0.2.49.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.49.dist-info/RECORD,,
+vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
+vision_agent-0.2.51.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.51.dist-info/METADATA,sha256=xUYxi6YH3U4QTlYNWZ51YI365ER6NANcYBiVeXN4egQ,6817
+vision_agent-0.2.51.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.51.dist-info/RECORD,,