vision-agent 0.2.49__py3-none-any.whl → 0.2.51__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/easytool_v2.py +17 -14
- vision_agent/agent/reflexion.py +1 -1
- vision_agent/agent/vision_agent.py +69 -50
- vision_agent/tools/tools.py +83 -85
- vision_agent/utils/video.py +3 -3
- {vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/METADATA +1 -1
- {vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/RECORD +9 -9
- {vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/WHEEL +0 -0
vision_agent/agent/easytool_v2.py
CHANGED
@@ -428,12 +428,12 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
 
 
 class EasyToolV2(Agent):
-
-
-
-
-
-
+    """EasyToolV2 is an agent framework that utilizes tools as well as self reflection
+    to accomplish tasks, in particular vision tasks. EasyToolV2 is based off of EasyTool
+    https://arxiv.org/abs/2401.06201 and Reflexion https://arxiv.org/abs/2303.11366
+    where it will attempt to complete a task and then reflect on whether or not it was
+    able to accomplish the task based off of the plan and final results, if not it will
+    redo the task with this newly added reflection.
 
     Example
     -------
@@ -461,7 +461,10 @@ class EasyToolV2(Agent):
             reflect_model: the model to use for self reflection.
             max_retries: maximum number of retries to attempt to complete the task.
             verbose: whether to print more logs.
-            report_progress_callback: a callback to report the progress of the agent.
+            report_progress_callback: a callback to report the progress of the agent.
+                This is useful for streaming logs in a web application where multiple
+                EasyToolV2 instances are running in parallel. This callback ensures
+                that the progress are not mixed up.
         """
         self.task_model = (
             OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
@@ -495,9 +498,10 @@ class EasyToolV2(Agent):
         """Invoke the vision agent.
 
         Parameters:
-
-            [{"role": "user", "content": "describe your task here..."}]
-
+            input: A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                containing just the content.
+            media: The input media referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -549,7 +553,7 @@ class EasyToolV2(Agent):
         Parameters:
             chat: A conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-
+            media: The media image referenced in the chat parameter.
             reference_data: A dictionary containing the reference image, mask or bounding
                 box in the format of:
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
@@ -558,9 +562,8 @@ class EasyToolV2(Agent):
             self_reflection: boolean to enable and disable self reflection.
 
         Returns:
-            A tuple where the first item is the final answer
-
-            contains the visualized output.
+            Tuple[str, List[Dict]]: A tuple where the first item is the final answer
+            and the second item is a list of all the tool results.
         """
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
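The report_progress_callback parameter documented above is the hook for streaming per-instance logs. Below is a minimal sketch of wiring one up, assuming EasyToolV2 is importable from vision_agent.agent and that the callback receives progress dictionaries like the ones shown elsewhere in this diff; the instance tagging and prompt are illustrative, not part of the package.

import json
from vision_agent.agent import EasyToolV2

def make_progress_callback(instance_id: str):
    # Tag every event with its instance so parallel agents stay separable.
    def callback(event: dict) -> None:
        print(json.dumps({"instance": instance_id, **event}))
    return callback

agent = EasyToolV2(report_progress_callback=make_progress_callback("agent-1"))
answer = agent("Count the cars in the image.", media="cars.jpg")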
vision_agent/agent/reflexion.py
CHANGED
@@ -144,7 +144,7 @@ class Reflexion(Agent):
 
         Parameters:
             input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
-
+            media: the input media referenced in the prompt parameter.
 
         Returns:
             A text response.
vision_agent/agent/vision_agent.py
CHANGED
@@ -36,11 +36,25 @@ logging.basicConfig(stream=sys.stdout)
 _LOGGER = logging.getLogger(__name__)
 _MAX_TABULATE_COL_WIDTH = 80
 _CONSOLE = Console()
-
-
+
+
+class DefaultImports:
+    """Container for default imports used in the code execution."""
+
+    common_imports = [
         "from typing import *",
     ]
-
+
+    @staticmethod
+    def to_code_string() -> str:
+        return "\n".join(DefaultImports.common_imports + T.__new_tools__)
+
+    @staticmethod
+    def prepend_imports(code: str) -> str:
+        """Run this method to prepend the default imports to the code.
+        NOTE: be sure to run this method after the custom tools have been registered.
+        """
+        return DefaultImports.to_code_string() + "\n\n" + code
 
 
 def get_diff(before: str, after: str) -> str:
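The DefaultImports class added here centralizes the import preamble that generated code relies on. A self-contained sketch of the same mechanism, with a stand-in for T.__new_tools__ (which the real class reads at call time, after custom tools are registered):

# Stand-in for the registered-tool import statements held in T.__new_tools__.
new_tools = ["from my_tools import custom_detector"]
common_imports = ["from typing import *"]

def to_code_string() -> str:
    return "\n".join(common_imports + new_tools)

def prepend_imports(code: str) -> str:
    # Imports go first so the generated code can use them immediately.
    return to_code_string() + "\n\n" + code

print(prepend_imports("x: List[int] = []"))
# from typing import *
# from my_tools import custom_detector
#
# x: List[int] = []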
@@ -202,18 +216,20 @@ def write_and_test_code(
                 "type": "code",
                 "status": "running",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                 },
             }
         )
-        result = code_interpreter.exec_isolation(
+        result = code_interpreter.exec_isolation(
+            f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+        )
         log_progress(
             {
                 "type": "code",
                 "status": "completed" if result.success else "failed",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                     "result": result.to_json(),
                 },
@@ -264,19 +280,21 @@ def write_and_test_code(
                 "type": "code",
                 "status": "running",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                 },
             }
         )
 
-        result = code_interpreter.exec_isolation(
+        result = code_interpreter.exec_isolation(
+            f"{DefaultImports.to_code_string()}\n{code}\n{test}"
+        )
         log_progress(
             {
                 "type": "code",
                 "status": "completed" if result.success else "failed",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                     "result": result.to_json(),
                 },
@@ -307,7 +325,14 @@ def write_and_test_code(
 def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
     _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True))
     _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
-    _CONSOLE.print(
+    _CONSOLE.print(
+        Syntax(
+            DefaultImports.prepend_imports(code),
+            "python",
+            theme="gruvbox-dark",
+            line_numbers=True,
+        )
+    )
     if test:
         _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
         _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
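The new _print_code body wraps the prepended code in a rich Syntax object instead of printing it raw. The rendering pattern on its own, using the same theme and options as the diff (the code string is illustrative):

from rich.console import Console
from rich.syntax import Syntax

console = Console()
code = "from typing import *\n\nresult: List[int] = [1, 2, 3]"
# Renders the string as highlighted Python with line numbers.
console.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))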
@@ -417,10 +442,10 @@ class VisionAgent(Agent):
         """Chat with Vision Agent and return intermediate information regarding the task.
 
         Parameters:
-
-            [{"role": "user", "content": "describe your task here..."}]
+            input (Union[List[Dict[str, str]], str]): A conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}] or a string
+                of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
-            self_reflection (bool): Whether to reflect on the task and debug the code.
 
         Returns:
             str: The code output by the Vision Agent.
@@ -446,7 +471,8 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}].
             media (Optional[Union[str, Path]]): The media file to be used in the task.
             self_reflection (bool): Whether to reflect on the task and debug the code.
-
+            display_visualization (bool): If True, it opens a new window locally to
+                show the image(s) created by visualization code (if there is any).
 
         Returns:
             Dict[str, Any]: A dictionary containing the code, test, test result, plan,
@@ -464,10 +490,6 @@ class VisionAgent(Agent):
             if chat_i["role"] == "user":
                 chat_i["content"] += f" Image name {media}"
 
-        # re-grab custom tools
-        global _DEFAULT_IMPORT
-        _DEFAULT_IMPORT = "\n".join(T.__new_tools__)
-
         code = ""
         test = ""
         working_memory: List[Dict[str, str]] = []
@@ -531,38 +553,35 @@ class VisionAgent(Agent):
             working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
-            if self_reflection:
-                self.log_progress(
-                    {
-                        "type": "self_reflection",
-                        "status": "started",
-                    }
-                )
-                reflection = reflect(
-                    chat,
-                    FULL_TASK.format(
-                        user_request=chat[0]["content"], subtasks=plan_i_str
-                    ),
-                    code,
-                    self.planner,
-                )
-                if self.verbosity > 0:
-                    _LOGGER.info(f"Reflection: {reflection}")
-                feedback = cast(str, reflection["feedback"])
-                success = cast(bool, reflection["success"])
-                self.log_progress(
-                    {
-                        "type": "self_reflection",
-                        "status": "completed" if success else "failed",
-                        "payload": reflection,
-                    }
-                )
-                working_memory.append(
-                    {"code": f"{code}\n{test}", "feedback": feedback}
-                )
-            else:
+            if not self_reflection:
                 break
 
+            self.log_progress(
+                {
+                    "type": "self_reflection",
+                    "status": "started",
+                }
+            )
+            reflection = reflect(
+                chat,
+                FULL_TASK.format(
+                    user_request=chat[0]["content"], subtasks=plan_i_str
+                ),
+                code,
+                self.planner,
+            )
+            if self.verbosity > 0:
+                _LOGGER.info(f"Reflection: {reflection}")
+            feedback = cast(str, reflection["feedback"])
+            success = cast(bool, reflection["success"])
+            self.log_progress(
+                {
+                    "type": "self_reflection",
+                    "status": "completed" if success else "failed",
+                    "payload": reflection,
+                }
+            )
+            working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
             retries += 1
 
             execution_result = cast(Execution, results["test_result"])
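The rewrite above is a guard-clause refactor: the loop exits early when self_reflection is off, which lets the thirty-line reflection block sit one indentation level shallower with identical behavior. The shape of the change, reduced to a toy example (names are illustrative):

def process(items, self_reflection: bool) -> None:
    for item in items:
        print("working on", item)
        if not self_reflection:  # early exit replaces the old if/else nesting
            break
        print("reflecting on", item)  # stands in for the reflection block

process(["a", "b"], self_reflection=False)  # handles "a" once, then stops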
@@ -571,7 +590,7 @@ class VisionAgent(Agent):
                 "type": "final_code",
                 "status": "completed" if success else "failed",
                 "payload": {
-                    "code": code,
+                    "code": DefaultImports.prepend_imports(code),
                     "test": test,
                     "result": execution_result.to_json(),
                 },
@@ -586,7 +605,7 @@ class VisionAgent(Agent):
                 play_video(res.mp4)
 
         return {
-            "code": code,
+            "code": DefaultImports.prepend_imports(code),
             "test": test,
             "test_result": execution_result,
             "plan": plan,
vision_agent/tools/tools.py
CHANGED
@@ -75,17 +75,18 @@ def grounding_dino(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-
-
-
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
 
     Example
     -------
-
-
-
-
-
+        >>> grounding_dino("car. dinosaur", image)
+        [
+            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
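Because grounding_dino returns boxes normalized to [0, 1], callers typically scale them back to pixels before drawing or cropping. A quick sketch using the example output above (the 640x480 image size is illustrative):

det = {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}
height, width = 480, 640  # image.shape[:2]

xmin, ymin, xmax, ymax = det['bbox']
# Scale x by width and y by height to recover pixel coordinates.
pixel_box = (int(xmin * width), int(ymin * height),
             int(xmax * width), int(ymax * height))
print(pixel_box)  # (64, 52, 224, 192)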
@@ -129,27 +130,27 @@ def grounding_sam(
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
-
-
-
-
-
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
 
     Example
     -------
-
-
-
-
-
-
-
-
-
-
-
-
-
+        >>> grounding_sam("car. dinosaur", image)
+        [
+            {
+                'score': 0.99,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
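The binary masks documented above compose directly with NumPy arithmetic, for example to measure a detection's coverage or isolate its pixels. A small sketch on a toy 4x4 image (shapes are illustrative; the real mask matches the image's height and width):

import numpy as np

image = np.full((4, 4, 3), 255, dtype=np.uint8)  # toy all-white image
mask = np.array([[0, 0, 0, 0],
                 [0, 1, 1, 0],
                 [0, 1, 1, 0],
                 [0, 0, 0, 0]], dtype=np.uint8)

coverage = mask.mean()               # fraction of pixels covered: 0.25
isolated = image * mask[..., None]   # zeros out pixels outside the mask
print(coverage, isolated.shape)      # 0.25 (4, 4, 3)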
@@ -187,12 +188,12 @@ def extract_frames(
 
     Returns:
         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
-
+            as a numpy array and the timestamp in seconds.
 
     Example
     -------
-
-
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 0.5), ...]
     """
 
     return extract_frames_from_video(str(video_uri), fps)
@@ -212,10 +213,10 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-
-
-
-
+        >>> ocr(image)
+        [
+            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+        ]
     """
 
     pil_image = Image.fromarray(image).convert("RGB")
@@ -266,9 +267,8 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Example
     -------
-
-
-
+        >>> zero_shot_counting(image)
+        {'count': 45},
     """
 
     image_b64 = convert_to_b64(image)
@@ -297,9 +297,8 @@ def visual_prompt_counting(
 
     Example
     -------
-
-
-
+        >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
+        {'count': 45},
     """
 
     image_size = get_image_size(image)
@@ -332,9 +331,8 @@ def image_question_answering(image: np.ndarray, prompt: str) -> str:
 
     Example
     -------
-
-
-
+        >>> image_question_answering(image, 'What is the cat doing ?')
+        'drinking milk'
     """
 
     image_b64 = convert_to_b64(image)
@@ -363,9 +361,8 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
 
     Example
     -------
-
-
-
+        >>> clip(image, ['dog', 'cat', 'bird'])
+        {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
 
     image_b64 = convert_to_b64(image)
@@ -391,9 +388,8 @@ def image_caption(image: np.ndarray) -> str:
 
     Example
     -------
-
-
-
+        >>> image_caption(image)
+        'This image contains a cat sitting on a table with a bowl of milk.'
     """
 
     image_b64 = convert_to_b64(image)
@@ -418,8 +414,8 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
 
     Example
     -------
-
-
+        >>> closest_mask_distance(mask1, mask2)
+        0.5
     """
 
     mask1 = np.clip(mask1, 0, 1)
@@ -474,8 +470,8 @@ def closest_box_distance(
 
     Example
     -------
-
-
+        >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
+        141.42
     """
 
     x11, y11, x12, y12 = denormalize_bbox(box1, image_size)
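The 141.42 in the example is the Euclidean length of the gap between the nearest edges of the two boxes. A quick arithmetic check (this mirrors the documented behavior, not necessarily the function's internals):

import math

box1 = (100, 100, 200, 200)  # (xmin, ymin, xmax, ymax)
box2 = (300, 300, 400, 400)

# Horizontal and vertical gaps between nearest edges, clamped at 0 for overlap.
dx = max(box2[0] - box1[2], box1[0] - box2[2], 0)  # 300 - 200 = 100
dy = max(box2[1] - box1[3], box1[1] - box2[3], 0)  # 300 - 200 = 100
print(round(math.hypot(dx, dy), 2))  # 141.42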
@@ -499,7 +495,7 @@ def save_json(data: Any, file_path: str) -> None:
 
     Example
     -------
-
+        >>> save_json(data, "path/to/file.json")
     """
 
     class NumpyEncoder(json.JSONEncoder):
@@ -515,7 +511,7 @@ def save_json(data: Any, file_path: str) -> None:
 
 
 def load_image(image_path: str) -> np.ndarray:
-    """'load_image' is a utility function that loads an image from the given path.
+    """'load_image' is a utility function that loads an image from the given file path string.
 
     Parameters:
         image_path (str): The path to the image.
@@ -525,9 +521,11 @@ def load_image(image_path: str) -> np.ndarray:
 
     Example
     -------
-
+        >>> load_image("path/to/image.jpg")
     """
-
+    # NOTE: sometimes the generated code pass in a NumPy array
+    if isinstance(image_path, np.ndarray):
+        return image_path
     image = Image.open(image_path).convert("RGB")
     return np.array(image)
 
@@ -543,8 +541,8 @@ def save_image(image: np.ndarray) -> str:
 
     Example
     -------
-
-
+        >>> save_image(image)
+        "/tmp/tmpabc123.png"
     """
     from IPython.display import display
 
@@ -570,8 +568,8 @@ def save_video(
 
     Example
     -------
-
-
+        >>> save_video(frames)
+        "/tmp/tmpvideo123.mp4"
     """
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
@@ -617,9 +615,9 @@ def overlay_bounding_boxes(
 
     Example
     -------
-
-
-
+        >>> image_with_bboxes = overlay_bounding_boxes(
+            image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
@@ -673,18 +671,18 @@ def overlay_segmentation_masks(
 
     Example
     -------
-
-
-
-
-
-
-
-
-
-
-
-
+        >>> image_with_masks = overlay_segmentation_masks(
+            image,
+            [{
+                'score': 0.99,
+                'label': 'dinosaur',
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            }],
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
 
@@ -725,16 +723,16 @@ def overlay_heat_map(
 
     Example
     -------
-
-
-
-
-
-
-
-
-
-
+        >>> image_with_heat_map = overlay_heat_map(
+            image,
+            {
+                'heat_map': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 125, 125, 125]], dtype=uint8),
+            },
+        )
     """
     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
vision_agent/utils/video.py
CHANGED
@@ -63,9 +63,9 @@ def extract_frames_from_video(
 
     Returns:
         a list of tuples containing the extracted frame and the timestamp in seconds.
-
-
-
+            E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
+            from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
+            the video. The frames are sorted by the timestamp in ascending order.
     """
     with VideoFileClip(video_uri) as video:
         video_duration: float = video.duration
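The timestamp semantics spelled out above follow from the sampling rate: under uniform extraction at fps f, frame i lands at i / f seconds. A small sketch, assuming uniform sampling as described (the rate and duration are illustrative):

fps = 2.0             # illustrative extraction rate
video_duration = 2.6  # seconds

# Frame i is taken at i / fps seconds from the start, in ascending order.
timestamps = [i / fps for i in range(int(video_duration * fps) + 1)]
print(timestamps)  # [0.0, 0.5, 1.0, 1.5, 2.0, 2.5]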
{vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/RECORD
CHANGED
@@ -7,11 +7,11 @@ vision_agent/agent/data_interpreter.py,sha256=YlCm3DVyhCM9T6wpccWxC5XHoIj9smsEsk
 vision_agent/agent/data_interpreter_prompts.py,sha256=RDJggOfXwGaEoIcTYGX41ZEayCgYei1AootDOc_SN2g,6134
 vision_agent/agent/easytool.py,sha256=wMa9-tpAaiC4E2ONbidxmMM9YvAOw4_Sypf5mGKco_w,11526
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
-vision_agent/agent/easytool_v2.py,sha256=
+vision_agent/agent/easytool_v2.py,sha256=LY2cqzjVHBr7QMn4WsrZ7AfpWrDN0LjJIrd5tMo2-PI,27323
 vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/reflexion.py,sha256=
+vision_agent/agent/reflexion.py,sha256=scck3YcME6DhX5Vs4Wr1rYb8S4wkBUkN9UksyazfrZg,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=wGGISg6pDVNseF2fIAN1jH66OX2qZk2nDhuobeSNGHk,20957
 vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=Sng6dChynJJCYWjraXXM0tep_VPdnYl3L9vb0HMy_P
 vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=L1_umAVxk_BlrDYEmV2eyu2cJnpieTW-Ipb03VwKqWU,27062
 vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
 vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
-vision_agent/utils/video.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
+vision_agent-0.2.51.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.51.dist-info/METADATA,sha256=xUYxi6YH3U4QTlYNWZ51YI365ER6NANcYBiVeXN4egQ,6817
+vision_agent-0.2.51.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.51.dist-info/RECORD,,
{vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/LICENSE
File without changes
{vision_agent-0.2.49.dist-info → vision_agent-0.2.51.dist-info}/WHEEL
File without changes