vision-agent 0.2.111__tar.gz → 0.2.112__tar.gz
- {vision_agent-0.2.111 → vision_agent-0.2.112}/PKG-INFO +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/pyproject.toml +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py +3 -8
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py +28 -20
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py +11 -10
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/__init__.py +8 -4
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py +0 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py +8 -3
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools.py +350 -110
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/execute.py +0 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/LICENSE +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/README.md +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/landing_public_api.py +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py
RENAMED
@@ -4,14 +4,13 @@ import sys
 from typing import Any, Dict

 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)


 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-
-
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore

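A quick sanity check of the reworked extract_json helper, written as a minimal sketch (the import path follows the module layout shown above; the inputs are made-up strings):

    from vision_agent.agent.agent_utils import extract_json

    # Plain JSON and fenced ```json blocks should both parse; newlines are now
    # collapsed to spaces before the first json.loads attempt.
    print(extract_json('{"plan": "count dogs"}'))
    print(extract_json("```json\n{\"plan\": \"count dogs\"}\n```"))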
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py
RENAMED
@@ -4,6 +4,7 @@ import logging
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}
-        plan_str += "-" + "\n-".join([e
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])

     return plan_str

@@ -228,13 +229,11 @@ def pick_plan(
             "status": "completed" if tool_output.success else "failed",
         }
     )
-    tool_output_str =
-    if len(tool_output.logs.stdout) > 0:
-        tool_output_str = tool_output.logs.stdout[0]
+    tool_output_str = tool_output.text().strip()

     if verbosity == 2:
         _print_code("Code and test after attempted fix:", code)
-        _LOGGER.info(f"Code execution result after
+        _LOGGER.info(f"Code execution result after attempt {count}")

     count += 1

@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}

     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


 def retrieve_tools(
-    plans: Dict[str,
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
             )

         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
             )

         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
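The retry-and-fallback pattern added to pick_plan can be sketched in isolation like this (illustrative only; model, chat, plans and max_retries stand in for the real arguments, and json.loads stands in for extract_json):

    import json

    def pick_with_retries(model, chat, plans, max_retries=3):
        # Ask the model for the best plan, retrying on malformed JSON and
        # falling back to the first plan if every attempt fails.
        best_plan, count = None, 0
        while best_plan is None and count < max_retries:
            try:
                best_plan = json.loads(model(chat))
            except json.JSONDecodeError:
                pass  # malformed output, try again
            count += 1
        if best_plan is None:
            best_plan = {"best_plan": list(plans.keys())[0]}
        return best_plan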
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -30,18 +30,19 @@ PLAN = """

 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or
+2. Output three different plans each utilize a different strategy or set of tools.

 Output a list of jsons in the following format

 ```json
 {{
     "plan1":
-
-
-
-
-
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@ PICK_PLAN = """

 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2.
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py
RENAMED
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """

 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask

 USER: The the image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
 15|    return count
 [End of file]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/__init__.py
RENAMED
@@ -19,16 +19,20 @@ from .tools import (
     detr_segmentation,
     dpt_hybrid_midas,
     extract_frames,
-
-
-
-
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py
RENAMED
@@ -8,7 +8,6 @@ from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS

-
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

 CURRENT_FILE = None
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py
RENAMED
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

 import pandas as pd
 from IPython.display import display
@@ -31,6 +31,7 @@ class ToolCallTrace(BaseModel):
 def send_inference_request(
     payload: Dict[str, Any],
     endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
@@ -50,7 +51,7 @@ def send_inference_request(
         response={},
         error=None,
     )
-    headers = {"
+    headers = {"apikey": _LND_API_KEY}
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -60,7 +61,11 @@ def send_inference_request(
         num_retry=3,
         headers=headers,
    )
-
+
+    if files is not None:
+        res = session.post(url, data=payload, files=files)
+    else:
+        res = session.post(url, json=payload)
     if res.status_code != 200:
         tool_call_trace.error = Error(
             name="RemoteToolCallFailed",
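The new files parameter switches the request body between JSON and multipart encoding. A stripped-down sketch of that branch using requests directly (the URL, payload and headers here are placeholders, not the real endpoint):

    import requests

    def post_inference(url: str, payload: dict, files=None, headers=None):
        # Multipart upload when raw image/video bytes are attached,
        # plain JSON body otherwise.
        if files is not None:
            return requests.post(url, data=payload, files=files, headers=headers)
        return requests.post(url, json=payload, headers=headers)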
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools.py
RENAMED
@@ -2,47 +2,50 @@ import io
 import json
 import logging
 import tempfile
-from uuid import UUID
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID

 import cv2
-import requests
 import numpy as np
-
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore

+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    send_inference_request,
+)
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    FineTuning,
+    Florencev2FtRequest,
+    JobStatus,
+    PromptTask,
 )
-from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    frames_to_bytes,
     get_image_size,
     normalize_bbox,
-
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )
-from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
-    PromptTask,
-    Florencev2FtRequest,
-    FineTuning,
-    JobStatus,
-)
-from vision_agent.clients.landing_public_api import LandingPublicAPI

 register_heif_opener()

@@ -141,9 +144,9 @@ def owl_v2(
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text
-    are separated by commas. It returns a list of bounding boxes with
-
+    prompt such as category names or referring expressions. The categories in text
+    prompt are separated by commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -194,10 +197,10 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a
-
-
-
+    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+    such as category names or referring expressions. The categories in text prompt are
+    separated by commas or periods. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -254,52 +257,114 @@ def grounding_sam(
     return return_data


-def
-
-
-
-
-    time in seconds where the frame was captured. The frame is a numpy array.
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores of 1.0.

     Parameters:
-
-
-        to 0.5.
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.

     Returns:
-        List[
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.

     Example
     -------
-    >>>
-    [
+    >>> florence2_sam2_image("car, dinosaur", image)
+    [
+        {
+            'score': 1.0,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
     """
+    buffer_bytes = numpy_to_bytes(image)

-
-
-
-
-
-
-
-    )
-
-
-
-
-
-
-
-                .first()
-            )
-            if not video:
-                raise Exception("No suitable video stream found")
-            video_file_path = video.download(output_path=temp_dir)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for _, data_i in data["0"].items():
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data

-            return extract_frames_from_video(video_file_path, fps)

-
+def florence2_sam2_video(
+    prompt: str, frames: List[np.ndarray]
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_sam2_video' is a tool that can segment and track multiple entities
+    in a video given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It only tracks
+    entities present in the first frame and only returns segmentation masks. It is
+    useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
+        and segment mask. The outer list represents each frame and the inner list is
+        the entities per frame. The label contains the object ID followed by the label
+        name. The objects are only identified in the first framed and tracked
+        throughout the video.
+
+    Example
+    -------
+    >>> florence2_sam2_video("car, dinosaur", frames)
+    [
+        [
+            {
+                'label': '0: dinosaur',
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ],
+    ]
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": prompt.split(","),
+        "function_name": "florence2_sam2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for frame_i in data.keys():
+        return_frame_data = []
+        for obj_id, data_j in data[frame_i].items():
+            mask = rle_decode_array(data_j["mask"])
+            label = obj_id + ": " + data_j["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return return_data


 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
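A minimal usage sketch for the two new segmentation tools, based only on the docstrings above (the file name is a placeholder and OpenCV loading is an assumption):

    import cv2
    from vision_agent.tools import florence2_sam2_image, florence2_sam2_video

    image = cv2.cvtColor(cv2.imread("cars.jpg"), cv2.COLOR_BGR2RGB)
    detections = florence2_sam2_image("car, truck", image)
    for det in detections:
        print(det["label"], det["bbox"], det["mask"].shape)

    # Track the same categories across a handful of frames.
    frames = [image] * 5  # stand-in for frames extracted from a video
    per_frame = florence2_sam2_video("car, truck", frames)
    print(len(per_frame), "frames,", len(per_frame[0]), "entities in frame 0")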
@@ -368,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value
+            value, e.g. {count: 12} and a heat map for visaulization purposes.

     Example
     -------
     >>> loca_zero_shot_counting(image)
-    {'count':
+    {'count': 83,
+    'heat_map': array([[ 0,  0,  0, ...,  0,  0,  0],
+        [ 0,  0,  0, ...,  0,  0,  0],
+        [ 0,  0,  0, ...,  0,  0,  1],
+        ...,
+        [ 0,  0,  0, ..., 30, 35, 41],
+        [ 0,  0,  0, ..., 41, 47, 53],
+        [ 0,  0,  0, ..., 53, 59, 64]], dtype=uint8)}
     """

     image_b64 = convert_to_b64(image)
@@ -398,12 +470,19 @@ def loca_visual_prompt_counting(

     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-            value
+            value, e.g. {count: 12} and a heat map for visaulization purposes.

     Example
     -------
     >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-    {'count':
+    {'count': 83,
+    'heat_map': array([[ 0,  0,  0, ...,  0,  0,  0],
+        [ 0,  0,  0, ...,  0,  0,  0],
+        [ 0,  0,  0, ...,  0,  0,  1],
+        ...,
+        [ 0,  0,  0, ..., 30, 35, 41],
+        [ 0,  0,  0, ..., 41, 47, 53],
+        [ 0,  0,  0, ..., 53, 59, 64]], dtype=uint8)}
     """

     image_size = get_image_size(image)
@@ -420,8 +499,8 @@ def loca_visual_prompt_counting(
     return resp_data


-def
-    """'
+def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florence2_roberta_vqa' is a tool that takes an image and analyzes
     its contents, generates detailed captions and then tries to answer the given
     question using the generated context. It returns text as an answer to the question.

@@ -434,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:

     Example
     -------
-    >>>
+    >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
     'white tiger'
     """

@@ -442,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "question": prompt,
-        "function_name": "
+        "function_name": "florence2_roberta_vqa",
     }

     answer = send_inference_request(data, "florence2-qa", v2=True)
     return answer  # type: ignore


+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> ixc25_image_vqa('What is the cat doing?', image)
+    'drinking milk'
+    """
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_image_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
+def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> ixc25_video_vqa('Which football player made the goal?', frames)
+    'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
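A short, illustrative call pattern for the new ixc25 VQA tools (image loading via OpenCV and the file name are assumptions, not part of the diff):

    import cv2
    from vision_agent.tools import ixc25_image_vqa, ixc25_video_vqa

    image = cv2.cvtColor(cv2.imread("slide.png"), cv2.COLOR_BGR2RGB)
    print(ixc25_image_vqa("What is the title of this slide?", image))

    frames = [image, image]  # stand-in for frames pulled from a video
    print(ixc25_video_vqa("What changes between the frames?", frames))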
@@ -592,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore


-def
-    """'
+def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florence2_image_caption' is a tool that can caption or describe an image based
     on its contents. It returns a text describing the image.

     Parameters:
@@ -606,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->

     Example
     -------
-    >>>
+    >>> florence2_image_caption(image, False)
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
@@ -614,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     data = {
         "image": image_b64,
         "task": task,
-        "function_name": "
+        "function_name": "florence2_image_caption",
     }

     answer = send_inference_request(data, "florence2", v2=True)
     return answer[task]  # type: ignore


-def
-    """'florencev2_object_detection' is a tool that can detect
-    prompt such as
-
+def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect and count multiple
+    objects given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    probability scores of 1.0.

     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -639,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str

     Example
     -------
-    >>>
+    >>> florence2_object_detection('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -651,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "
+        "function_name": "florence2_object_detection",
     }

     detections = send_inference_request(data, "florence2", v2=True)
@@ -668,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
     return return_data


-def
-    """'
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
     the text region as a bounding box with normalized coordinates, and confidence
     scores. The results are sorted from top-left to bottom right.
@@ -683,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Example
     -------
-    >>>
+    >>> florence2_ocr(image)
     [
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
     ]
@@ -694,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "task": "<OCR_WITH_REGION>",
-        "function_name": "
+        "function_name": "florence2_ocr",
     }

     detections = send_inference_request(data, "florence2", v2=True)
|
|
1035
1176
|
# Utility and visualization functions
|
1036
1177
|
|
1037
1178
|
|
1179
|
+
def extract_frames(
|
1180
|
+
video_uri: Union[str, Path], fps: float = 1
|
1181
|
+
) -> List[Tuple[np.ndarray, float]]:
|
1182
|
+
"""'extract_frames' extracts frames from a video which can be a file path or youtube
|
1183
|
+
link, returns a list of tuples (frame, timestamp), where timestamp is the relative
|
1184
|
+
time in seconds where the frame was captured. The frame is a numpy array.
|
1185
|
+
|
1186
|
+
Parameters:
|
1187
|
+
video_uri (Union[str, Path]): The path to the video file or youtube link
|
1188
|
+
fps (float, optional): The frame rate per second to extract the frames. Defaults
|
1189
|
+
to 10.
|
1190
|
+
|
1191
|
+
Returns:
|
1192
|
+
List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
|
1193
|
+
as a numpy array and the timestamp in seconds.
|
1194
|
+
|
1195
|
+
Example
|
1196
|
+
-------
|
1197
|
+
>>> extract_frames("path/to/video.mp4")
|
1198
|
+
[(frame1, 0.0), (frame2, 0.5), ...]
|
1199
|
+
"""
|
1200
|
+
|
1201
|
+
if str(video_uri).startswith(
|
1202
|
+
(
|
1203
|
+
"http://www.youtube.com/",
|
1204
|
+
"https://www.youtube.com/",
|
1205
|
+
"http://youtu.be/",
|
1206
|
+
"https://youtu.be/",
|
1207
|
+
)
|
1208
|
+
):
|
1209
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
1210
|
+
yt = YouTube(str(video_uri))
|
1211
|
+
# Download the highest resolution video
|
1212
|
+
video = (
|
1213
|
+
yt.streams.filter(progressive=True, file_extension="mp4")
|
1214
|
+
.order_by("resolution")
|
1215
|
+
.desc()
|
1216
|
+
.first()
|
1217
|
+
)
|
1218
|
+
if not video:
|
1219
|
+
raise Exception("No suitable video stream found")
|
1220
|
+
video_file_path = video.download(output_path=temp_dir)
|
1221
|
+
|
1222
|
+
return extract_frames_from_video(video_file_path, fps)
|
1223
|
+
|
1224
|
+
return extract_frames_from_video(str(video_uri), fps)
|
1225
|
+
|
1226
|
+
|
1038
1227
|
def save_json(data: Any, file_path: str) -> None:
|
1039
1228
|
"""'save_json' is a utility function that saves data as a JSON file. It is helpful
|
1040
1229
|
for saving data that contains NumPy arrays which are not JSON serializable.
|
@@ -1099,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
|
|
1099
1288
|
|
1100
1289
|
|
1101
1290
|
def save_video(
|
1102
|
-
frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float =
|
1291
|
+
frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
|
1103
1292
|
) -> str:
|
1104
1293
|
"""'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
|
1105
1294
|
|
@@ -1201,15 +1390,43 @@ def overlay_bounding_boxes(
|
|
1201
1390
|
return np.array(pil_image)
|
1202
1391
|
|
1203
1392
|
|
1393
|
+
def _get_text_coords_from_mask(
|
1394
|
+
mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
|
1395
|
+
) -> Tuple[int, int]:
|
1396
|
+
mask = mask.astype(np.uint8)
|
1397
|
+
if np.sum(mask) == 0:
|
1398
|
+
return (0, 0)
|
1399
|
+
|
1400
|
+
rows, cols = np.nonzero(mask)
|
1401
|
+
top = rows.min()
|
1402
|
+
bottom = rows.max()
|
1403
|
+
left = cols.min()
|
1404
|
+
right = cols.max()
|
1405
|
+
|
1406
|
+
if top - v_gap < 0:
|
1407
|
+
if bottom + v_gap > mask.shape[0]:
|
1408
|
+
top = top
|
1409
|
+
else:
|
1410
|
+
top = bottom + v_gap
|
1411
|
+
else:
|
1412
|
+
top = top - v_gap
|
1413
|
+
|
1414
|
+
return left + (right - left) // 2 - h_gap, top
|
1415
|
+
|
1416
|
+
|
1204
1417
|
def overlay_segmentation_masks(
|
1205
|
-
|
1206
|
-
|
1418
|
+
medias: Union[np.ndarray, List[np.ndarray]],
|
1419
|
+
masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
|
1420
|
+
draw_label: bool = True,
|
1421
|
+
) -> Union[np.ndarray, List[np.ndarray]]:
|
1207
1422
|
"""'overlay_segmentation_masks' is a utility function that displays segmentation
|
1208
1423
|
masks.
|
1209
1424
|
|
1210
1425
|
Parameters:
|
1211
|
-
|
1212
|
-
|
1426
|
+
medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
|
1427
|
+
the masks on.
|
1428
|
+
masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
|
1429
|
+
dictionaries containing the masks.
|
1213
1430
|
|
1214
1431
|
Returns:
|
1215
1432
|
np.ndarray: The image with the masks displayed.
|
@@ -1229,27 +1446,50 @@ def overlay_segmentation_masks(
|
|
1229
1446
|
}],
|
1230
1447
|
)
|
1231
1448
|
"""
|
1232
|
-
|
1449
|
+
medias_int: List[np.ndarray] = (
|
1450
|
+
[medias] if isinstance(medias, np.ndarray) else medias
|
1451
|
+
)
|
1452
|
+
masks_int = [masks] if isinstance(masks[0], dict) else masks
|
1453
|
+
masks_int = cast(List[List[Dict[str, Any]]], masks_int)
|
1233
1454
|
|
1234
|
-
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1455
|
+
labels = set()
|
1456
|
+
for mask_i in masks_int:
|
1457
|
+
for mask_j in mask_i:
|
1458
|
+
labels.add(mask_j["label"])
|
1459
|
+
color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
|
1238
1460
|
|
1239
|
-
|
1240
|
-
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1461
|
+
width, height = Image.fromarray(medias_int[0]).size
|
1462
|
+
fontsize = max(12, int(min(width, height) / 40))
|
1463
|
+
font = ImageFont.truetype(
|
1464
|
+
str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
|
1465
|
+
fontsize,
|
1466
|
+
)
|
1244
1467
|
|
1245
|
-
|
1246
|
-
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1468
|
+
frame_out = []
|
1469
|
+
for i, frame in enumerate(medias_int):
|
1470
|
+
pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
|
1471
|
+
for elt in masks_int[i]:
|
1472
|
+
mask = elt["mask"]
|
1473
|
+
label = elt["label"]
|
1474
|
+
np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
|
1475
|
+
np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
|
1476
|
+
mask_img = Image.fromarray(np_mask.astype(np.uint8))
|
1477
|
+
pil_image = Image.alpha_composite(pil_image, mask_img)
|
1478
|
+
|
1479
|
+
if draw_label:
|
1480
|
+
draw = ImageDraw.Draw(pil_image)
|
1481
|
+
text_box = draw.textbbox((0, 0), text=label, font=font)
|
1482
|
+
x, y = _get_text_coords_from_mask(
|
1483
|
+
mask,
|
1484
|
+
v_gap=(text_box[3] - text_box[1]) + 10,
|
1485
|
+
h_gap=(text_box[2] - text_box[0]) // 2,
|
1486
|
+
)
|
1487
|
+
if x != 0 and y != 0:
|
1488
|
+
text_box = draw.textbbox((x, y), text=label, font=font)
|
1489
|
+
draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
|
1490
|
+
draw.text((x, y), label, fill="black", font=font)
|
1491
|
+
frame_out.append(np.array(pil_image))
|
1492
|
+
return frame_out[0] if len(frame_out) == 1 else frame_out
|
1253
1493
|
|
1254
1494
|
|
1255
1495
|
def overlay_heat_map(
|
@@ -1412,7 +1652,6 @@ def florencev2_fine_tuned_object_detection(
|
|
1412
1652
|
|
1413
1653
|
TOOLS = [
|
1414
1654
|
owl_v2,
|
1415
|
-
grounding_sam,
|
1416
1655
|
extract_frames,
|
1417
1656
|
ocr,
|
1418
1657
|
clip,
|
@@ -1420,13 +1659,15 @@ TOOLS = [
|
|
1420
1659
|
vit_nsfw_classification,
|
1421
1660
|
loca_zero_shot_counting,
|
1422
1661
|
loca_visual_prompt_counting,
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1662
|
+
florence2_image_caption,
|
1663
|
+
florence2_ocr,
|
1664
|
+
florence2_sam2_image,
|
1665
|
+
florence2_sam2_video,
|
1666
|
+
florence2_object_detection,
|
1667
|
+
ixc25_image_vqa,
|
1668
|
+
ixc25_video_vqa,
|
1426
1669
|
detr_segmentation,
|
1427
1670
|
depth_anything_v2,
|
1428
|
-
generate_soft_edge_image,
|
1429
|
-
dpt_hybrid_midas,
|
1430
1671
|
generate_pose_image,
|
1431
1672
|
closest_mask_distance,
|
1432
1673
|
closest_box_distance,
|
@@ -1437,7 +1678,6 @@ TOOLS = [
|
|
1437
1678
|
overlay_bounding_boxes,
|
1438
1679
|
overlay_segmentation_masks,
|
1439
1680
|
overlay_heat_map,
|
1440
|
-
template_match,
|
1441
1681
|
]
|
1442
1682
|
TOOLS_DF = get_tools_df(TOOLS) # type: ignore
|
1443
1683
|
TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
|
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/execute.py
RENAMED
@@ -416,7 +416,6 @@ class CodeInterpreter(abc.ABC):


 class E2BCodeInterpreter(CodeInterpreter):
-
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
         assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py
RENAMED
@@ -1,12 +1,15 @@
 """Utility functions for image processing."""

 import base64
+import io
+import tempfile
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Tuple, Union

 import numpy as np
+from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType

@@ -63,6 +66,46 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
     return img.reshape(shape)


+def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
+    r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
+
+    Parameters:
+        mask: The mask in run-length encoded as an array.
+    """
+    size = rle["size"]
+    counts = rle["counts"]
+
+    total_elements = size[0] * size[1]
+    flattened_mask = np.zeros(total_elements, dtype=np.uint8)
+
+    current_pos = 0
+    for i, count in enumerate(counts):
+        if i % 2 == 1:
+            flattened_mask[current_pos : current_pos + count] = 1
+        current_pos += count
+
+    binary_mask = flattened_mask.reshape(size, order="F")
+    return binary_mask
+
+
+def frames_to_bytes(
+    frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
+) -> bytes:
+    r"""Convert a list of frames to a video file encoded into a byte string.
+
+    Parameters:
+        frames: the list of frames
+        fps: the frames per second of the video
+        file_ext: the file extension of the video file
+    """
+    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
+        clip = ImageSequenceClip(frames, fps=fps)
+        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps)
+        with open(temp_file.name + f".{file_ext}", "rb") as f:
+            buffer_bytes = f.read()
+    return buffer_bytes
+
+
 def b64_to_pil(b64_str: str) -> ImageType:
     r"""Convert a base64 string to a PIL Image.

@@ -78,6 +121,15 @@ def b64_to_pil(b64_str: str) -> ImageType:
     return Image.open(BytesIO(base64.b64decode(b64_str)))


+def numpy_to_bytes(image: np.ndarray) -> bytes:
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+    return buffer_bytes
+
+
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
     r"""Get the size of an image.

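A small round-trip check of the new image_utils helpers, written as a sketch (the RLE dict mirrors the {"size", "counts"} format used above; the arrays are made up):

    import numpy as np
    from vision_agent.utils.image_utils import rle_decode_array, numpy_to_bytes, frames_to_bytes

    # Column-major RLE: 2 background pixels, 3 foreground, 1 background on a 2x3 grid.
    mask = rle_decode_array({"size": [2, 3], "counts": [2, 3, 1]})
    print(mask)  # 2x3 array of 0/1 values

    # Encode an RGB image as PNG bytes, and a short clip as mp4 bytes.
    image = np.zeros((64, 64, 3), dtype=np.uint8)
    png_bytes = numpy_to_bytes(image)
    mp4_bytes = frames_to_bytes([image] * 10, fps=10)
    print(len(png_bytes), len(mp4_bytes))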