vision-agent 0.2.111__tar.gz → 0.2.112__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.111 → vision_agent-0.2.112}/PKG-INFO +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/pyproject.toml +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py +3 -8
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py +28 -20
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py +9 -7
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py +11 -10
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/__init__.py +8 -4
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py +0 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py +8 -3
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools.py +350 -110
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/execute.py +0 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py +52 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/LICENSE +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/README.md +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/clients/landing_public_api.py +1 -1
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/agent_utils.py
RENAMED
@@ -4,14 +4,13 @@ import sys
 from typing import Any, Dict
 
 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-
-
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
 
 
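For context, a minimal sketch of what the reworked extract_json does with a fenced model reply (assuming it is imported from vision_agent.agent.agent_utils as shown above):

```python
# Sketch only: exercises the fenced-JSON fallback path changed in this release.
from vision_agent.agent.agent_utils import extract_json

raw = """```json
{
    "best_plan": "plan1"
}
```"""

# Newlines are now flattened before parsing; if plain json.loads fails, the
# ```json fence is stripped and parsing is retried. A still-invalid string now
# propagates json.JSONDecodeError instead of raising ValueError.
print(extract_json(raw))  # {'best_plan': 'plan1'}
```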
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder.py
RENAMED
@@ -4,6 +4,7 @@ import logging
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}
-        plan_str += "-" + "\n-".join([e
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])
 
     return plan_str
 
@@ -228,13 +229,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str =
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()
 
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after
+            _LOGGER.info(f"Code execution result after attempt {count}")
 
         count += 1
 
@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
 
 
 def retrieve_tools(
-    plans: Dict[str,
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ class VisionAgentCoder(Agent):
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ class VisionAgentCoder(Agent):
         )
 
         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
             )
 
         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
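The pick_plan change above wraps the model call in a bounded retry so a malformed JSON reply no longer aborts planning. A rough standalone sketch of the same pattern (call_model, plans, and max_retries are stand-ins, not the real objects):

```python
# Standalone sketch of the retry-with-fallback pattern added to pick_plan.
# `call_model` and `plans` are hypothetical stand-ins for the LMM call and the
# plans dict; only the control flow mirrors the diff above.
import json
from json import JSONDecodeError


def pick_best_plan(call_model, plans: dict, max_retries: int = 3) -> dict:
    best_plan = None
    count = 0
    while best_plan is None and count < max_retries:
        try:
            best_plan = json.loads(call_model())  # extract_json in the real code
        except JSONDecodeError:
            pass  # retry on malformed JSON instead of failing the whole run
        count += 1
    if best_plan is None:
        # fall back to the first plan rather than raising
        best_plan = {"best_plan": list(plans.keys())[0]}
    return best_plan
```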
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
@@ -30,18 +30,19 @@ PLAN = """
|
|
30
30
|
|
31
31
|
**Instructions**:
|
32
32
|
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
33
|
-
2. Output three different plans each utilize a different strategy or
|
33
|
+
2. Output three different plans each utilize a different strategy or set of tools.
|
34
34
|
|
35
35
|
Output a list of jsons in the following format
|
36
36
|
|
37
37
|
```json
|
38
38
|
{{
|
39
39
|
"plan1":
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
{{
|
41
|
+
"thoughts": str # your thought process for choosing this plan
|
42
|
+
"instructions": [
|
43
|
+
str # what you should do in this task associated with a tool
|
44
|
+
]
|
45
|
+
}},
|
45
46
|
"plan2": ...,
|
46
47
|
"plan3": ...
|
47
48
|
}}
|
@@ -127,7 +128,8 @@ PICK_PLAN = """
|
|
127
128
|
|
128
129
|
**Instructions**:
|
129
130
|
1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
|
130
|
-
2.
|
131
|
+
2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
|
132
|
+
3. Output a JSON object with the following format:
|
131
133
|
{{
|
132
134
|
"thoughts": str # your thought process for choosing the best plan
|
133
135
|
"best_plan": str # the best plan you have chosen
|
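The PLAN prompt now asks for a "thoughts" field plus an "instructions" list per plan, which is the shape format_plans and retrieve_tools consume in the coder changes earlier in this diff. A minimal illustration (the plan contents are invented):

```python
# Illustrative only: the plan text is made up, but the structure matches the
# new PLAN prompt and the updated format_plans shown earlier in this diff.
plans = {
    "plan1": {
        "thoughts": "Detect the dogs with an open-vocabulary detector.",
        "instructions": [
            "Load the image with load_image",
            "Run owl_v2 with the prompt 'dog'",
        ],
    }
}

plan_str = ""
for k, v in plans.items():
    plan_str += "\n" + f"{k}: {v['thoughts']}\n"
    plan_str += " -" + "\n -".join([e for e in v["instructions"]])
print(plan_str)
```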
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/agent/vision_agent_prompts.py
RENAMED
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
 15|    return count
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/__init__.py
RENAMED
@@ -19,16 +19,20 @@ from .tools import (
     detr_segmentation,
     dpt_hybrid_midas,
     extract_frames,
-
-
-
-
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/meta_tools.py
RENAMED
@@ -8,7 +8,6 @@ from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
 
-
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
 CURRENT_FILE = None
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tool_utils.py
RENAMED
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
 from IPython.display import display
@@ -31,6 +31,7 @@ class ToolCallTrace(BaseModel):
 def send_inference_request(
     payload: Dict[str, Any],
     endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
@@ -50,7 +51,7 @@ def send_inference_request(
         response={},
         error=None,
     )
-    headers = {"
+    headers = {"apikey": _LND_API_KEY}
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -60,7 +61,11 @@
         num_retry=3,
         headers=headers,
     )
-
+
+    if files is not None:
+        res = session.post(url, data=payload, files=files)
+    else:
+        res = session.post(url, json=payload)
     if res.status_code != 200:
         tool_call_trace.error = Error(
             name="RemoteToolCallFailed",
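The new files parameter switches send_inference_request from a JSON body to a multipart upload. A rough sketch of that dispatch with plain requests (the URL, headers, and payload below are placeholders, not the real endpoint):

```python
# Sketch of the multipart-vs-JSON dispatch added to send_inference_request.
# The endpoint URL, headers, and payload are placeholders.
from typing import Any, Dict, List, Optional, Tuple

import requests


def post_inference(
    url: str,
    payload: Dict[str, Any],
    files: Optional[List[Tuple[Any, ...]]] = None,
    headers: Optional[Dict[str, str]] = None,
) -> requests.Response:
    if files is not None:
        # form fields plus file parts, e.g. files=[("image", png_bytes)]
        return requests.post(url, data=payload, files=files, headers=headers)
    return requests.post(url, json=payload, headers=headers)
```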
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/tools/tools.py
RENAMED
@@ -2,47 +2,50 @@ import io
 import json
 import logging
 import tempfile
-from uuid import UUID
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID
 
 import cv2
-import requests
 import numpy as np
-
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
+    send_inference_request,
+)
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    FineTuning,
+    Florencev2FtRequest,
+    JobStatus,
+    PromptTask,
 )
-from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    frames_to_bytes,
     get_image_size,
     normalize_bbox,
-
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )
-from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
-    PromptTask,
-    Florencev2FtRequest,
-    FineTuning,
-    JobStatus,
-)
-from vision_agent.clients.landing_public_api import LandingPublicAPI
 
 register_heif_opener()
 
@@ -141,9 +144,9 @@ def owl_v2(
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text
-    are separated by commas. It returns a list of bounding boxes with
-
+    prompt such as category names or referring expressions. The categories in text
+    prompt are separated by commas. It returns a list of bounding boxes with normalized
+    coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -194,10 +197,10 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can segment multiple objects given a
-
-
-
+    """'grounding_sam' is a tool that can segment multiple objects given a text prompt
+    such as category names or referring expressions. The categories in text prompt are
+    separated by commas or periods. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -254,52 +257,114 @@ def grounding_sam(
     return return_data
 
 
-def
-
-
-
-
-    time in seconds where the frame was captured. The frame is a numpy array.
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a text
+    prompt such as category names or referring expressions. The categories in the text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    mask file names and associated probability scores of 1.0.
 
     Parameters:
-
-
-        to 0.5.
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
 
     Returns:
-        List[
-
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+        bounding box, and mask of the detected objects with normalized coordinates
+        (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+        and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+        The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+        the background.
 
     Example
     -------
-        >>>
-        [
+        >>> florence2_sam2_image("car, dinosaur", image)
+        [
+            {
+                'score': 1.0,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
     """
+    buffer_bytes = numpy_to_bytes(image)
 
-
-
-
-
-
-
-
-    )
-
-
-
-
-
-
-
-            .first()
-        )
-        if not video:
-            raise Exception("No suitable video stream found")
-        video_file_path = video.download(output_path=temp_dir)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for _, data_i in data["0"].items():
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data
 
-    return extract_frames_from_video(video_file_path, fps)
 
-
+def florence2_sam2_video(
+    prompt: str, frames: List[np.ndarray]
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_sam2_video' is a tool that can segment and track multiple entities
+    in a video given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It only tracks
+    entities present in the first frame and only returns segmentation masks. It is
+    useful for tracking and counting without duplicating counts.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
+        and segment mask. The outer list represents each frame and the inner list is
+        the entities per frame. The label contains the object ID followed by the label
+        name. The objects are only identified in the first framed and tracked
+        throughout the video.
+
+    Example
+    -------
+        >>> florence2_sam2_video("car, dinosaur", frames)
+        [
+            [
+                {
+                    'label': '0: dinosaur',
+                    'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0],
+                        ...,
+                        [0, 0, 0, ..., 0, 0, 0],
+                        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+                },
+            ],
+        ]
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": prompt.split(","),
+        "function_name": "florence2_sam2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for frame_i in data.keys():
+        return_frame_data = []
+        for obj_id, data_j in data[frame_i].items():
+            mask = rle_decode_array(data_j["mask"])
+            label = obj_id + ": " + data_j["label"]
+            return_frame_data.append({"label": label, "mask": mask, "score": 1.0})
+        return_data.append(return_frame_data)
+    return return_data
 
 
 def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
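A hedged usage sketch for the two new segmentation tools above, combined with the package's existing helpers (the file paths are placeholders, and both calls go to the hosted endpoints, so the usual LandingAI API key is assumed to be configured):

```python
# Usage sketch only: paths are placeholders and the remote endpoints require
# an API key to be set up as usual for vision_agent tools.
from vision_agent.tools import (
    extract_frames,
    florence2_sam2_image,
    florence2_sam2_video,
    load_image,
    overlay_segmentation_masks,
    save_image,
)

image = load_image("dogs.jpg")
masks = florence2_sam2_image("dog", image)  # list of {label, bbox, mask, score}
save_image(overlay_segmentation_masks(image, masks), "dogs_masked.png")

frames = [f for f, _ in extract_frames("clip.mp4", fps=1)]
video_masks = florence2_sam2_video("dog", frames)  # one list of masks per frame
```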
@@ -368,12 +433,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-        value
+        value, e.g. {count: 12} and a heat map for visaulization purposes.
 
     Example
     -------
         >>> loca_zero_shot_counting(image)
-        {'count':
+        {'count': 83,
+        'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 1],
+            ...,
+            [ 0, 0, 0, ..., 30, 35, 41],
+            [ 0, 0, 0, ..., 41, 47, 53],
+            [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_b64 = convert_to_b64(image)
@@ -398,12 +470,19 @@ def loca_visual_prompt_counting(
 
     Returns:
         Dict[str, Any]: A dictionary containing the key 'count' and the count as a
-        value
+        value, e.g. {count: 12} and a heat map for visaulization purposes.
 
     Example
     -------
         >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
-        {'count':
+        {'count': 83,
+        'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 0],
+            [ 0, 0, 0, ..., 0, 0, 1],
+            ...,
+            [ 0, 0, 0, ..., 30, 35, 41],
+            [ 0, 0, 0, ..., 41, 47, 53],
+            [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)}
     """
 
     image_size = get_image_size(image)
@@ -420,8 +499,8 @@ def loca_visual_prompt_counting(
     return resp_data
 
 
-def
-    """'
+def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florence2_roberta_vqa' is a tool that takes an image and analyzes
     its contents, generates detailed captions and then tries to answer the given
     question using the generated context. It returns text as an answer to the question.
 
@@ -434,7 +513,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
 
     Example
     -------
-        >>>
+        >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
         'white tiger'
     """
 
@@ -442,13 +521,73 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "question": prompt,
-        "function_name": "
+        "function_name": "florence2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "florence2-qa", v2=True)
     return answer  # type: ignore
 
 
+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_image_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
+def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompt": prompt,
+        "function_name": "ixc25_video_vqa",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    return cast(str, data["answer"])
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
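A short usage sketch for the new InternLM-XComposer2.5 VQA tools added above (paths are placeholders; both calls go to the hosted internlm-xcomposer2 endpoint):

```python
# Usage sketch only: media paths are placeholders.
from vision_agent.tools import extract_frames, ixc25_image_vqa, ixc25_video_vqa, load_image

image = load_image("receipt.jpg")
print(ixc25_image_vqa("What is the total amount on this receipt?", image))

frames = [f for f, _ in extract_frames("match.mp4", fps=1)]
print(ixc25_video_vqa("Which player scored?", frames))
```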
@@ -592,8 +731,8 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore
 
 
-def
-    """'
+def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florence2_image_caption' is a tool that can caption or describe an image based
     on its contents. It returns a text describing the image.
 
     Parameters:
@@ -606,7 +745,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
 
     Example
     -------
-        >>>
+        >>> florence2_image_caption(image, False)
         'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
@@ -614,17 +753,19 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     data = {
         "image": image_b64,
         "task": task,
-        "function_name": "
+        "function_name": "florence2_image_caption",
     }
 
     answer = send_inference_request(data, "florence2", v2=True)
     return answer[task]  # type: ignore
 
 
-def
-    """'florencev2_object_detection' is a tool that can detect
-    prompt such as
-
+def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect and count multiple
+    objects given a text prompt such as category names or referring expressions. You
+    can optionally separate the categories in the text with commas. It returns a list
+    of bounding boxes with normalized coordinates, label names and associated
+    probability scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -639,7 +780,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
 
     Example
     -------
-        >>>
+        >>> florence2_object_detection('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -651,7 +792,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "
+        "function_name": "florence2_object_detection",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -668,8 +809,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str
     return return_data
 
 
-def
-    """'
+def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
     the text region as a bounding box with normalized coordinates, and confidence
     scores. The results are sorted from top-left to bottom right.
@@ -683,7 +824,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-        >>>
+        >>> florence2_ocr(image)
         [
             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
         ]
@@ -694,7 +835,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "task": "<OCR_WITH_REGION>",
-        "function_name": "
+        "function_name": "florence2_ocr",
    }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -1035,6 +1176,54 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 1
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file or youtube link
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 10.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+        as a numpy array and the timestamp in seconds.
+
+    Example
+    -------
+        >>> extract_frames("path/to/video.mp4")
+        [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
 def save_json(data: Any, file_path: str) -> None:
     """'save_json' is a utility function that saves data as a JSON file. It is helpful
     for saving data that contains NumPy arrays which are not JSON serializable.
@@ -1099,7 +1288,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
 
 
 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float =
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
@@ -1201,15 +1390,43 @@ def overlay_bounding_boxes(
     return np.array(pil_image)
 
 
+def _get_text_coords_from_mask(
+    mask: np.ndarray, v_gap: int = 10, h_gap: int = 10
+) -> Tuple[int, int]:
+    mask = mask.astype(np.uint8)
+    if np.sum(mask) == 0:
+        return (0, 0)
+
+    rows, cols = np.nonzero(mask)
+    top = rows.min()
+    bottom = rows.max()
+    left = cols.min()
+    right = cols.max()
+
+    if top - v_gap < 0:
+        if bottom + v_gap > mask.shape[0]:
+            top = top
+        else:
+            top = bottom + v_gap
+    else:
+        top = top - v_gap
+
+    return left + (right - left) // 2 - h_gap, top
+
+
 def overlay_segmentation_masks(
-
-
+    medias: Union[np.ndarray, List[np.ndarray]],
+    masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
+    draw_label: bool = True,
+) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_segmentation_masks' is a utility function that displays segmentation
     masks.
 
     Parameters:
-
-
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the masks on.
+        masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries containing the masks.
 
     Returns:
         np.ndarray: The image with the masks displayed.
@@ -1229,27 +1446,50 @@ def overlay_segmentation_masks(
             }],
         )
     """
-
+    medias_int: List[np.ndarray] = (
+        [medias] if isinstance(medias, np.ndarray) else medias
+    )
+    masks_int = [masks] if isinstance(masks[0], dict) else masks
+    masks_int = cast(List[List[Dict[str, Any]]], masks_int)
 
-
-
-
-
+    labels = set()
+    for mask_i in masks_int:
+        for mask_j in mask_i:
+            labels.add(mask_j["label"])
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
-
-
-
-
-
+    width, height = Image.fromarray(medias_int[0]).size
+    fontsize = max(12, int(min(width, height) / 40))
+    font = ImageFont.truetype(
+        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+        fontsize,
+    )
 
-
-
-
-
-
-
-
-
+    frame_out = []
+    for i, frame in enumerate(medias_int):
+        pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA")
+        for elt in masks_int[i]:
+            mask = elt["mask"]
+            label = elt["label"]
+            np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+            np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+            mask_img = Image.fromarray(np_mask.astype(np.uint8))
+            pil_image = Image.alpha_composite(pil_image, mask_img)
+
+            if draw_label:
+                draw = ImageDraw.Draw(pil_image)
+                text_box = draw.textbbox((0, 0), text=label, font=font)
+                x, y = _get_text_coords_from_mask(
+                    mask,
+                    v_gap=(text_box[3] - text_box[1]) + 10,
+                    h_gap=(text_box[2] - text_box[0]) // 2,
+                )
+                if x != 0 and y != 0:
+                    text_box = draw.textbbox((x, y), text=label, font=font)
+                    draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label])
+                    draw.text((x, y), label, fill="black", font=font)
+        frame_out.append(np.array(pil_image))
+    return frame_out[0] if len(frame_out) == 1 else frame_out
 
 
 def overlay_heat_map(
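overlay_segmentation_masks now accepts either a single image or a list of frames, plus a draw_label flag, which pairs naturally with florence2_sam2_video. A hedged sketch (paths are placeholders, and save_video is assumed to pick a temporary output path when none is given):

```python
# Sketch: overlay tracked masks on every frame and write the result to disk.
# Paths are placeholders; the output path handling of save_video is assumed.
from vision_agent.tools import (
    extract_frames,
    florence2_sam2_video,
    overlay_segmentation_masks,
    save_video,
)

frames = [f for f, _ in extract_frames("street.mp4", fps=1)]
masks_per_frame = florence2_sam2_video("car", frames)
overlaid = overlay_segmentation_masks(frames, masks_per_frame, draw_label=True)
print(save_video(overlaid, fps=1))
```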
@@ -1412,7 +1652,6 @@ def florencev2_fine_tuned_object_detection(
 
 TOOLS = [
     owl_v2,
-    grounding_sam,
     extract_frames,
     ocr,
     clip,
@@ -1420,13 +1659,15 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-
-
-
+    florence2_image_caption,
+    florence2_ocr,
+    florence2_sam2_image,
+    florence2_sam2_video,
+    florence2_object_detection,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     detr_segmentation,
     depth_anything_v2,
-    generate_soft_edge_image,
-    dpt_hybrid_midas,
     generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
@@ -1437,7 +1678,6 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
-    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/execute.py
RENAMED
@@ -416,7 +416,6 @@ class CodeInterpreter(abc.ABC):
 
 
 class E2BCodeInterpreter(CodeInterpreter):
-
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super().__init__(*args, **kwargs)
         assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
{vision_agent-0.2.111 → vision_agent-0.2.112}/vision_agent/utils/image_utils.py
RENAMED
@@ -1,12 +1,15 @@
 """Utility functions for image processing."""
 
 import base64
+import io
+import tempfile
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Tuple, Union
 
 import numpy as np
+from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from PIL.Image import Image as ImageType
 
@@ -63,6 +66,46 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
     return img.reshape(shape)
 
 
+def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
+    r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
+
+    Parameters:
+        mask: The mask in run-length encoded as an array.
+    """
+    size = rle["size"]
+    counts = rle["counts"]
+
+    total_elements = size[0] * size[1]
+    flattened_mask = np.zeros(total_elements, dtype=np.uint8)
+
+    current_pos = 0
+    for i, count in enumerate(counts):
+        if i % 2 == 1:
+            flattened_mask[current_pos : current_pos + count] = 1
+        current_pos += count
+
+    binary_mask = flattened_mask.reshape(size, order="F")
+    return binary_mask
+
+
+def frames_to_bytes(
+    frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
+) -> bytes:
+    r"""Convert a list of frames to a video file encoded into a byte string.
+
+    Parameters:
+        frames: the list of frames
+        fps: the frames per second of the video
+        file_ext: the file extension of the video file
+    """
+    with tempfile.NamedTemporaryFile(delete=True) as temp_file:
+        clip = ImageSequenceClip(frames, fps=fps)
+        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps)
+        with open(temp_file.name + f".{file_ext}", "rb") as f:
+            buffer_bytes = f.read()
+    return buffer_bytes
+
+
 def b64_to_pil(b64_str: str) -> ImageType:
     r"""Convert a base64 string to a PIL Image.
 
@@ -78,6 +121,15 @@ def b64_to_pil(b64_str: str) -> ImageType:
     return Image.open(BytesIO(base64.b64decode(b64_str)))
 
 
+def numpy_to_bytes(image: np.ndarray) -> bytes:
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+    return buffer_bytes
+
+
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
     r"""Get the size of an image.
 
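A small check of the new rle_decode_array helper, assuming the RLE format used in the code above ({"size": [rows, cols], "counts": [...]}, column-major, counts alternating background/foreground starting with background):

```python
# Minimal check of rle_decode_array with a hand-built RLE payload.
from vision_agent.utils.image_utils import rle_decode_array

rle = {"size": [2, 3], "counts": [1, 2, 3]}  # 6 pixels: 1 off, 2 on, 3 off
mask = rle_decode_array(rle)
print(mask.shape)  # (2, 3)
print(mask.sum())  # 2
```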