vision-agent 0.2.176__tar.gz → 0.2.178__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.176 → vision_agent-0.2.178}/PKG-INFO +1 -1
- {vision_agent-0.2.176 → vision_agent-0.2.178}/pyproject.toml +1 -1
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/__init__.py +18 -1
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/tools.py +48 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/LICENSE +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/README.md +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/utils/video.py +0 -0
@@ -1,5 +1,17 @@
|
|
1
1
|
from typing import Callable, List, Optional
|
2
2
|
|
3
|
+
from .meta_tools import (
|
4
|
+
create_code_artifact,
|
5
|
+
edit_code_artifact,
|
6
|
+
edit_vision_code,
|
7
|
+
generate_vision_code,
|
8
|
+
get_tool_descriptions,
|
9
|
+
list_artifacts,
|
10
|
+
object_detection_fine_tuning,
|
11
|
+
open_code_artifact,
|
12
|
+
use_object_detection_fine_tuning,
|
13
|
+
view_media_artifact,
|
14
|
+
)
|
3
15
|
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
|
4
16
|
from .tool_utils import get_tool_descriptions_by_names
|
5
17
|
from .tools import (
|
@@ -53,6 +65,7 @@ from .tools import (
|
|
53
65
|
template_match,
|
54
66
|
vit_image_classification,
|
55
67
|
vit_nsfw_classification,
|
68
|
+
video_temporal_localization,
|
56
69
|
)
|
57
70
|
|
58
71
|
__new_tools__ = [
|
@@ -65,7 +78,11 @@ def register_tool(imports: Optional[List] = None) -> Callable:
|
|
65
78
|
def decorator(tool: Callable) -> Callable:
|
66
79
|
import inspect
|
67
80
|
|
68
|
-
from .tools import
|
81
|
+
from .tools import ( # noqa: F811
|
82
|
+
get_tool_descriptions,
|
83
|
+
get_tools_df,
|
84
|
+
get_tools_info,
|
85
|
+
)
|
69
86
|
|
70
87
|
global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
|
71
88
|
|
@@ -975,6 +975,54 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
|
|
975
975
|
return answer["text"][0] # type: ignore
|
976
976
|
|
977
977
|
|
978
|
+
def video_temporal_localization(
|
979
|
+
prompt: str,
|
980
|
+
frames: List[np.ndarray],
|
981
|
+
model: str = "qwen2vl",
|
982
|
+
chunk_length: Optional[float] = None,
|
983
|
+
chunk_length_seconds: Optional[float] = None,
|
984
|
+
chunk_length_frames: Optional[int] = 2,
|
985
|
+
) -> List[float]:
|
986
|
+
"""'video_temporal_localization' is a tool that can find objects in a video given a question about it.
|
987
|
+
It returns a list of floats with a value of 1.0 if the object to be found is present in the chunk of video being analyzed.
|
988
|
+
|
989
|
+
Parameters:
|
990
|
+
prompt (str): The question about the video
|
991
|
+
frames (List[np.ndarray]): The reference frames used for the question
|
992
|
+
model (str): The model to use for the inference. Valid values are 'qwen2vl', 'gpt4o', 'internlm-xcomposer'
|
993
|
+
chunk_length (Optional[float]): length of each chunk in seconds
|
994
|
+
chunk_length_seconds (Optional[float]): alternative way to specify the length of each chunk, in seconds
|
995
|
+
chunk_length_frames (Optional[int]): length of each chunk in frames
|
996
|
+
|
997
|
+
Returns:
|
998
|
+
List[float]: A list of floats with a value of 1.0 if the object to be found is present in the chunk of video
|
999
|
+
|
1000
|
+
Example
|
1001
|
+
-------
|
1002
|
+
>>> video_temporal_localization('Did a goal happen?', frames)
|
1003
|
+
[0.0, 0.0, 0.0, 1.0, 1.0, 0.0]
|
1004
|
+
"""
|
1005
|
+
|
1006
|
+
buffer_bytes = frames_to_bytes(frames)
|
1007
|
+
files = [("video", buffer_bytes)]
|
1008
|
+
payload: Dict[str, Any] = {
|
1009
|
+
"prompt": prompt,
|
1010
|
+
"model": model,
|
1011
|
+
"function_name": "video_temporal_localization",
|
1012
|
+
}
|
1013
|
+
if chunk_length is not None:
|
1014
|
+
payload["chunk_length"] = chunk_length
|
1015
|
+
if chunk_length_seconds is not None:
|
1016
|
+
payload["chunk_length_seconds"] = chunk_length_seconds
|
1017
|
+
if chunk_length_frames is not None:
|
1018
|
+
payload["chunk_length_frames"] = chunk_length_frames
|
1019
|
+
|
1020
|
+
data = send_inference_request(
|
1021
|
+
payload, "video-temporal-localization", files=files, v2=True
|
1022
|
+
)
|
1023
|
+
return [cast(float, value) for value in data]
|
1024
|
+
|
1025
|
+
|
978
1026
|
def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
|
979
1027
|
"""'clip' is a tool that can classify an image or a cropped detection given a list
|
980
1028
|
of input classes or tags. It returns the same list of the input classes along with
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
{vision_agent-0.2.176 → vision_agent-0.2.178}/vision_agent/agent/vision_agent_planner_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|